Herramientas de usuario

Herramientas del sitio


r-tips

Cosas simples en Excel y cómo se hacen en R

# Problemas con git

git push -u origin master

## Sobreescribir repo local con git

git checkout .
git pull

# BÁSICO DE R

## remove duplicated columns

df <- df[, !duplicated(colnames(df))]

## renombrar nombres eliminar espacios

names(df) <- make.names(names(df))

## random sample

### by groups

sample <- df %>% group_by(grupos) %>% sample_n(n, replace =T) # replace = T si algún grupo tiene menos de n

## odds ratio en regresión logística

se <- sqrt(diag(vcov(m1)))
(tab <- cbind(Est = fixef(m1), LL = fixef(m1) - 1.96 * se, UL = fixef(m1) + 1.96 *
                    se))
#ahora saco el odds ratio
exp(tab)

## importar csv con acentos (codificación) en R

read.csv(file="fichero.csv", fileEncoding="utf-8")

## Organización de datos

Tidy ver http://vita.had.co.nz/papers/tidy-data.pdf

Molten vs tidy

### Reorganización

#### Spread

df %>%

  group_by(grupo) %>% 
  mutate(id = row_number()) %>% 
  select(-c(TODAS MENOS LAS DOS)) %>%
  spread(grupo, valor) %>%
  select(-id)

## Número de decimales

options(digits=3) # para tres decimales

## extract p value

model <- aov(y~x)
modelsummary <- summary(model)
p_model <- modelsummary[[1]]$'Pr(>F)'

## Convert a numeric vector to a factor with “cut”

ejemplo <- cut(rnorm(100),3,c("Low","Med","High"))
ejemplo

## Tablas resumen

install.packages("plyr")
library("plyr")
ddply(df,~grupo,summarise,mean=mean(y),sd=sd(y))

## Como citar R

R Development Core Team (2008). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.

## Videos Data Manipulation with dplyr

dplyr

## Tipos de variables

Variable Types

  • character – Strings
  • integer – Integers
  • numeric – Integers + Fractions
  • factor – Categorical variable where each level is a category
  • logical – Boolean
  • complex – Complex numbers

Data Types

  • vector – A collection of elements of same class → factors, levels
  • matrix – All columns must uniformly contain only one variable type.
  • data.frame – The columns can contain different classes.
  • list – Can hold objects of different classes and lengths

## Videos

Acerca del dataframe

[http://blog.datacamp.com/15-easy-solutions-data-frame-problems-r/]

## Importar datos

[http://blog.datacamp.com/r-data-import-tutorial/]

## Sum NA values in r

colSums(is.na(data))
returning:
A B C D E
0 1 2 2 4

## Cuál es NA?

which(is.na(x))

## encontrar los NA

new_DF <- df[is.na(df$columna),]
write.csv(new_DF, "NA.csv")

## exportar los NA

write.csv(df[is.na(df$col),], "verificarNA.csv")

## remove NA

helper <- apply(data, 1, function(x){any(is.na(x))})
df.clean <- df.messy[!helper]
df.clean <- na.omit(df.messy)

## PLOTS

par(mfrow=c(1,1)) #numero de figuras en el plot;c(filas,columnas) 
par(mar=c(5,5,3,2)+0.1) #margenes

### Varios graphs con ggplot2

library("gridExtra")
p1 = qplot(...)
p2 = qplot(...)
grid.arrange(p1, p2, ncol = 1)

### temas en ggplot2

library(ggplot2)
theme_set(theme_grey(24))
theme_set(theme_bw(24)) 
theme_set(theme_minimal(24))

Muchos más temas, ver en https://github.com/jrnold/ggthemes

install.packages('ggthemes', dependencies = TRUE)
library("ggthemes") 
theme_set(theme_economist(24)) #bueno para pantalla
theme_set(theme_few(24)) #minimalista, con borde ext
theme_set(theme_fivethirtyeight(24))
theme_set(theme_tufte(24)) #bueno para imprimir
theme_set(theme_wsj(24)) #bueno para pantalla
theme_set(theme_base(24)) #minimalista, sin borde ext
theme_set(theme_gdocs(24)) # bueno para imprimir

### add a variable as legend

legend = c(paste("Promedio =", round(meanx)))

### Hide borders in graph

frame.plot = FALSE 

### Export high quality graph in R

png(file="boxplot.png",width=900,height=750)
graph here
dev.off()

### Avoid overlapping in scatter plots R

install.packages("directlabels")
library(directlabels)
direct.label(xyplot(y~x,dataframe,groups=Grupos, col="black"))

Otra manera

# if necessary, install the calibrate package
install.packages("calibrate")
# load the calibrate package
library(calibrate)
# use the textxy() function to add labels to the preexisting plot's points
# add labels for the total enrollment
textxy(enrollmentData$YEAR, enrollmentData$UNEM, enrollmentData$ROLL)

## Short to long form R

## Manejar datos

## check na

sapply(df, function(x) sum(is.na(x)))

## sumar omitiendo NA

 mutate(newVar = rowSums( cbind (colA,colB), na.rm=TRUE))

## replace na

mydf$col[is.na(mydf$col)] <- 'x' #algunos
mydf[is.na(mydf)] <- 'x' #todos

### dplyr

dd %>% mean(na.rm=T)
flights %>% filter(!is.na(arr_delay))
flights %>% 
      filter(!is.na(arr_delay) ) %>% 
      select(ends_with("time"))
flights %>% 
      mutate(speed = distance/air_time * 60) %>% 
      select(carrier, arr_delay, speed)
      
flights %>% 
      filter(!is.na(air_time), !is.na(distance)) %>% 
      summarise(n = n(), n_carriers = n_distinct(carrier), 
                total_time = sum(air_time), total_dist = sum(distance))

### Transformaciones

Data <- cbind(Data, Data$GPA^2)
  • square root > sqrt(Data$GPA)
  • natural logarithm >log(Data$GPA)
  • common logarithm > log10(Data$GPA)
  • reciprocal > 1/Data$GPA
  • reciprocal square root > 1/sqrt(Data$GPA)

### Renombrar factores si ya tienen nombre

levels(df$caso.o.control)[levels(df$caso.o.control) == "Control"] <- "Sin caries"

### Renombrar factores ordenados

$SEXO     : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 2 2 1 1 ...
levels(df$SEXO) <- c("Masculino","Femenino")

### Extraer fila con valor máximo

df[which.max(df$columna), ]

### Limpiar <NA> en dataframes rectangulares

x = data.frame(a = c(1, NA, 2), b = c(2, NA, 3), c = c(NA, "A", NA))
x
#    a  b    c
# 1  1  2 <NA>
# 2 NA NA    A
# 3  2  3 <NA>

Limpieza en una línea

as.data.frame(lapply(x, na.omit))
#   a b c
# 1 1 2 A
# 2 2 3 A

otra solución

y = lapply(x, na.omit)

### Básico

a <- c(1, 2, 3)
b <- c(4, 5, 6)
c <- c("Paciente 1", "Paciente 2")
df <- cbind.data.frame( c(a, b) , c) #combino en dos columnas
colnames(df)[1] <- "Paciente"
rename(df, new = old) # con dplyr, a veces funciona

### Crear un nuevo df con columnas de otro

require(dplyr)
new_df <- data.frame()
new_df <- select(old_df, col1, col2, col3, col4)

### Cambiar a factores

df$col <- factor(df$col, levels = 1:11, labels = LETTERS[1:11]) # en caso que tenga 11 números

### merge datasets

# Load two datasets
income2008 <- read.csv("data/ACS_08_3YR_S1903/ACS_08_3YR_S1903.csv",
    stringsAsFactors=FALSE, sep=",", colClasses=c("GEO.id2"="character"))
income2013 <- read.csv("data/ACS_13_5YR_S1903/ACS_13_5YR_S1903.csv",
    stringsAsFactors=FALSE, sep=",", colClasses=c("GEO.id2"="character"))

Selecciono solo algunas columnas

# Subset
income2008p <- income2008[,c("GEO.id2", "HC02_EST_VC02", "HC02_MOE_VC02")]
income2013p <- income2013[,c("GEO.id2", "HC02_EST_VC02", "HC02_MOE_VC02")]

Renombro columnas

# Rename headers
names(income2008p) <- c("FIPS", "med2008", "moe2008")
names(income2013p) <- c("FIPS", "med2013", "moe2013")

Combino

# Combine
income0813 <- merge(income2008p, income2013p, by="FIPS")

### cambiar nombres de variables

names(df) <- c("col1", "col2", etc)

### remover na

df <- subset(df, !is.na(variable con datos que deseamos filtrar))

otra manera

var_without_na <- na.omit(var)

### Ordenar factores

var <- ordered(var, levels = c("primera", "segunda", "tercera"))

lo mismo con factors

var <- factor(var, levels = c("primera", "segunda", "tercera"), 
ordered = T)

Otra manera

temperature_vector <- c("High", "Low", "High","Low", "Medium")

factor_temperature_vector <- factor(temperature_vector, order = TRUE, levels = c("Low", "Medium", "High"))

factor_temperature_vector

### Ordenar los datos

df2 <- df[order(df$Var,decreasing=T),]
df2[!duplicated(df2$A1),]
head(df2)
df2

### combinar datos de un df

LIZARD LENGTH DATA

island.1 <- c(0.2, 5.9, 6.1, 6.5)
island.2 <- c(5.6, 14.8, 15.5, 16.4) 
island.3 <- c(0.8, 3.9, 4.3, 4.9) 
sex.codes <- c("Male", "Female", "Male", "Female")
df.1 <- data.frame(island.1, island.2, island.3, sex.codes) 
str(df.1)

### Tablas

tabla <- with(df, table(x, y))
tabla <- with(df, xtabs(explanatory, response))
chisq.test(tabla)

Otras tablas con

library("gmodels")
CrossTable(row, col)

### Tablas resumenes summarise tables

tapply(x, y, FUN)

### Tablas resumen 2×2

tabla <- with(data, tapply(x, list(factor1, factor2), FUN))

### DATA MUNGING BASICS

#### tidyr y dplyr

Ver https://rpubs.com/bradleyboehmke/data_wrangling y http://datascienceplus.com/data-manipulation-with-tidyr/

  • tidyr
    • gather(data, key, value, ..., na.rm = FALSE, convert = FALSE)
    • spread(data, key, value, fill = NA, convert = FALSE, drop = TRUE)
    • separate(data, col, into, sep = "[^[:alnum:]]+", remove = TRUE, convert = FALSE, extra = "warn", fill = "warn", ...)
    • unite(data, col, ..., sep = "_", remove = TRUE)
  • dplyr
    • select()
    • filter()
    • group_by()
    • summarise()
    • arrange()
    • join()
    • mutate()

#### Normalizar nombres de variables

library("data.table")
df <- setnames(df, tolower(names(df)))

### melting to long form

install.packages("reshape") 
library(reshape) 
df.2 <- melt(df.1) 
str(df.2) 
df.1 
df.2
data <- data.frame(sex = c(rep(1, 1000), rep(2, 1000)), 
                 treatment = rep(c(1, 2), 1000),      
                 response1 = rnorm(2000, 0, 1),       
                 response2 = rnorm(2000, 0, 1))       
head(data)                                              
## reshape2 still does its thing:                       
library(reshape2)                                        
melted <- melt(data, id.vars=c("sex", "treatment"))     
head(melted) 

Queda así

> head(data)
    sex treatment  response1  response2
1   1         1  0.1864923  1.2678290
2   1         2 -0.8654485 -0.5403999
3   1         1  0.1947279 -0.1921475
4   1         2 -0.7526925  0.4408610
5   1         1 -0.6723624  0.6399581
6   1         2  0.7225497  0.0368637
> melted <- melt(data, id.vars=c("sex", "treatment"))
> head(melted)
    sex treatment  variable      value
1   1         1 response1  0.1864923
2   1         2 response1 -0.8654485
3   1         1 response1  0.1947279
4   1         2 response1 -0.7526925
5   1         1 response1 -0.6723624
6   1         2 response1  0.7225497

### Convertir fac a num

NuevoNum <- as.numeric(levels(ViejoFac))[ViejoFac] # convertir datos factor a número

## Instalar paquete en R desde URL

install.packages("URL PAQUETE.tar.gz", repos=NULL) 

### Paquetes

## Decision tree http://mathminers.com/index.php/2015/06/08/r-tutorial-on-social-media-analysis/

## http://stats.stackexchange.com/

Generate a random variable with a defined correlation to an existing variable

Multivariate multiple regression in R

What is the NULL hypothesis for interaction in a two-way ANOVA?

Why are lower p-values not more evidence against the null? Arguments from Johansson 2011

r-tips.txt · Última modificación: 2018/04/28 05:00 por 127.0.0.1