Tabla de Contenidos
Cosas simples en excel y como se hacen en R
# Problemas con git
git push -u origin master
## Sobreescribir repo local con git
```sh
git checkout .
git pull
```
# BÁSICO DE R
## remove duplicated columns
df <- df[, !duplicated(colnames(df))]
## renombrar nombres eliminar espacios
names(df) <- make.names(names(df))
## random sample
### by groups
sample <- df %>% group_by(grupos) %>% sample_n(n, replace =T) # replace = T si algún grupo tiene menos de n
## odds ratio en regresion logística
```r
se <- sqrt(diag(vcov(m1)))
(tab <- cbind(Est = fixef(m1), LL = fixef(m1) - 1.96 * se, UL = fixef(m1) + 1.96 * se))
# ahora saco el odds ratio
exp(tab)
```
## importar csv con acentos codificacion r
read.csv(file="fichero.csv", fileEncoding="utf-8")
## Organización de datos
Tidy ver http://vita.had.co.nz/papers/tidy-data.pdf
Molten vs tidy
### Reorganización
#### Spread %>%
group_by(grupo) %>% mutate(id = row_number()) %>% select(-c(TODAS MENOS LAS DOS)) %>% spread(grupo, valor) %>% select(-id)
## Número de decimales
options(digits=3) # para tres decimales
## extract p value
```r
model <- aov(y~x)
modelsummary <- summary(model)
p_model <- modelsummary[[1]]$'Pr(>F)'
```
## Convert a numeric vector to a factor with “cut”
```r
ejemplo <- cut(rnorm(100),3,c("Low","Med","High"))
ejemplo
```
## Tablas resumen
```r
install.packages("plyr")
library("plyr")
ddply(df,~grupo,summarise,mean=mean(y),sd=sd(y))
```
## Como citar R
R Development Core Team (2008). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.
## Videos Data Manipulation with dplyr
## Tipos de variables
Variable Types
- character – Strings
- integer – Integers
- numeric – Integers + Fractions
- factor – Categorical variable where each level is a category
- logical – Boolean
- complex – Complex numbers
Data Types
- vector – A collection of elements of same class → factors, levels
- matrix – All columns must uniformly contain only one variable type.
- data.frame – The columns can contain different classes.
- list – Can hold objects of different classes and lengths
## Videos
Acerca del dataframe
[http://blog.datacamp.com/15-easy-solutions-data-frame-problems-r/]
## Importar datos [http://blog.datacamp.com/r-data-import-tutorial/]
## Sum NA values in r
```r
colSums(is.na(data))
# returning:
# A B C D E
# 0 1 2 2 4
```
## Cuál es NA?
which(is.na(x))
## encontrar los NA
```r
new_DF <- df[is.na(df$columna),]
write.csv(new_DF, "NA.csv")
```
## exportar los NA
write.csv(df[is.na(df$col),], "verificarNA.csv")
## remove NA
```r
helper <- apply(df.messy, 1, function(x){any(is.na(x))})
df.clean <- df.messy[!helper, ]
# alternativa equivalente en una línea
df.clean <- na.omit(df.messy)
```
## PLOTS
```r
par(mfrow=c(1,1)) #numero de figuras en el plot;c(filas,columnas)
par(mar=c(5,5,3,2)+0.1) #margenes
```
### Varios graphs con ggplot2
```r
library("gridExtra")
p1 = qplot(...)
p2 = qplot(...)
grid.arrange(p1, p2, ncol = 1)
```
### temas en ggplot2
```r
library(ggplot2)
theme_set(theme_grey(24))
theme_set(theme_bw(24))
theme_set(theme_minimal(24))
```
Muchos más temas, ver en https://github.com/jrnold/ggthemes
```r
install.packages('ggthemes', dependencies = TRUE)
library("ggthemes")
theme_set(theme_economist(24)) #bueno para pantalla
theme_set(theme_few(24)) #minimalista, con borde ext
theme_set(theme_fivethirtyeight(24))
theme_set(theme_tufte(24)) #bueno para imprimir
theme_set(theme_wsj(24)) #bueno para pantalla
theme_set(theme_base(24)) #minimalista, sin borde ext
theme_set(theme_gdocs(24)) # bueno para imprimir
```
### add a variable as legend
legend = c(paste("Promedio =", round(meanx)))
### Hide borders in graph
frame.plot = FALSE
### Export high quality graph in R
```r
png(file="boxplot.png",width=900,height=750)
# graph here
dev.off()
```
### Avoid overlapping in scatter plots R
install.packages("directlabels") library(directlabels) direct.label(xyplot(**y**~**x**,**dataframe**,**groups**=**Grupos**, col="black"))
Otra manera
```r
# if necessary, install the calibrate package
install.packages("calibrate")
# load the calibrate package
library(calibrate)
# use the textxy() function to add labels to the preexisting plot's points
# add labels for the total enrollment
textxy(enrollmentData$YEAR, enrollmentData$UNEM, enrollmentData$ROLL)
```
## Short to long form R
## Manejar datos
## check na
sapply(df, function(x) sum(is.na(x)))
## sumar omitiendo NA
mutate(newVar = rowSums( cbind (colA,colB), na.rm=TRUE))
## replace na
```r
mydf$col[is.na(mydf$col)] <- 'x' #algunos
mydf[is.na(mydf)] <- 'x' #todos
```
### dplyr
dd %>% mean(na.rm=T)
flights %>% filter(!is.na(arr_delay))
flights %>% filter(!is.na(arr_delay) ) %>% select(ends_with("time"))
flights %>% mutate(speed = distance/air_time * 60) %>% select(carrier, arr_delay, speed)
flights %>% filter(!is.na(air_time), !is.na(distance)) %>% summarise(n = n(), n_carriers = n_distinct(carrier), total_time = sum(air_time), total_dist = sum(distance))
### Transformaciones
Data <- cbind(Data, Data$GPA^2)
- square root > sqrt(Data$GPA)
- natural logarithm >log(Data$GPA)
- common logarithm > log10(Data$GPA)
- reciprocal > 1/Data$GPA
- reciprocal square root > 1/sqrt(Data$GPA)
### Renombrar factores si ya tienen nombre
levels(df$caso.o.control)[levels(df$caso.o.control) == "Control"] <- "Sin caries"
### Renombrar factores ordenados
```r
# $SEXO : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 2 2 1 1 ...
levels(df$SEXO) <- c("Masculino","Femenino")
```
### Extraer fila con valor máximo
df[which.max(df$columna), ]
### Limpiar <NA> en dataframes rectangulares
```r
x = data.frame(a = c(1, NA, 2), b = c(2, NA, 3), c = c(NA, "A", NA))
x
#    a  b    c
# 1  1  2 <NA>
# 2 NA NA    A
# 3  2  3 <NA>
```
Limpieza en una línea
```r
as.data.frame(lapply(x, na.omit))
#   a b c
# 1 1 2 A
# 2 2 3 A
```
otra solución
y = lapply(x, na.omit)
### Básico
```r
a <- c(1, 2, 3)
b <- c(4, 5, 6)
c <- c("Paciente 1", "Paciente 2")
df <- cbind.data.frame( c(a, b) , c) #combino en dos columnas
colnames(df)[1] <- "Paciente"
rename(df, new = old) # con dplyr, a veces funciona
```
### Crear un nuevo df con columnas de otro
```r
require(dplyr)
new_df <- data.frame()
new_df <- select(old_df, col1, col2, col3, col4)
```
### Cambiar a factores
df$col <- factor(df$col, levels = 1:11, labels = LETTERS[1:11]) # en caso que tenga 11 números
### merge datasets
```r
# Load two datasets
income2008 <- read.csv("data/ACS_08_3YR_S1903/ACS_08_3YR_S1903.csv", stringsAsFactors=FALSE, sep=",", colClasses=c("GEO.id2"="character"))
income2013 <- read.csv("data/ACS_13_5YR_S1903/ACS_13_5YR_S1903.csv", stringsAsFactors=FALSE, sep=",", colClasses=c("GEO.id2"="character"))
```
Selecciono solo algunas columnas
```r
# Subset
income2008p <- income2008[,c("GEO.id2", "HC02_EST_VC02", "HC02_MOE_VC02")]
income2013p <- income2013[,c("GEO.id2", "HC02_EST_VC02", "HC02_MOE_VC02")]
```
Renombro columnas
```r
# Rename headers
names(income2008p) <- c("FIPS", "med2008", "moe2008")
names(income2013p) <- c("FIPS", "med2013", "moe2013")
```
Combino
```r
# Combine
income0813 <- merge(income2008p, income2013p, by="FIPS")
```
### cambiar nombres de variables
names(df) <- c("col1", "col2", etc)
### remover na
df <- subset(df, !is.na(variable con datos que deseamos filtrar))
otra manera
var_without_na <- na.omit(var)
### Ordenar factores
var <- ordered(var, levels = c("primera", "segunda", "tercera"))
lo mismo con factors
var <- factor(var, levels = c("primera", "segunda", "tercera"), ordered = T)
Otra manera
temperature_vector <- c("High", "Low", "High","Low", "Medium") factor_temperature_vector <- factor(temperature_vector, **order = TRUE, levels = c("Low", "Medium", "High")**) factor_temperature_vector
### Ordenar los datos
```r
df2 <- df[order(df$Var,decreasing=T),]
df2[!duplicated(df2$A1),]
head(df2)
df2
```
### combinar datos de un df
LIZARD LENGTH DATA
```r
island.1 <- c(0.2, 5.9, 6.1, 6.5)
island.2 <- c(5.6, 14.8, 15.5, 16.4)
island.3 <- c(0.8, 3.9, 4.3, 4.9)
sex.codes <- c("Male", "Female", "Male", "Female")
df.1 <- data.frame(island.1, island.2, island.3, sex.codes)
str(df.1)
```
### Tablas
```r
tabla <- with(df, table(x, y))
tabla <- with(df, xtabs(explanatory, response))
chisq.test(tabla)
```
Otras tablas con
```r
library("gmodels")
CrossTable(row, col)
```
### Tablas resumenes summarise tables
tapply(x, y, FUN)
### Tablas resumen 2×2
tabla <- with(data, tapply(x, list(factor1, factor2), FUN))
### DATA MUNGING BASICS
#### tidyr y dplyr
ver https://rpubs.com/bradleyboehmke/data_wrangling y http://datascienceplus.com/data-manipulation-with-tidyr/
- tidyr
- gather(data, key, value, ..., na.rm = FALSE, convert = FALSE)
- spread(data, key, value, fill = NA, convert = FALSE, drop = TRUE)
- separate(data, col, into, sep = "[^[:alnum:]]+", remove = TRUE, convert = FALSE, extra = "warn", fill = "warn", ...)
- unite(data, col, ..., sep = "_", remove = TRUE)
- dplyr
- select()
- filter()
- group_by()
- summarise()
- arrange()
- join()
- mutate()
#### Normalizar nombres de variables
```r
library("data.table")
df <- setnames(df, tolower(names(df)))
```
### melting to long form
```r
install.packages("reshape")
library(reshape)
df.2 <- melt(df.1)
str(df.2)
df.1
df.2
```
```r
data <- data.frame(sex = c(rep(1, 1000), rep(2, 1000)), treatment = rep(c(1, 2), 1000), response1 = rnorm(2000, 0, 1), response2 = rnorm(2000, 0, 1))
head(data)
## reshape2 still does its thing:
library(reshape2)
melted <- melt(data, id.vars=c("sex", "treatment"))
head(melted)
```
Queda así
```
> head(data)
  sex treatment  response1  response2
1   1         1  0.1864923  1.2678290
2   1         2 -0.8654485 -0.5403999
3   1         1  0.1947279 -0.1921475
4   1         2 -0.7526925  0.4408610
5   1         1 -0.6723624  0.6399581
6   1         2  0.7225497  0.0368637
> melted <- melt(data, id.vars=c("sex", "treatment"))
> head(melted)
  sex treatment  variable      value
1   1         1 response1  0.1864923
2   1         2 response1 -0.8654485
3   1         1 response1  0.1947279
4   1         2 response1 -0.7526925
5   1         1 response1 -0.6723624
6   1         2 response1  0.7225497
```
### Convertir fac a num
NuevoNum <- as.numeric(levels(ViejoFac))[ViejoFac] # convertir datos factor a número
\
## Instalar paquete en R desde URL
install.packages("**URL PAQUETE**.tar.gz", repos=NULL)
## Decision tree http://mathminers.com/index.php/2015/06/08/r-tutorial-on-social-media-analysis/
## http://stats.stackexchange.com/
Generate a random variable with a defined correlation to an existing variable
Multivariate multiple regression in R
What is the NULL hypothesis for interaction in a two-way ANOVA?
Why are lower p-values not more evidence against the null? Arguments from Johansson 2011