Etienne Low-Décarie
Combine
Repeat this for all subsets
plyr
and dplyr
do not play nice together
Always load dplyr
last
Be careful of packages that load plyr
require(plyr)
require(dplyr)
you can also use the dplyr:: prefix
to ensure functions are called from dplyr
Split-Apply-Combine
group_by()
if(!require(dplyr)){install.packages("dplyr")}
require(dplyr)
# Group the CO2 data set by Plant, Type and Treatment so that later verbs
# (summarise, mutate, do) are evaluated separately within each group.
CO2_by_Plant_Type_Treatment <- group_by(CO2, Plant, Type, Treatment)
group_by
implies tbl_df
tbl_df
wraps a data.frame
# Printing the grouped table reports its dimensions and its grouping variables.
print(CO2_by_Plant_Type_Treatment)
Source: local data frame [84 x 5]
Groups: Plant, Type, Treatment [12]
Plant Type Treatment conc uptake
(fctr) (fctr) (fctr) (dbl) (dbl)
1 Qn1 Quebec nonchilled 95 16.0
2 Qn1 Quebec nonchilled 175 30.4
3 Qn1 Quebec nonchilled 250 34.8
4 Qn1 Quebec nonchilled 350 37.2
5 Qn1 Quebec nonchilled 500 35.3
6 Qn1 Quebec nonchilled 675 39.2
7 Qn1 Quebec nonchilled 1000 39.7
8 Qn2 Quebec nonchilled 95 13.6
9 Qn2 Quebec nonchilled 175 27.3
10 Qn2 Quebec nonchilled 250 37.1
.. ... ... ... ... ...
# Show the class vector of the grouped table.
class(CO2_by_Plant_Type_Treatment)
[1] "grouped_df" "tbl_df" "tbl" "data.frame"
Caution when working with functions that expect a data.frame
# Check the classes before passing the table to a function that expects a
# plain data.frame.
class(CO2_by_Plant_Type_Treatment)
[1] "grouped_df" "tbl_df" "tbl" "data.frame"
# Convert the grouped table to a data.frame and show its first rows.
head(as.data.frame(CO2_by_Plant_Type_Treatment))
Plant Type Treatment conc uptake
1 Qn1 Quebec nonchilled 95 16.0
2 Qn1 Quebec nonchilled 175 30.4
3 Qn1 Quebec nonchilled 250 34.8
4 Qn1 Quebec nonchilled 350 37.2
5 Qn1 Quebec nonchilled 500 35.3
6 Qn1 Quebec nonchilled 675 39.2
Split-Apply-Combine
mutate
summarise
Summarize by extracting maximum uptake per plant
# Collapse each Plant/Type/Treatment group to one row holding its maximum
# uptake. summarise() is exported by dplyr, so the public `::` operator is
# enough to force the dplyr version (e.g. when plyr is also attached); `:::`
# is only meant for non-exported internals.
CO2_max_per_plant <- dplyr::summarise(CO2_by_Plant_Type_Treatment,
                                      max_uptake = max(uptake))
# Display the per-plant maximum uptake table.
print(CO2_max_per_plant)
Source: local data frame [12 x 4]
Groups: Plant, Type [?]
Plant Type Treatment max_uptake
(fctr) (fctr) (fctr) (dbl)
1 Qn1 Quebec nonchilled 39.7
2 Qn2 Quebec nonchilled 44.3
3 Qn3 Quebec nonchilled 45.5
4 Qc1 Quebec chilled 38.7
5 Qc3 Quebec chilled 41.4
6 Qc2 Quebec chilled 42.4
7 Mn3 Mississippi nonchilled 28.5
8 Mn2 Mississippi nonchilled 32.4
9 Mn1 Mississippi nonchilled 35.5
10 Mc2 Mississippi chilled 14.4
11 Mc3 Mississippi chilled 19.9
12 Mc1 Mississippi chilled 22.2
# Bar chart of the mean per-plant maximum uptake for each Type, one bar per
# Treatment placed side by side, with error bars computed by mean_cl_normal.
# The same dodge object is shared by both layers so bars and error bars line up.
dodge <- position_dodge(width = 1)
p <- ggplot(data = CO2_max_per_plant,
            mapping = aes(x = Type, y = max_uptake, fill = Treatment)) +
  geom_bar(stat = "summary", fun.y = mean, position = dodge) +
  stat_summary(fun.data = "mean_cl_normal",
               geom = "errorbar",
               position = dodge)
if(!require(gridExtra)){install.packages("gridExtra")}
require(gridExtra)
# Render the first rows of the summary as a graphical table on a fresh page.
# grid.newpage() belongs to the grid package, which is not attached by the
# code shown here, so it is called through its namespace.
grid::grid.newpage()
grid.table(head(data.frame(CO2_max_per_plant)))
iris
data set
Calculate deviation from the mean uptake for each plant
# Add a column holding each observation's deviation from the mean uptake of
# its own group; on a grouped table mean(uptake) is evaluated per group.
CO2_with_deviation <- mutate(
  CO2_by_Plant_Type_Treatment,
  deviation_from_mean = uptake - mean(uptake)
)
# Plot the deviation from the group mean against concentration: points
# coloured by Treatment, one panel per Type, and a line joining the
# observations of each Plant. Written with ggplot() layers because qplot()
# is deprecated in current ggplot2; the layers reproduce what qplot() built
# (a point layer from the x/y mapping).
CO2.plot <- ggplot(data = CO2_with_deviation,
                   mapping = aes(x = conc,
                                 y = deviation_from_mean,
                                 colour = Treatment)) +
  geom_point() +
  facet_grid(. ~ Type) +
  geom_line(aes(group = Plant))
print(CO2.plot)
Calculate the slope and intercept for each plant
# Fit the simple linear regression uptake ~ conc and return its coefficients.
#
# conc, uptake: numeric vectors of equal length (predictor and response).
# Returns a one-row data.frame with columns `intercept` and `slope`.
intercept_slope <- function(conc, uptake) {
  estimates <- coef(lm(uptake ~ conc))
  data.frame(intercept = estimates[1], slope = estimates[2])
}
note: there is now a better way to do this using require(broom)
# One row per group with the regression intercept and slope of uptake on conc.
# Note that intercept_slope() is called once per output column, so the model
# is fitted twice for every group.
CO2_fit <- summarise(
  CO2_by_Plant_Type_Treatment,
  intercept = intercept_slope(conc, uptake)[["intercept"]],
  slope = intercept_slope(conc, uptake)[["slope"]]
)
# Bar chart of the mean fitted slope for each Type, one bar per Treatment
# placed side by side, with error bars computed by mean_cl_normal.
dodge <- position_dodge(width = 1)
p <- ggplot(data = CO2_fit,
            mapping = aes(x = Type, y = slope, fill = Treatment)) +
  geom_bar(stat = "summary", fun.y = mean, position = dodge) +
  stat_summary(fun.data = "mean_cl_normal",
               geom = "errorbar",
               position = dodge)
\[ \sum_{i=\text{January}}^{\text{December}} \left\lvert x_i - \bar{x}_{i,\,2002\text{--}2014} \right\rvert \]
%>%
allows you to apply multiple functions sequentially
(equivalent to “|” in bash)
# Step-by-step form: store the grouped table first, then summarise it.
CO2_by_Plant_Type_Treatment <- group_by(CO2, Plant, Type, Treatment)
CO2_max_per_plant <- summarise(
  CO2_by_Plant_Type_Treatment,
  max_uptake = max(uptake)
)
becomes
# Piped form: each step receives the result of the previous one as its first
# argument, so no intermediate grouped table has to be named.
CO2_max_per_plant <- CO2 %>%
  group_by(Plant, Type, Treatment) %>%
  summarise(max_uptake = max(uptake))
do
is like mutate
or summarise
, but returns a list of any R
objects
# Named do(): stores one fitted lm object per group in a list column `model`.
CO2_fit <- CO2_by_Plant_Type_Treatment %>%
  do(model = lm(formula = uptake ~ conc, data = .))

# Unnamed do() with broom: tidy() turns each fit into a data frame, and do()
# combines the per-group data frames. This overwrites the CO2_fit built above.
require(broom)
CO2_fit <- CO2_by_Plant_Type_Treatment %>%
  do(tidy(lm(formula = uptake ~ conc, data = .)))
%>%
chaining
-convert the summarise
of the iris data set
-convert the mutate
of the monthly temperature
filter
# dplyr: keep the rows of iris whose Sepal.Width equals 3.
filter(iris, Sepal.Width == 3)
# Base R equivalent: the logical mask is built from iris, so it has to index
# iris itself rather than a different data frame.
iris[iris$Sepal.Width == 3, ]
select
# dplyr: keep only the Species column (the result is still a data frame).
select(iris, Species)
# Base R equivalent: drop = FALSE keeps the one-column data frame instead of
# collapsing it to a bare vector, matching what select() returns.
iris[, "Species", drop = FALSE]
# Select helpers pick columns by name pattern, here every column starting
# with "Petal".
select(iris, starts_with("Petal"))
https://cran.rstudio.com/web/packages/dplyr/vignettes/databases.html
create a database
# Open the SQLite database file my_db.sqlite3, creating it when it does not
# exist yet. TRUE is spelled out because T is an ordinary variable that can
# be reassigned.
my_db <- src_sqlite("my_db.sqlite3", create = TRUE)
load data into the database
# Copy CO2 into the database as a non-temporary table and request an index on
# each of the listed columns.
CO2_sqlite <- copy_to(
  my_db,
  CO2,
  temporary = FALSE,
  indexes = list("Plant", "Type", "Treatment", "conc", "uptake")
)
use the tables from a database as a regular data.frame
# The database table takes the same group_by/summarise pipeline that was used
# on the local CO2 data frame.
CO2_max_per_plant <- CO2_sqlite %>%
  group_by(Plant, Type, Treatment) %>%
  summarise(max_uptake = max(uptake))
note that dplyr
will only execute database calls when needed
(when the manipulated data is actually requested, e.g. by print())
Separate string variable and spreading (reminder)
require(tidyr)
# Give every flower an id so its measurements can be matched up again after
# reshaping. seq_len() is used instead of 1:nrow() because it stays correct
# (empty) for a zero-row data frame.
iris$specimen <- seq_len(nrow(iris))

# Stack the measurement columns Sepal.Length..Petal.Width into key/value
# pairs: the column name goes to Measurement, the number to Value.
long_iris <- gather(iris, "Measurement", "Value", Sepal.Length:Petal.Width)

# Split the measurement name (e.g. "Sepal.Length") into Organ and Dimension.
seperated_iris <- separate(long_iris, Measurement, c("Organ", "Dimension"))

# Spread Dimension back out so each of its values becomes its own column
# filled from Value.
wide_iris <- spread(seperated_iris, Dimension, Value)
# For every species, draw one panel per specimen showing each organ as a
# translucent rectangle whose width and height are its Width and Length.
# The result is captured under a name (`plot`) because an unnamed do()
# expression must return a data frame, which a ggplot object is not; the
# named form stores the plots in a list column instead. Each plot is still
# printed as it is built.
list_plots <- wide_iris %>%
  group_by(Species) %>%
  do(plot = {
    species_plot <- qplot(data = .,
                          ymin = I(0),
                          ymax = Length,
                          xmin = I(0),
                          xmax = Width,
                          xlim = c(0, 10),
                          ylim = c(0, 10),
                          geom = "rect",
                          facets = ~specimen,
                          alpha = I(0.3),
                          fill = Organ)
    print(species_plot)
    species_plot
  })
_input_output_ply
functions:
ddply
: data.frame in, data.frame out
can be done in dplyr
ldply
: list in, data.frame out
__ply(.data,
.variables,
.fun = NULL,
.progress = "text",
.parallel = FALSE,
.paropts = NULL)
How to load and merge into a single data frame all files in a directory
# Read every .txt file in ./Data/ and stack the results into one data frame,
# with a `path` column recording which file each row came from.

# `pattern` is a regular expression: the dot is escaped and the match is
# anchored at the end so only file names ending in ".txt" are picked up.
file_list <- list.files("./Data/", pattern = "\\.txt$")
path_list <- paste0("./Data/", file_list)

loaded_data <- ldply(.data = path_list,
                     .fun = function(x) {
                       file_data <- read.csv(x)
                       # rep() keeps the assignment valid for a file that
                       # yields zero rows.
                       file_data$path <- rep(x, nrow(file_data))
                       file_data
                     })
temperature_timeseries
using ldply()
mutate
)qplot
)