import os
print(os.getcwd()) # Python: print the current working directory
Setup
#this line makes sure that you have the packages pacman ("package manager") and devtools ("developer tools") installed, so that all the required packages can be installed and loaded for the rest of the scripts to run smoothly
if(!require(pacman)) {
install.packages("pacman")
install.packages("devtools")
library(pacman)
}
pacman::p_load(rstudioapi, tidyverse, ggthemes, corrplot, moments, afex, viridis, Rmisc, glmnet, caret, MASS, GGally, emmeans)
#name of package -> why or what does it do ? (but feel free to type ?? followed by the package or function you want information about, e.g. type ??Rmisc::summarySEwithin in your console)
#rstudioapi -> getActiveDocumentContext() =>Retrieve Information about an RStudio Editor
#tidyverse -> loads pretty much all your first-aid needs for data science (ggplot2, tibble, tidyr, readr, dplyr, purrr and so much more) .. a bit on the heavy side though
#ggthemes -> extra themes for ggplots
#corrplot -> for matrix correlation plots
#moments -> skewness metrics
#afex -> anova, lmer and all that stuff
#viridis -> extra color palettes (specially designed to be readable by those with color blindness)
# Rmisc -> summarySEwithin => Summarize within-subjects data
#glmnet -> generalized linear models via penalized maximum likelihood
#caret -> Classification And REgression Training utilities
#MASS -> Modern Applied Statistics with S (needed for PES_CI function)
#GGally -> extension to ggplot2
#emmeans -> to calculate estimated marginal means of a model
#PS: you don't need to call packagename::function (e.g. devtools::source_gist()) to use source_gist() when the package (here 'devtools') has been loaded, but I do it in these scripts so you know where the functions come from (and it's generally good practice to do so)
devtools::source_gist("2a1bb0133ff568cbe28d", filename = "geom_flat_violin.R") # to download small scripts (AKA "gists")
options(scipen = 666, warn=-1, contrasts=c("contr.sum","contr.poly")) #these are general options for outputs
#PPS: I sometimes use ';' to collapse two lines together for efficiency (e.g. 'a = 1; b = 2'); it basically just acts as a virtual line break
Load data
#homepath = dirname(rstudioapi::getActiveDocumentContext()$path) #only works in RStudio; otherwise do it like this:
homepath = getwd()
#data <- read_csv("data/data.csv") # or
data <- read_csv(paste(homepath, "data/data.csv", sep="/"))
homepath = pwd; % MATLAB/Octave equivalent of getwd()
disp(homepath)
pipreqs will automatically save your script's module requirements in a ./requirements.txt file:
$ pip install pipreqs # install pipreqs on your machine
$ pipreqs . # run that in the directory where you have your script
Then someone can run “pip install -r requirements.txt” before running your script.
data$intervention = as.factor(data$intervention); data$id = as.factor(data$id) #factorize categorical variables
head(data)
## # A tibble: 6 x 30
## id intervention var1_ses1 var1_ses2 var2_ses1 var2_ses2 var3_ses1 var3_ses2
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 232 1 108 102 20.3 19.6 1.21 1.02
## 2 205 1 107 92 16.3 4 2 1.1
## 3 268 1 92 85 19.2 20.5 0.88 0.95
## 4 211 1 117 112 19.9 16.4 0.47 2.3
## 5 262 1 121 112 16.4 8.6 1.12 0.45
## 6 206 1 127 114 20.3 12 0.25 0.580
## # … with 22 more variables: var4_ses1 <dbl>, var4_ses2 <dbl>, var5_ses1 <dbl>,
## # var5_ses2 <dbl>, var6_ses1 <dbl>, var6_ses2 <dbl>, var7_ses1 <dbl>,
## # var7_ses2 <dbl>, var8_ses1 <dbl>, var8_ses2 <dbl>, var9_ses1 <dbl>,
## # var9_ses2 <dbl>, var10_ses1 <dbl>, var10_ses2 <dbl>, var11_ses1 <dbl>,
## # var11_ses2 <dbl>, var12_ses1 <dbl>, var12_ses2 <dbl>, var13_ses1 <dbl>,
## # var13_ses2 <dbl>, var14_ses1 <dbl>, var14_ses2 <dbl>
nums1 <- unlist(lapply(data, is.numeric)) #find all numeric variables
## check ??pivot_longer
df_plot = data %>% pivot_longer( # "pivot" the data from wide to long
cols = names(data[ , nums1]), #select all numeric variables to be gathered
names_to = c("var", "session"), #choose names of new grouping columns (categorical)
names_pattern = "var(.*)_(.*)", #regex pattern used to split the column names; "(.*)" means "capture everything that fits here"
values_to = "measure" #name of the dv
)
df_plot$session = as.factor(df_plot$session) #factorize categorical variables
df_plot$var = ordered(as.numeric(df_plot$var)) #order for neat plots
head(df_plot) #I did that here so I have 1 column with all the numerical data that I can plot along different variables
## # A tibble: 6 x 5
## id intervention var session measure
## <fct> <fct> <ord> <fct> <dbl>
## 1 232 1 1 ses1 108
## 2 232 1 1 ses2 102
## 3 232 1 2 ses1 20.3
## 4 232 1 2 ses2 19.6
## 5 232 1 3 ses1 1.21
## 6 232 1 3 ses2 1.02
df = df_plot %>% pivot_wider(names_from = var, values_from = measure) # now here I spread the 1 column data into unique columns for each variable
head(df) #this is not the same as the beginning because I now have 1 column for each measurement (instead of 1 column per measurement per session)
## # A tibble: 6 x 17
## id intervention session `1` `2` `3` `4` `5` `6` `7` `8`
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 232 1 ses1 108 20.3 1.21 220. 1.49 0.554 1.22 5.8
## 2 232 1 ses2 102 19.6 1.02 219. 1.5 1.44 0.820 5.3
## 3 205 1 ses1 107 16.3 2 32.0 14.2 0.185 1.26 5.7
## 4 205 1 ses2 92 4 1.1 13.2 5.04 0.181 0.801 4.8
## 5 268 1 ses1 92 19.2 0.88 24.4 0 0 0 5.9
## 6 268 1 ses2 85 20.5 0.95 25.5 0 0 0.326 5.3
## # … with 6 more variables: 9 <dbl>, 10 <dbl>, 11 <dbl>, 12 <dbl>, 13 <dbl>,
## # 14 <dbl>
nums <- unlist(lapply(df, is.numeric)) #find numeric variables again
#plot densities
dens_plot = ggplot(df_plot)+
geom_density(aes(x=measure))+
facet_wrap(~var, scales = "free")+ #separate plots by variable
labs(title = "Density per variable")+
theme(axis.title = element_text()) +
ylab("Density") + xlab('Value'); dens_plot
corrplot::corrplot(cor(df[ , nums], use="pairwise.complete.obs"), type="lower") #using corrplot to plot correlations
#box plot
box_plot = ggplot(df_plot)+
geom_boxplot(aes(x=intervention, y=measure))+
scale_x_discrete(labels=c("Placebo", "Control")) +
facet_wrap(~var, scales = "free")+
labs(title = "Boxplot by group")+
ggthemes::theme_fivethirtyeight()+
theme(axis.title = element_text()) +
ylab("predictors") + xlab(''); box_plot
Try to “clean” the dataset (e.g. remove outliers, log transform, etc.) as best as you see fit, based on the observations you made on the plots, so that you can analyze the data. Then maybe try to run a 2x2 ANOVA (watch out for the repeated measurement) on a problematic variable before and after you cleaned/transformed it.
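If you want a starting point, here is a minimal sketch of such a 2 (intervention, between) x 2 (session, within) ANOVA with afex; picking variable "4" is a purely hypothetical choice:
ex_df = df_plot %>% filter(var == "4") #hypothetical choice of a "problematic" variable
ex_aov = afex::aov_ez(id = "id", dv = "measure", data = ex_df, between = "intervention", within = "session") #2x2 mixed ANOVA
nice(ex_aov) #compare this table before and after your cleaning/transformation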
That’s cool, but it’s hard to interpret non-scaled variables, so let’s standardize them
df_scaled = df #just retaining the old df for later
df_scaled[nums] <- lapply(df_scaled[nums], scale) #standardize each column separately
#here I show how to do so with lapply which allows you to "apply" a function, in this case "scale", over a list of vectors
#Here I show how to create a function to automate a simple routine. For example, here it gathers a "1 column per variable" format into an extra long "1 column" format for plotting.
#You don't need that in any case but being familiar with how to do so is one of the objectives of this workshop and will also make your life much easier in the long run.
#pivot_re will be the name of the function and 'data' and 'nums' will be its arguments
#'data' is.. the data you want to modify and 'nums' is the index of the columns you want to gather
pivot_re <- function(data, nums) {
x = data %>% pivot_longer(
cols = names(data[ , nums]),
names_to = "var",
values_to = "measure")
x$var = ordered(as.numeric(x$var)) #order for neat plots
return(x)
} #I would usually put my functions either at the beginning or in a separate file for more readability but that will be in another workshop
df_scaled_plot = pivot_re(df_scaled, nums) #an example of how to use it
# we use %+% to change the dataset of a ggplot object without needing to rewrite the whole code again (neat or not ?)
box_plot_scaled = box_plot %+% df_scaled_plot + labs(title = "Boxplot by group (standardized)")+ ylab("standardized predictors") ; box_plot_scaled
#if you ever want/need to remove outliers based solely on the SD, here is a little helper
#arguments are dataraw: the non-standardized data, datasd: the standardized data, sd: the cutoff for max/min standard deviation, nums: index of the columns to inspect
remove_out <- function(dataraw, datasd, sd, nums){
#creating a dictionary with our parameters
df_dict <- data.frame(variable = names(datasd[nums]), out_low = rep(-sd,length(names(datasd[nums]))), out_high = rep(sd,length(names(datasd[nums]))))
#for loop to go through each observation and replace SD-based outliers with NaN
for (var in df_dict$variable) {
dataraw[[var]] [datasd[[var]] < df_dict[df_dict$variable == var, ]$out_low | datasd[[var]] > df_dict[df_dict$variable == var, ]$out_high] <- NaN}
return(dataraw)
}
#for example
#df_out = remove_out(df,df_scaled, 3, nums)
col_names = 1:14 %>% str_c("var", .); col_names # create a vector of strings with the variable names
## [1] "var1" "var2" "var3" "var4" "var5" "var6" "var7" "var8" "var9"
## [10] "var10" "var11" "var12" "var13" "var14"
diff_names = as.character(1:14)# create vector of variable names
#function to create difference scores
#arguments are data: the data, col_names: vector of string of the variable names
diff_column <- function(data, col_names) {
i = 0 # initialize
for (col in col_names) {
i = i+1 # counter
data[,str_c(i)] = data[,str_c(col, "_ses2")] - data[,str_c(col, "_ses1")]
#create a new column (difference score) named after the counter: var"x"_ses2 minus var"x"_ses1
}
return(data)
}
df_diff = diff_column(data, col_names) # example
df_diff = df_diff[c("id", "intervention",diff_names)] # keep only columns of interest (here only diff scores to investigate baseline changes)
df_diff[diff_names] = scale(df_diff[diff_names]) #scale
df_out = remove_out(df_diff,df_diff, 5, diff_names) # bluntly remove outliers
df_out[diff_names] = scale(df_out[diff_names], scale = F) #center at 0 for ANOVA and stuff
df_out = na.omit(df_out) # remove missing data (we wouldn't have to do that if we used lmer, but that's out of this workshop's scope)
df_diff_plot <- pivot_re(df_out, diff_names) # prepare for plot
#box plot
box_plot_diff= box_plot %+% df_diff_plot + labs(title = "Boxplot by group (diff score Z)")+ ylab("standardized diff scores") ; box_plot_diff
#dens plot
dens_plot_diff= dens_plot %+% df_diff_plot + geom_density(aes(x=measure, fill = intervention)) + xlab("standardized diff scores") ; dens_plot_diff
#or you could just plot everything in one command in a massive matrix plot
GGally::ggpairs(df_diff[, -c(1)], aes(colour=df_diff$intervention, alpha=0.4))
Check skewness ! Transform: log ? sqrt? log1p?
X = lapply(na.omit(df[diff_names]), moments::skewness); rbind(X) #check # i just use rbind to bind the data vertically (cbind to bind horizontally)
## 1 2 3 4 5 6 7 8
## X 0.3268273 2.288469 1.824335 5.652491 1.678218 5.366602 1.63765 -0.3543178
## 9 10 11 12 13 14
## X 2.647198 1.101269 2.009514 1.832194 1.108136 0.1617942
df_sqrt = df; df_sqrt[diff_names] <- lapply(df_sqrt[diff_names], sqrt); df_sqrt_plot = pivot_re(df_sqrt, diff_names) #create square root transformed df
df_log = df; df_log[diff_names] <- lapply(df_log[diff_names], log); df_log_plot = pivot_re(df_log, diff_names) #create log transformed df
df_log1p = df; df_log1p[diff_names] <- lapply(df_log1p[diff_names], log1p); df_log1p_plot = pivot_re(df_log1p, diff_names) #create log(1+x) df
#Square root (handles non negative data and zeroes)
dens_sqrt_plot = dens_plot_diff %+% df_sqrt_plot + labs(title = "Density per variable (square root transformed)"); dens_sqrt_plot
X = lapply(na.omit(df_sqrt[diff_names]), moments::skewness); rbind(X) # calculate skewness of the data
## 1 2 3 4 5 6 7 8
## X 0.1170117 1.439333 0.6457027 2.874378 0.9847146 1.839002 0.2557674 -1.169921
## 9 10 11 12 13 14
## X 1.645342 0.6667252 1.152868 1.14676 0.6013354 -0.1268951
#Log (handles positive data but NOT zeroes)
dens_log_plot = dens_plot_diff %+% df_log_plot + labs(title = "Density per variable (log transformed)"); dens_log_plot
X = lapply(na.omit(df_log[diff_names]), moments::skewness); rbind(X) # here we see NaN appearing, why ? because we introduced -Inf ( log(0) = -Inf ) !
## 1 2 3 4 5 6 7 8 9 10
## X -0.1002578 0.5115342 -0.5802683 NaN NaN NaN NaN -2.275663 0.5160858 0.1903609
## 11 12 13 14
## X 0.4194621 0.598077 0.1029559 -0.4439815
#Log1p (handles non negative data AND zeroes)
dens_log1p_plot = dens_plot_diff %+% df_log1p_plot + labs(title = "Density per variable (log + 1 transformed)"); dens_log1p_plot
X = lapply(na.omit(df_log1p[diff_names]), moments::skewness); rbind(X) # here we see NO NaN appearing, why ? because log(0+1) = 0 !
## 1 2 3 4 5 6 7
## X -0.09563731 0.6125633 0.4990023 -0.04983626 0.8047163 1.344196 0.4472332
## 8 9 10 11 12 13 14
## X -1.786739 0.9200051 0.7775229 1.031717 0.8849994 0.4274009 -0.4367489
#because zero-inflated/ skewed data => problem
## Create a formula for a model with a large number of variables:
fmla <- as.formula(paste(" `1` ~ intervention + ", paste0(sprintf("`%s`", diff_names[c(-1,-2)]), collapse= "+"), " + Error(id)")); fmla # overly complicated for what it does but I'm just THAT lazy
## `1` ~ intervention + `3` + `4` + `5` + `6` + `7` + `8` + `9` +
## `10` + `11` + `12` + `13` + `14` + Error(id)
#this is really just to demonstrate the influence of outliers; do not take it as an example of what to do with this data
#with huge outliers
model1 = aov_car(fmla, data= na.omit(df_diff), factorize = F, anova_table = list(correction = "GG", es = "pes"))
nice(model1, MSE=F); #emmeans::ref_grid(model1) #triple check everything is centered at 0
## Anova Table (Type 3 tests)
##
## Response: 1
## Effect df F pes p.value
## 1 intervention 1, 40 26.24 *** .396 <.001
## 2 `3` 1, 40 1.30 .031 .261
## 3 `4` 1, 40 2.03 .048 .162
## 4 `5` 1, 40 0.16 .004 .690
## 5 `6` 1, 40 0.62 .015 .436
## 6 `7` 1, 40 0.08 .002 .779
## 7 `8` 1, 40 7.64 ** .160 .009
## 8 `9` 1, 40 8.30 ** .172 .006
## 9 `10` 1, 40 2.24 .053 .142
## 10 `11` 1, 40 2.46 .058 .125
## 11 `12` 1, 40 0.08 .002 .784
## 12 `13` 1, 40 0.12 .003 .730
## 13 `14` 1, 40 0.87 .021 .356
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '+' 0.1 ' ' 1
#without huge outliers
model = aov_car(fmla, data= na.omit(df_out), factorize = F, anova_table = list(correction = "GG", es = "pes"))
nice(model, MSE=F); #emmeans::ref_grid(model) #triple check everything is centered at 0
## Anova Table (Type 3 tests)
##
## Response: 1
## Effect df F pes p.value
## 1 intervention 1, 38 16.82 *** .307 <.001
## 2 `3` 1, 38 0.04 .001 .839
## 3 `4` 1, 38 3.01 + .073 .091
## 4 `5` 1, 38 0.01 <.001 .914
## 5 `6` 1, 38 2.63 .065 .113
## 6 `7` 1, 38 0.04 .001 .837
## 7 `8` 1, 38 0.12 .003 .732
## 8 `9` 1, 38 0.61 .016 .438
## 9 `10` 1, 38 0.02 <.001 .900
## 10 `11` 1, 38 1.35 .034 .253
## 11 `12` 1, 38 0.08 .002 .773
## 12 `13` 1, 38 0.50 .013 .483
## 13 `14` 1, 38 0.60 .016 .442
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '+' 0.1 ' ' 1
devtools::source_gist("383aa93ffa161665c0dca1103ef73d9d", filename = "effect_CI.R") # to download pes_ci # function to calculate CI around partial eta squared
pes_ci(fmla, data= na.omit(df_out), factorize = F) # default CI is 90% -> here is why https://daniellakens.blogspot.com/2014/06/calculating-confidence-intervals-for.html
## Partial eta-squared 90 % CI lower limit 90 % CI upper limit
## intervention 0.3067634438 0.1132256 0.46473856
## `3` 0.0010969704 0.0000000 0.05592432
## `4` 0.0734635701 0.0000000 0.22436663
## `5` 0.0003095440 0.0000000 0.02639789
## `6` 0.0647322279 0.0000000 0.21238097
## `7` 0.0011270533 0.0000000 0.05654807
## `8` 0.0031194365 0.0000000 0.08035206
## `9` 0.0159248801 0.0000000 0.12804570
## `10` 0.0004191403 0.0000000 0.03357207
## `11` 0.0342722172 0.0000000 0.16511509
## `12` 0.0022140274 0.0000000 0.07220268
## `13` 0.0130401838 0.0000000 0.12064843
## `14` 0.0156121645 0.0000000 0.12727930
#this is just stuff to make the formatting nice
averaged_theme <- theme_bw(base_size = 24, base_family = "Helvetica")+ #font stuff
theme(strip.text.x = element_text(size = 24, face = "bold"),
strip.background = element_rect(color="white", fill="white", linetype="solid"),
legend.position=c(.9,.9),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
legend.key.size = unit(0.2, "cm"),
legend.key = element_rect(fill = "transparent", colour = "transparent"),
panel.grid.major.x = element_blank() ,
panel.grid.major.y = element_line(size=.2, color="lightgrey") ,
panel.grid.minor = element_blank(),
axis.title.x = element_text(size = 22),
axis.title.y = element_text(size = 22),
axis.line = element_line(size = 0.5),
panel.border = element_blank())
pal = viridis::viridis(n=3) # color palette specially conceived for color blindness (just input the number of categories you want; here I took 3 instead of two to have more room)
# AVERAGED EFFECT
dfH <- summarySEwithin(df,
measurevar = "1",
betweenvars = "intervention",
withinvars = "session",
idvar = "id", na.rm = T) #get standard error adjusted for within design
#this is to have more control on my jittering but really not necessary
dfH$cond <- ifelse(dfH$session == "ses1", -0.25, 0.25) # means and se
df$cond <- ifelse(df$session == "ses1", -0.25, 0.25) # individual points
#(what I do is transform my categorical data into dummy numerical data so I can "jitter" it)
set.seed(666)#setting "random seed" so the random jitter will always be the same
df <- df %>% mutate(condjit = jitter(as.numeric(cond), 0.3),
grouping = interaction(id, cond))
labels = c("0"="Placebo", "1"="Treatment") # labels for top strip
plt <- ggplot(df, aes(x = cond, y = `1`, fill = session, color = session)) +
geom_point(data = dfH, alpha = 0.5) + # means
geom_line(aes(x = condjit, group = id, y = `1`), alpha = .3, size = 0.5, color = 'gray') + # lines that connect individuals across time
geom_flat_violin(scale = "count", trim = FALSE, alpha = .2, aes(fill = session, color = NA))+
geom_point(aes(x = condjit), alpha = .3) + # individual points
geom_crossbar(data = dfH, aes(y = `1`, ymin = `1`-ci, ymax = `1`+ci), width = 0.2 , alpha = 0.1)+ #error bars +- CI
ylab('Var 1') +
xlab('') +
#scale_y_continuous(expand = c(0, 0), breaks = c(seq.int(0,100, by = 20)), limits = c(-0.5,100.5)) + # choose limits
scale_x_continuous(labels=c("Pre", "Post"), breaks = c(-.25,.25), limits = c(-.5,.5)) + # rename levels of categorical variable
scale_fill_manual(values=c("ses1"= pal[1], "ses2"=pal[2]), guide = 'none') + # assign color from palette
scale_color_manual(values=c("ses1"=pal[1], "ses2"=pal[2]), guide = 'none') + # assign color from palette
theme_bw() + facet_wrap(~intervention, labeller=labeller(intervention =labels)) + #split plot into two groups for more readability
averaged_theme ; plt # add theme adjustments
#if you want to save the figures
pdf('figures/inter.pdf')
print(plt)
dev.off()
## png
## 2
Let’s say you want to test for the significance of the intervention variable (placebo VS treatment) on var1. You also want to control for baseline differences between groups (session pre-post) and other nuisance covariates (e.g. age, weight, number of cigarettes smoked per week, etc.). Here you have many “nuisance” variables (var2 all the way to var20) and not that many observations…
One thing you could do is enter ALL of them into an ANCOVA, but that’s a bit dubious because you are artificially decreasing your degrees of freedom (and increasing the chances of multicollinearity). We are not gonna go too much into the details right here, but try to think about why we generally wouldn’t want that.
Another thing one might do is enter all these variables, remove the ones that are not “statistically significant”, and then rerun the analysis with only the ones left. Again, that is definitely not the best way to go, because you are doing multiple tests (and also the significance of the remaining variables can change once you remove the others).
So one way to circumvent these issues is to first implement a proper variable selection method (not based on p-values). You can achieve this via several methods. One way could be to fit different models (with and without certain variables) and compare them with a model selection criterion (AIC, BIC, SIC, LOO, R², etc.) to only keep the “best” model.
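For instance, a minimal sketch of such a comparison with AIC could look like this (it assumes df_out from above, and the reduced model is a purely hypothetical choice):
m_full = glm(intervention ~ ., data = df_out[-1], family = binomial) #model with all variables
m_small = glm(intervention ~ `1` + `14`, data = df_out[-1], family = binomial) #hypothetical reduced model
AIC(m_full, m_small) #lower AIC = better trade-off between fit and complexity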
Alternatively, you could select only the variables that explain a certain threshold of variance (i.e. how useful is this variable to predict an outcome, here “is the individual in the placebo or treatment group?”). Finally, you could also create composite variables (components) that reflect most of the variance of those variables (however, try to think about what the drawback of this last method is if you use it).
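As an illustration of that last option, here is a minimal sketch using PCA on the (already scaled) diff scores in df_out; keeping 2 components is a purely hypothetical choice:
pca = prcomp(df_out[diff_names], scale. = TRUE) #principal component analysis on the numeric columns
summary(pca) #proportion of variance explained by each component
df_pca = cbind(df_out[c("id", "intervention")], pca$x[, 1:2]) #keep e.g. only the first 2 components
#drawback: the components are much harder to interpret than the original variables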
This is a rather complex and technical exercise (as well as time-consuming and computationally heavy), but I think it’s definitely essential for scientists to know how to select variables (features) from a dataset.
PS: No need to do the ANCOVA; the purpose of this exercise is really about which variables you would retain in the final model, and there is definitely no “right or wrong” answer.
Using penalized (lasso) maximum likelihood for variable selection (cv.glmnet), i.e. finding the minimum lambda (regularization parameter) to infer the best number of features to use in order to simplify the model and avoid overfitting.
Using that because:
- Handles the problem of correlated inputs
- Performs better than stepwise selection
#1) with Recursive Feature Eliminations (CARET)
control <- rfeControl(functions=rfFuncs, method="cv", number=10) # control options
rfeResults = rfe(x = df_out[c(-1, -2)], y = df_out$intervention, sizes=1:14, rfeControl=control)
predictors(rfeResults)
## [1] "14" "1" "3" "6"
plot(rfeResults, type=c("g", "o")) # look for the "elbow"
#2) with Stepwise Feature Selection
full.model = glm(intervention ~., data = df_out[-1], family = binomial)
step.model <- full.model %>% MASS::stepAIC(trace = FALSE) #Choose a model by AIC in a Stepwise Algorithm
coef(step.model)
## (Intercept) `1` `10` `14`
## -65.63880 -285.86268 50.34475 244.99213
#3) with lasso regularization
set.seed(123);
mat<-model.matrix(intervention~.,data=df_out[-1]); mat=mat[,-1] # remove intercept
foldid <- sample(rep(seq(3), length.out = nrow(df_out[-1]))) # to be reproducible
fit<-glmnet::cv.glmnet(x=mat,y=df_out[-1]$intervention,family ="binomial",type.measure='mse',alpha=1) #, foldid = foldid, nfolds = 3, standardize = FALSE)
plot(fit)
c<-coef(fit,s='lambda.min',exact=TRUE); inds<-which(c!=0) #keep only the variables whose lasso coefficient is different from 0
variables<-row.names(c)[inds]; variables<-variables[!variables %in% '(Intercept)']; variables #extract those variables
## [1] "`1`" "`3`" "`10`" "`14`"
df_diff_plot <- pivot_re(df_diff, c("1", "3", "10", "14"))
#plot
GGally::ggpairs(df_diff[, c("intervention", "1", "3", "10", "14")], aes(colour=df_diff$intervention, alpha=0.4))
Now we are entering the realm of task-based functional MRI. We will definitely go into more detail in a future workshop, but let’s just go through a bit of the data wrangling anyone confronted with fMRI will have to go through.
Let’s start by loading the events associated with an fMRI acquisition. Here (in data/sub-control100/data.mat) is just an example of a typical file from which you will have to extract: the onsets (AKA when the stuff you showed to your participant appeared), the durations (how long it appeared) and the behavioral data (also generally called modulators) associated with your task (e.g. reaction times or liking ratings for each condition). You WILL need these regressors to model your fMRI data later on.
The thing is.. these files are generally (at least here in Geneva) generated for each participant in a .mat format. So that’s why I gave it to you here like you probably get it in your lab too. If you don’t feel concerned by that because your lab has moved on to Python as its presentation software for fMRI, good for you, and you can skip this. Otherwise this task might be quite instructive for you. So let’s dive in.
These files contain a MATLAB “structure array” which, simply put, is “a data type that groups related data using data containers called ‘fields’. Each field can contain any type of data”. This might be confusing at first (trust me .. it is), so let’s go step by step. This is a really simple design where we have only 3 odor conditions and the “start” of each trial.
load('data/sub-control100/data.mat') %loading data
disp('Each structure has two fields named:'); disp(fieldnames(durations2))
%to access the data in a field you must use dot notation (e.g. structName.fieldName)
disp(['The "start" field contains ', num2str(length(durations2.start)), ' observations'])
disp('The "odor" field has 3 nested substructures named'); disp(fieldnames(durations2.odor))
disp(['Each of the 3 nested substructures in the "odor" field contains ', num2str(length(durations2.odor.reward)), ' observations'])
Each structure has two fields named:
{
[1,1] = odor
[2,1] = start
}
The "start" field contains 54 observations
The "odor" field has 3 nested substructures named
{
[1,1] = reward
[2,1] = neutral
[3,1] = control
}
Each of the 3 nested substructures in the "odor" field contains 18 observations
Now, what we would like for most MRI software is a “3-column timing file” with, respectively: the onset time (in seconds), the duration (in seconds) and the relative magnitude of each stimulus (set to 1 for none). I put examples of what it should look like in the /data folder.
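Just so you can picture the layout, such a file is plain text with one row per event and three columns (onset, duration, magnitude); the values below are purely made up for illustration:
12.5   2.0   1
45.8   2.0   1
80.1   2.0   1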
So basically the task will be to recreate that type of file for sub-01 and make your code easily scale to more subjects (I just duplicated sub-01 into more subjects so you can test your code).
Of course you are not forced to use matlab/octave here to achieve that.
import scipy.io
mat = scipy.io.loadmat('data/sub-control100/data.mat')
dict.keys(mat) #it's read as a dictionary so this is gonna get ugly but stay with me
#mat["modulators2"]["start"][0] #this gets you an array within an array !not good enough!
## dict_keys(['__header__', '__version__', '__globals__', 'durations2', 'modulators2', 'onsets2'])
mat["modulators2"]["start"][0][0][0:2] #this actually get you the array (but remove the [0:2])
#mat["modulators2"]["odor"]["control"][0][0] #this doesn't work!
## array([[1.],
## [1.]])
mat["modulators2"]["odor"][0][0]["control"][0][0][0:2] #you have to get your dictionary from the array first ! (but remove the [0:2])
## array([[67.79661017],
## [50. ]])
Try to create a dataset that includes each subject's behavioral data appended together, with a column with their ID (the numbers following control or treatment), a column with their group (control or treatment), a column with the condition name (reward, control or neutral), a column with their behavioral data (from the modulators2.odor field) and a column with the trial number (1-54). Take extra care to put the behavioral data back into the right order (you can get that by sorting them by their onsets !). Hang in there..
subject = dir("data", pattern="sub-*", all.files=T, full.names=T) #find data via pattern
data = tibble() #initialize
#go through subjects
for (s in subject) {
mat <- R.matlab::readMat(paste(s, "/data.mat", sep="")) #read mat file
#start
onset = mat$onsets2[[2]]
duration = mat$durations2[[2]]
modulator = mat$modulators2[[1]] #watch out, it's reversed here !
start = cbind(onset, duration, modulator) #bind the 3 columns side by side
write.table(start, paste(s, "/start.txt", sep=""), row.names = F, sep="\t", col.names = F) # write to txt (or csv or tsv)
#odors
##reward
onset = mat$onsets2[[1]][[1]]
duration = mat$durations2[[1]][[1]]
modulator = as_tibble(mat$modulators2[[2]][[1]]) #watch out, it's reversed here !
odor_rew = cbind(onset, duration, modulator)
write.table(odor_rew, paste(s, "/odor_rew.txt", sep=""), row.names = F, sep="\t", col.names = F)
modulator$condition = "reward"; mod_rew = cbind(modulator, onset) #add the condition label and keep the onsets for sorting later
##neutral
onset = mat$onsets2[[1]][[2]]
duration = mat$durations2[[1]][[2]]
modulator = as_tibble(mat$modulators2[[2]][[2]]) #watch out, it's reversed here !
odor_neu = cbind(onset, duration, modulator)
write.table(odor_neu, paste(s, "/odor_neu.txt", sep=""), row.names = F, sep="\t", col.names = F)
modulator$condition = "neutral"; mod_neu = cbind(modulator, onset) #add the condition label and keep the onsets for sorting later
##control
onset = mat$onsets2[[1]][[3]]
duration = mat$durations2[[1]][[3]]
modulator = as_tibble(mat$modulators2[[2]][[3]]) #watch out, it's reversed here !
odor_con = cbind(onset, duration, modulator)
write.table(odor_con, paste(s, "/odor_con.txt", sep=""), row.names = F, sep="\t", col.names = F)
modulator$condition = "control"; mod_con = cbind(modulator, onset) #add the condition label and keep the onsets for sorting later
#behavioral
df = rbind (mod_rew, mod_neu, mod_con); #stack them
df = df[order(df$onset),] #order by onset
df$trial = 1:54 #trial number vector
df$id = str_sub(s,-3,-1) #extract the last 3 characters of the string (i.e. subject ID)
df$group = str_sub(s,-10,-4) #extract characters -10 to -4 of the string (i.e. group)
data = rbind(data,df) #stack them
}
There you are, you finished this list of exercises. Good job! You deserve a pat on the shoulder and a good cup (jar?) of coffee.
I hope you learned things that will be useful for your research/career, and let me know what you liked/disliked.