Data wrangling workshop for Neuroscience Master students - Part I

1) Make your code reproducible (AKA make it work on other machines)

For python:

import os
print(os.getcwd()) # print the current working directory

For R:

Setup

#this line makes sure that you have the packages pacman ("package manager") and devtools ("developer tools") installed, so you can install and load all the packages required for the rest of the scripts to run smoothly
if(!require(pacman)) install.packages("pacman")
if(!require(devtools)) install.packages("devtools")
library(pacman)
pacman::p_load(rstudioapi, tidyverse, ggthemes, corrplot, moments, afex, viridis, Rmisc, glmnet, caret, MASS, GGally, emmeans) 
#name of package   ->  #why or what does it do? (but feel free to type ?? followed by the package or function you want information about, e.g. type ??Rmisc::summarySEwithin in your console)

#rstudioapi ->  getActiveDocumentContext() =>Retrieve Information about an RStudio Editor
#tidyverse -> loads pretty much all your first-aid needs for data science (ggplot2, tibble, tidyr, readr, dplyr, purrr and so much more) .. a bit on the heavy side though
#ggthemes -> extra themes for ggplots
#corrplot -> for matrix correlation plots
#moments -> skewness metrics
#afex -> anova, lmer and all that stuff 
#viridis -> extra color palettes (specially designed to be readable by people with colorblindness)
# Rmisc -> summarySEwithin => Summarize within-subjects data
#glmnet -> generalized linear models via penalized maximum likelihood
#caret -> Classification And REgression Training utilities
#MASS -> Modern Applied Statistics with S (needed for PES_CI function)
#GGally -> extension to ggplot2
#emmeans -> to calculate estimated marginal means of a model

#PS: you don't need to call packagename::function (for example devtools::source_gist() instead of just source_gist()) once the package (here 'devtools') has been loaded, but I do it in these scripts so you know where each function comes from (and it's generally good practice to do so anyway)

devtools::source_gist("2a1bb0133ff568cbe28d", filename = "geom_flat_violin.R") # to download small scripts (AKA "gists") 

options(scipen = 666, warn=-1, contrasts=c("contr.sum","contr.poly")) #these are general options for outputs

#PPS: I sometimes use ';' to collapse two lines together for brevity; it basically just acts as a virtual line break

Load data

#homepath = dirname(rstudioapi::getActiveDocumentContext()$path) #only works within RStudio; otherwise use getwd() like below
homepath = getwd()
#data <- read_csv("data/data.csv") # or
data <- read_csv(paste(homepath, "data/data.csv", sep="/"))

For Matlab/Octave:

homepath = pwd;
disp(homepath)

For Python (do this in your terminal when you have finished your scripts):

pipreqs will automatically save the module requirements of your scripts in a ./requirements.txt file

$ pip install pipreqs # install pipreqs on your machine
$ pipreqs .  #run this in the directory where you have your scripts

Then someone else can run “pip install -r requirements.txt” to get the same dependencies before running your script.
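
For reference, requirements.txt is just a plain text list of packages (pipreqs also pins the versions it finds on your machine). For the Python snippets in this workshop it would contain at least scipy; the exact version below is purely illustrative.

# requirements.txt (illustrative content)
scipy==1.10.1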

2) Rearrange the data

data$intervention = as.factor(data$intervention); data$id = as.factor(data$id) #factorize categorical variables
head(data)
## # A tibble: 6 x 30
##   id    intervention var1_ses1 var1_ses2 var2_ses1 var2_ses2 var3_ses1 var3_ses2
##   <fct> <fct>            <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1 232   1                  108       102      20.3      19.6      1.21     1.02 
## 2 205   1                  107        92      16.3       4        2        1.1  
## 3 268   1                   92        85      19.2      20.5      0.88     0.95 
## 4 211   1                  117       112      19.9      16.4      0.47     2.3  
## 5 262   1                  121       112      16.4       8.6      1.12     0.45 
## 6 206   1                  127       114      20.3      12        0.25     0.580
## # … with 22 more variables: var4_ses1 <dbl>, var4_ses2 <dbl>, var5_ses1 <dbl>,
## #   var5_ses2 <dbl>, var6_ses1 <dbl>, var6_ses2 <dbl>, var7_ses1 <dbl>,
## #   var7_ses2 <dbl>, var8_ses1 <dbl>, var8_ses2 <dbl>, var9_ses1 <dbl>,
## #   var9_ses2 <dbl>, var10_ses1 <dbl>, var10_ses2 <dbl>, var11_ses1 <dbl>,
## #   var11_ses2 <dbl>, var12_ses1 <dbl>, var12_ses2 <dbl>, var13_ses1 <dbl>,
## #   var13_ses2 <dbl>, var14_ses1 <dbl>, var14_ses2 <dbl>
nums1 <- unlist(lapply(data, is.numeric)) #find all numeric variables

## check ??pivot_longer
df_plot = data %>% pivot_longer( # "pivot" the data from wide to long
  cols = names(data[ , nums1]),  #select all the numeric variables to be gathered
  names_to = c("var", "session"), #choose names of new grouping columns (categorical)
  names_pattern = "var(.*)_(.*)", #pattern used to split the column names: "(.*)" means "capture everything that matches here"
  values_to = "measure" #name of the dv
) 

df_plot$session = as.factor(df_plot$session) #factorize categorical variables
df_plot$var = ordered(as.numeric(df_plot$var)) #order for neat plots

head(df_plot) #I did this so I have 1 column with all the numerical data, which I can then plot along the different variables
## # A tibble: 6 x 5
##   id    intervention var   session measure
##   <fct> <fct>        <ord> <fct>     <dbl>
## 1 232   1            1     ses1     108   
## 2 232   1            1     ses2     102   
## 3 232   1            2     ses1      20.3 
## 4 232   1            2     ses2      19.6 
## 5 232   1            3     ses1       1.21
## 6 232   1            3     ses2       1.02
df = df_plot %>% pivot_wider(names_from = var, values_from = measure) # now here I spread the 1 column data into unique columns for each variable 

head(df) #this is not the same as at the beginning because I now have 1 column per measurement (instead of 1 column per measurement per session)
## # A tibble: 6 x 17
##   id    intervention session   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`
##   <fct> <fct>        <fct>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 232   1            ses1      108  20.3  1.21 220.   1.49 0.554 1.22    5.8
## 2 232   1            ses2      102  19.6  1.02 219.   1.5  1.44  0.820   5.3
## 3 205   1            ses1      107  16.3  2     32.0 14.2  0.185 1.26    5.7
## 4 205   1            ses2       92   4    1.1   13.2  5.04 0.181 0.801   4.8
## 5 268   1            ses1       92  19.2  0.88  24.4  0    0     0       5.9
## 6 268   1            ses2       85  20.5  0.95  25.5  0    0     0.326   5.3
## # … with 6 more variables: 9 <dbl>, 10 <dbl>, 11 <dbl>, 12 <dbl>, 13 <dbl>,
## #   14 <dbl>
nums <- unlist(lapply(df, is.numeric)) #find numeric variables again

3) Plot the data

#plot densities
dens_plot = ggplot(df_plot)+
  geom_density(aes(x=measure))+
  facet_wrap(~var, scales = "free")+ #separate plots by variable
  labs(title = "Density per variable")+
  theme(axis.title = element_text()) + 
  ylab("Density") + xlab('Value'); dens_plot

corrplot::corrplot(cor(df[ , nums], use="pairwise.complete.obs"), type="lower") #using corrplot to plot correlations

#box plot
box_plot = ggplot(df_plot)+
  geom_boxplot(aes(x=intervention, y=measure))+
  scale_x_discrete(labels=c("Placebo", "Control")) + 
  facet_wrap(~var, scales = "free")+
  labs(title = "Boxplot by group")+
  ggthemes::theme_fivethirtyeight()+
  theme(axis.title = element_text()) + 
  ylab("predictors") + xlab(''); box_plot

Optional:

Try to “clean” the dataset (e.g. remove outliers, log transform, etc.) as best as you can, based on the observations you made on the plots, so that the data can be analyzed. Then maybe try a 2x2 ANOVA (watch out for the repeated measurement) on a problematic variable before and after you cleaned/transformed it, as sketched below.
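
Here is a minimal sketch (not the expected answer) of what such a 2 (intervention) x 2 (session) mixed ANOVA could look like with afex, arbitrarily picking variable 1; aov_ez handles the repeated measurement on session via the id argument.

one_var = df_plot %>% filter(var == "1") #keep a single variable to test
afex::aov_ez(id = "id", dv = "measure", data = one_var,
             between = "intervention", within = "session") #2x2 mixed ANOVA (session is within-subject)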

Scaling

That’s cool, but it’s hard to interpret non-scaled variables, so let’s standardize them.

df_scaled = df #just retaining the old df for later 
df_scaled[nums] <- lapply(df_scaled[nums], scale) #standardize each column separately
#here I show how to do so with lapply which allows you to "apply" a function, in this case "scale", over a list of vectors 


#Here I show how to create a function to automate a simple routine: in this case, gathering a "1 column per variable" format into an extra long "1 column" format for plotting.
#You don't strictly need it, but being familiar with writing functions is one of the objectives of this workshop and will make your life much easier in the long run.

#pivot_re will be the name of the function and 'data' and 'nums' will be its arguments
#'data' is.. the data you want to modify and 'nums' is the index of the columns you want to gather
pivot_re <- function(data, nums) { 
  x = data %>% pivot_longer(
  cols = names(data[ , nums]),
  names_to = "var", 
  values_to = "measure")
  x$var = ordered(as.numeric(x$var)) #order for neat plots
  return(x)
} #I would usually put my functions either at the beginning or in a separate file for readability, but that will be for another workshop

df_scaled_plot = pivot_re(df_scaled, nums) #an example of how to use it

# we use %+% to change the dataset of a ggplot object without needing to rewrite the whole code again (neat or not ?)
box_plot_scaled = box_plot %+% df_scaled_plot +   labs(title = "Boxplot by group (standardized)")+  ylab("standardized predictors") ; box_plot_scaled

#if you ever want/need to remove outliers based purely on the sd, here is a little helper
#arguments are dataraw: the non-standardized data, datasd: the standardized data, sd: the cutoff for max/min standard deviation, nums: index of the columns to inspect
remove_out <- function(dataraw, datasd, sd, nums){
  
  #creating a dictionary with our parameters
  df_dict <- data.frame(variable = names(datasd[nums]), out_low = rep(-sd,length(names(datasd[nums]))),  out_high = rep(sd,length(names(datasd[nums]))))
  
  #for loop to go through each observation and replace sd-based outliers by NaN
  for (var in df_dict$variable) {
    dataraw[[var]] [datasd[[var]] < df_dict[df_dict$variable == var, ]$out_low | datasd[[var]] > df_dict[df_dict$variable == var, ]$out_high] <- NaN}
  return(dataraw)
}

#for example
#df_out = remove_out(df,df_scaled, 3, nums)



Bi-variate outliers ?

col_names = 1:14 %>% str_c("var", .); col_names # create a vector of strings with the variable names
##  [1] "var1"  "var2"  "var3"  "var4"  "var5"  "var6"  "var7"  "var8"  "var9" 
## [10] "var10" "var11" "var12" "var13" "var14"
diff_names = as.character(1:14) # create a vector with the names of the difference-score variables

#function to create difference scores
#arguments are data: the data, col_names: a vector of strings with the variable names
diff_column <- function(data, col_names) {
  i = 0 # initialize
  for (col in col_names) {
    i = i+1 # counter
    data[,str_c(i)] = data[,str_c(col, "_ses2")] - data[,str_c(col, "_ses1")]
    #create a new variable (the difference score), named after the counter, by subtracting var"x" ses1 from var"x" ses2
  }
  return(data)
}

df_diff = diff_column(data, col_names) # example

df_diff = df_diff[c("id", "intervention",diff_names)] # keep only columns of interest (here only diff scores to investigate baseline changes)

df_diff[diff_names] = scale(df_diff[diff_names]) #scale

df_out = remove_out(df_diff,df_diff, 5, diff_names) # bluntly remove outliers

df_out[diff_names] = scale(df_out[diff_names], scale = F) #center at 0 for ANOVA and stuff

df_out = na.omit(df_out) # remove missing data (we wouldn't have to do that if we used lmer but that's out of this workshop's scope)

df_diff_plot <- pivot_re(df_out, diff_names) # prepare for plot

#box plot
box_plot_diff= box_plot %+% df_diff_plot +   labs(title = "Boxplot by group (diff score Z)")+  ylab("standardized diff scores") ; box_plot_diff

#dens plot
dens_plot_diff= dens_plot %+% df_diff_plot + geom_density(aes(x=measure, fill = intervention)) + xlab("standardized diff scores") ; dens_plot_diff

#or you could just plot everything in one command as a massive matrix plot
GGally::ggpairs(df_diff[, -c(1)], aes(colour=df_diff$intervention, alpha=0.4))

Skewness

Check skewness ! Transform: log ? sqrt? log1p?

X = lapply(na.omit(df[diff_names]), moments::skewness); rbind(X) #check # i just use  rbind to bind the data vertically (cbind to bind horizontally)
##   1         2        3        4        5        6        7       8         
## X 0.3268273 2.288469 1.824335 5.652491 1.678218 5.366602 1.63765 -0.3543178
##   9        10       11       12       13       14       
## X 2.647198 1.101269 2.009514 1.832194 1.108136 0.1617942
df_sqrt = df; df_sqrt[diff_names] <- lapply(df_sqrt[diff_names], sqrt); df_sqrt_plot = pivot_re(df_sqrt, diff_names) #create square root transformed df
df_log = df; df_log[diff_names] <- lapply(df_log[diff_names], log); df_log_plot = pivot_re(df_log, diff_names) #create log transformed df
df_log1p = df; df_log1p[diff_names] <- lapply(df_log1p[diff_names], log1p); df_log1p_plot = pivot_re(df_log1p, diff_names) #create log(1+x)  df

#Square root (handles non negative data and zeroes)
dens_sqrt_plot = dens_plot_diff %+% df_sqrt_plot + labs(title = "Density per variable (square root transformed)"); dens_sqrt_plot

X = lapply(na.omit(df_sqrt[diff_names]), moments::skewness); rbind(X) # calculate skewness of the data
##   1         2        3         4        5         6        7         8        
## X 0.1170117 1.439333 0.6457027 2.874378 0.9847146 1.839002 0.2557674 -1.169921
##   9        10        11       12      13        14        
## X 1.645342 0.6667252 1.152868 1.14676 0.6013354 -0.1268951
#Log (handles positive data but NOT zeroes)
dens_log_plot = dens_plot_diff %+% df_log_plot + labs(title = "Density per variable (log transformed)"); dens_log_plot

X = lapply(na.omit(df_log[diff_names]), moments::skewness); rbind(X) # here we see NAs appearing, why ? because we introduced -Inf ( log(0) = -Inf ) !
##   1          2         3          4   5   6   7   8         9         10       
## X -0.1002578 0.5115342 -0.5802683 NaN NaN NaN NaN -2.275663 0.5160858 0.1903609
##   11        12       13        14        
## X 0.4194621 0.598077 0.1029559 -0.4439815
#Log1p (handles non negative data AND  zeroes)
dens_log1p_plot = dens_plot_diff %+% df_log1p_plot + labs(title = "Density per variable (log + 1 transformed)"); dens_log1p_plot

X = lapply(na.omit(df_log1p[diff_names]), moments::skewness); rbind(X) # here we see NO NAs appearing, why ? because log(0+1) = 0 !
##   1           2         3         4           5         6        7        
## X -0.09563731 0.6125633 0.4990023 -0.04983626 0.8047163 1.344196 0.4472332
##   8         9         10        11       12        13        14        
## X -1.786739 0.9200051 0.7775229 1.031717 0.8849994 0.4274009 -0.4367489
#because zero-inflated/ skewed data => problem

Modeling

## Create a formula for a model with a large number of variables:
fmla <- as.formula(paste(" `1` ~ intervention + ", paste0(sprintf("`%s`",  diff_names[c(-1,-2)]), collapse= "+"), " + Error(id)")); fmla # overly complicated for what it does but I'm just THAT lazy
## `1` ~ intervention + `3` + `4` + `5` + `6` + `7` + `8` + `9` + 
##     `10` + `11` + `12` + `13` + `14` + Error(id)
#this is really just to demonstrate the influence of outliers, do not take it as an example of what to do with this data

#with huge outliers
model1 = aov_car(fmla, data= na.omit(df_diff), factorize = F, anova_table = list(correction = "GG", es = "pes"))
nice(model1, MSE=F); #emmeans::ref_grid(model1) #triple check everything is centered at 0
## Anova Table (Type 3 tests)
## 
## Response: 1
##          Effect    df         F  pes p.value
## 1  intervention 1, 40 26.24 *** .396   <.001
## 2           `3` 1, 40      1.30 .031    .261
## 3           `4` 1, 40      2.03 .048    .162
## 4           `5` 1, 40      0.16 .004    .690
## 5           `6` 1, 40      0.62 .015    .436
## 6           `7` 1, 40      0.08 .002    .779
## 7           `8` 1, 40   7.64 ** .160    .009
## 8           `9` 1, 40   8.30 ** .172    .006
## 9          `10` 1, 40      2.24 .053    .142
## 10         `11` 1, 40      2.46 .058    .125
## 11         `12` 1, 40      0.08 .002    .784
## 12         `13` 1, 40      0.12 .003    .730
## 13         `14` 1, 40      0.87 .021    .356
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '+' 0.1 ' ' 1
#without huge outliers
model = aov_car(fmla, data= na.omit(df_out), factorize = F, anova_table = list(correction = "GG", es = "pes"))
nice(model, MSE=F); #emmeans::ref_grid(model) #triple check everything is centered at 0
## Anova Table (Type 3 tests)
## 
## Response: 1
##          Effect    df         F   pes p.value
## 1  intervention 1, 38 16.82 ***  .307   <.001
## 2           `3` 1, 38      0.04  .001    .839
## 3           `4` 1, 38    3.01 +  .073    .091
## 4           `5` 1, 38      0.01 <.001    .914
## 5           `6` 1, 38      2.63  .065    .113
## 6           `7` 1, 38      0.04  .001    .837
## 7           `8` 1, 38      0.12  .003    .732
## 8           `9` 1, 38      0.61  .016    .438
## 9          `10` 1, 38      0.02 <.001    .900
## 10         `11` 1, 38      1.35  .034    .253
## 11         `12` 1, 38      0.08  .002    .773
## 12         `13` 1, 38      0.50  .013    .483
## 13         `14` 1, 38      0.60  .016    .442
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '+' 0.1 ' ' 1
devtools::source_gist("383aa93ffa161665c0dca1103ef73d9d", filename = "effect_CI.R") # to download  pes_ci # function to calculate CI around partial eta squared

pes_ci(fmla, data= na.omit(df_out), factorize = F) # default CI is 90% -> here is why https://daniellakens.blogspot.com/2014/06/calculating-confidence-intervals-for.html
##              Partial eta-squared 90 % CI lower limit 90 % CI upper limit
## intervention        0.3067634438           0.1132256          0.46473856
## `3`                 0.0010969704           0.0000000          0.05592432
## `4`                 0.0734635701           0.0000000          0.22436663
## `5`                 0.0003095440           0.0000000          0.02639789
## `6`                 0.0647322279           0.0000000          0.21238097
## `7`                 0.0011270533           0.0000000          0.05654807
## `8`                 0.0031194365           0.0000000          0.08035206
## `9`                 0.0159248801           0.0000000          0.12804570
## `10`                0.0004191403           0.0000000          0.03357207
## `11`                0.0342722172           0.0000000          0.16511509
## `12`                0.0022140274           0.0000000          0.07220268
## `13`                0.0130401838           0.0000000          0.12064843
## `14`                0.0156121645           0.0000000          0.12727930

Plot interaction

#this is just stuff to make the formatting nice
averaged_theme <- theme_bw(base_size = 24, base_family = "Helvetica")+ #font stuff
    theme(strip.text.x = element_text(size = 24, face = "bold"),
          strip.background = element_rect(color="white", fill="white", linetype="solid"),
          legend.position=c(.9,.9),
          legend.title  = element_text(size = 12),
          legend.text  = element_text(size = 10),
          legend.key.size = unit(0.2, "cm"),
          legend.key = element_rect(fill = "transparent", colour = "transparent"),
          panel.grid.major.x = element_blank() ,
          panel.grid.major.y = element_line(size=.2, color="lightgrey") ,
          panel.grid.minor = element_blank(),
          axis.title.x = element_text(size = 22),
          axis.title.y = element_text(size = 22),
          axis.line = element_line(size = 0.5),
          panel.border = element_blank())
  
pal = viridis::viridis(n=3) # color palette specially conceived for colorblindness (just input the number of categories you want; here I took 3 instead of 2 to have more room)
  

# AVERAGED EFFECT 
dfH <- summarySEwithin(df,
                       measurevar = "1",
                       betweenvars = "intervention",
                       withinvars = "session", 
                       idvar = "id", na.rm = T) #get standard error adjusted for within design

#this is to have more control over my jittering but it's really not necessary
dfH$cond <- ifelse(dfH$session == "ses1", -0.25, 0.25) # means and se
df$cond <- ifelse(df$session == "ses1", -0.25, 0.25) # individual points
#(what I do is transform my categorical data into dummy numerical data so I can "jitter" it)
set.seed(666)#setting "random seed" so the random jitter will always be the same 
df <- df %>% mutate(condjit = jitter(as.numeric(cond), 0.3),
                                  grouping = interaction(id, cond))

labels = c("0"="Placebo", "1"="Treatment") # labels for top strip

plt <- ggplot(df, aes(x = cond, y = `1`, fill = session, color = session)) +
  geom_point(data = dfH, alpha = 0.5) + # means
  geom_line(aes(x = condjit, group = id, y = `1`), alpha = .3, size = 0.5, color = 'gray') + # lines that connect individuals across time
  geom_flat_violin(scale = "count", trim = FALSE, alpha = .2, aes(fill = session, color = NA))+
  geom_point(aes(x = condjit), alpha = .3) + # individual points
  geom_crossbar(data = dfH, aes(y = `1`, ymin = `1`-ci, ymax = `1`+ci), width = 0.2 , alpha = 0.1)+ #error bars (+- CI adjusted for the within design)
  ylab('Var 1') +
  xlab('') +
  #scale_y_continuous(expand = c(0, 0), breaks = c(seq.int(0,100, by = 20)), limits = c(-0.5,100.5)) + # choose limits
  scale_x_continuous(labels=c("Pre", "Post"), breaks = c(-.25,.25), limits = c(-.5,.5)) + # rename levels of categorical variable
  scale_fill_manual(values=c("ses1"= pal[1], "ses2"=pal[2]), guide = 'none') + # assign color from palette
  scale_color_manual(values=c("ses1"=pal[1], "ses2"=pal[2]), guide = 'none') + # assign color from palette
  theme_bw() + facet_wrap(~intervention, labeller=labeller(intervention =labels)) + #split plot into two groups for more readability 
 averaged_theme ; plt # add theme adjustments

#if you want to save the figures
pdf('figures/inter.pdf')
print(plt)
dev.off()
## png 
##   2



4) Feature selection (optional but at least try to think about it)

Let’s say you want to test for the significance of the intervention variable (placebo VS treatment) on var1. You also want to control for baseline differences between groups (session pre-post) and other nuisance covariates (e.g. age, weight, number of cigarettes smoked per week, etc.). Here you have many “nuisance” variables (var2 all the way to var14) and not that many observations…

One thing you could do is enter ALL of them into an ANCOVA, but that’s a bit dubious because you are artificially decreasing your degrees of freedom (and increasing the chances of multi-collinearity). We are not going to go into too much detail about that right here, but try to think about why we generally wouldn’t want that (a quick way to at least eyeball the redundancy between covariates is sketched below).
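
As a quick illustration of the multi-collinearity point (my addition, assuming you just want a first look at how redundant the difference scores computed above are): caret::findCorrelation flags the columns whose pairwise correlations exceed a cutoff.

cor_mat = cor(df_out[diff_names], use = "pairwise.complete.obs") #correlation matrix of the candidate covariates
caret::findCorrelation(cor_mat, cutoff = 0.7, names = TRUE) #columns you could consider dropping (the 0.7 cutoff is arbitrary)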

Another thing one might do is to enter all these variables, then remove the ones that are not “statistically significant”, and rerun the analysis with only the ones left. Again, that is definitely not the best way to go, because you are doing multiple tests (and also variables that were not significant before might become significant once you have removed others).

So one way to circumvent these issues is to first implement a proper variable selection method (not based on p-values). You can achieve this via several methods. One way could be to fit different models (with and without certain variables) and compare them with a model selection criterion (AIC, BIC, SIC, loo, R², etc.) to only keep the “best” model, as sketched below.
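
A minimal sketch of that idea, assuming (for illustration only) two candidate logistic models predicting the group from a few of the difference scores used above:

m_small = glm(intervention ~ `1`, data = df_out[-1], family = binomial) #model with one predictor
m_big = glm(intervention ~ `1` + `3` + `14`, data = df_out[-1], family = binomial) #model with a few more predictors
AIC(m_small, m_big) #lower AIC = better trade-off between fit and complexity
BIC(m_small, m_big) #same idea but with a heavier penalty on the number of parameters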



Alternatively, you could also select only the variables that explain a certain threshold of variance (i.e. how useful is this variable to predict an outcome, here “is the individual in the placebo or treatment group?”). Finally, you could also create composite variables (components) that reflect most of the variance of those variables, as sketched below (but try to think about what the drawback of this last method is if you use it).
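
And a minimal sketch of the composite-variable idea with principal components (again only an illustration, not the expected answer): the first few components summarize most of the shared variance, but think about what you lose in interpretability.

pca = prcomp(na.omit(df_out[diff_names]), center = TRUE, scale. = TRUE) #principal component analysis on the difference scores
summary(pca) #proportion of variance explained by each component
head(pca$x[, 1:2]) #the first two component scores, which you could use as composite covariates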

This is a rather complex and technical exercise (as well as time- and computationally-consuming) but I think it’s definitely essential for scientists to know how to select variables (features) from a dataset.

PS: No need to actually run the ANCOVA, the purpose of this exercise is really about which variables you would retain in the final model, and there is definitely no “right or wrong” answer.



A bit of machine learning

Using penalized (lasso) maximum likelihood for variable selection (cv.glmnet), i.e. finding the lambda (regularization parameter) that minimizes the cross-validated error, to infer the best number of features to use in order to simplify the model and avoid overfitting.
Using that because:
- It handles the problem of correlated inputs
- It performs better than stepwise selection

#1) with Recursive Feature Elimination (CARET)
control <- rfeControl(functions=rfFuncs, method="cv", number=10) # control options

rfeResults = rfe(x = df_out[c(-1, -2)], y = df_out$intervention, sizes=1:14, rfeControl=control)
predictors(rfeResults)
## [1] "14" "1"  "3"  "6"
plot(rfeResults, type=c("g", "o")) # look for the "elbow"

#2) with Stepwise Feature Selection
full.model = glm(intervention ~., data = df_out[-1], family = binomial)
step.model <- full.model %>% MASS::stepAIC(trace = FALSE) #Choose a model by AIC in a Stepwise Algorithm
coef(step.model)
## (Intercept)         `1`        `10`        `14` 
##   -65.63880  -285.86268    50.34475   244.99213
#3) with lasso regularization

set.seed(123);
mat<-model.matrix(intervention~.,data=df_out[-1]); mat=mat[,-1] # remove intercept
foldid <- sample(rep(seq(3), length.out = nrow(df_out[-1]))) # to be reproducible
fit<-glmnet::cv.glmnet(x=mat,y=df_out[-1]$intervention,family ="binomial",type.measure='mse',alpha=1) #, foldid = foldid, nfolds = 3, standardize = FALSE)
plot(fit)

c<-coef(fit,s='lambda.min',exact=TRUE); inds<-which(c!=0) #keep only the variables with non-zero coefficients at lambda.min
variables<-row.names(c)[inds]; variables<-variables[!variables %in% '(Intercept)']; variables #extract those variables
## [1] "`1`"  "`3`"  "`10`" "`14`"
df_diff_plot <- pivot_re(df_diff, c("1", "3", "10", "14"))

#plot
GGally::ggpairs(df_diff[, c("intervention", "1", "3", "10", "14")], aes(colour=df_diff$intervention, alpha=0.4))

5) Preparing behavioral data for fMRI analysis

Now we are entering the realm of task-based functional MRI. We will definitely go into more detail in a future workshop, but let’s just go through a bit of data wrangling that anyone confronted with fMRI will have to go through.

Let’s start by loading the events associated with an fMRI acquisition. Here (in data/sub-control100/data.mat) is just an example of a typical file from which you will have to extract: the onsets (AKA when the stuff you showed to your participant appeared), the durations (how long it appeared) and the behavioral data associated with your task (also generally called modulators, e.g. reaction times or liking ratings for each condition). You WILL need these regressors to model your fMRI data later on.

The thing is.. these files are generally (at least here in Geneva) generated for each participant in a .mat format. That’s why I gave it to you here, like you will probably get it in your lab too. If you don’t feel concerned by that because your lab has moved on to Python as its presentation software for fMRI, good for you, and you may skip this. Otherwise this task might be quite instructive for you. So let’s dive in.

These files contain a matlab “structure array”, which simply put is “a data type that groups related data using data containers called ‘fields’. Each field can contain any type of data”. This might be confusing at first (trust me .. it is) so let’s go step by step. This is a really simple design where we have only 3 odor conditions and the “start” of each trial.

For Matlab/Octave

load('data/sub-control100/data.mat')  %loading data

disp('Each structure has two fields named:'); disp(fieldnames(durations2))

%to access the data in a field you must use dot notation (e.g. structName.fieldName)

disp(['The "start" field contains ', num2str(length(durations2.start)), ' observattions'])
disp('The "odor" field has 3 nested substructures named'); disp(fieldnames(durations2.odor))
disp(['Each of the 3 nested substructure in the "odor" field contains ', num2str(length(durations2.odor.reward)), ' observattions'])
Each structure has two fields named:
{
  [1,1] = odor
  [2,1] = start
}
The "start" field contains 54 observattions
The "odor" field has 3 nested substructures named
{
  [1,1] = reward
  [2,1] = neutral
  [3,1] = control
}
Each of the 3 nested substructures in the "odor" field contains 18 observations

Now, what we would like for most MRI software is a “3-column timing file” with, respectively: the onset time (in seconds); the duration (in seconds); the relative magnitude of each stimulus (set to 1 for none). I put examples of what it should look like in the /data folder.
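
Just to make that format concrete, such a 3-column timing file is plain text that looks something like this (made-up numbers, whitespace separated: onset, duration, magnitude):

12.50   2.00   1
45.75   2.00   0.68
80.25   2.00   1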

So basically the task will be to recreate that type of file for sub-01 and to make your code easily scale to more subjects (here I just duplicated sub-01 into more subjects to test your code).

Of course you are not forced to use matlab/octave here to achieve that.

For python:

import scipy.io
mat = scipy.io.loadmat('data/sub-control100/data.mat')
dict.keys(mat) #it's read as a dictionary so this is gonna get ugly but stay with me
#mat["modulators2"]["start"][0] #this gets you an array within an array, not good enough!
## dict_keys(['__header__', '__version__', '__globals__', 'durations2', 'modulators2', 'onsets2'])
mat["modulators2"]["start"][0][0][0:2] #this actually get you the array (but remove the [0:2])
#mat["modulators2"]["odor"]["control"][0][0] #this doesn't work!
## array([[1.],
##        [1.]])
mat["modulators2"]["odor"][0][0]["control"][0][0][0:2] #you have to get you dictionnary from the array first ! (but remove the [0:2])
## array([[67.79661017],
##        [50.        ]])



Optional:

Try to create a dataset that appends each subject’s behavioral data together, with: a column with their ID (the numbers following control or treatment), a column with their group (control or treatment), a column with the condition name (reward, control or neutral), a column with their behavioral data (from the modulators2.odor field) and a column with the trial number (1-54). Take extra care to put the behavioral data back in the right order (you can get that by sorting them by their onsets!). Hang in there..

For R:

subject = dir("data", pattern="sub-*", all.files=T, full.names=T) #find data via pattern

data = tibble() #initialize

#go through subjects
for (s in subject) { 
  
  mat <- R.matlab::readMat(paste(s, "/data.mat", sep="")) #read mat file
  
  #start
  onset = mat$onsets2[[2]]
  duration = mat$durations2[[2]]
  modulator = mat$modulators2[[1]] #watchout it's reversed here !
  start = cbind(onset, duration, modulator) #bind the columns together
  write.table(start, paste(s, "/start.txt", sep=""), row.names = F, sep="\t", col.names = F) # write to txt (or csv or tsv)
  
  
  #odors
  ##reward
  onset =  mat$onsets2[[1]][[1]]
  duration = mat$durations2[[1]][[1]]
  modulator = as_tibble(mat$modulators2[[2]][[1]]) #watchout it's reversed here !
  odor_rew = cbind(onset, duration, modulator)
  write.table(odor_rew, paste(s, "/odor_rew.txt", sep=""), row.names = F, sep="\t", col.names = F)
  
  modulator$condition = "reward"; mod_rew = cbind(modulator, onset); #bind all vertically
  
  ##neutral
  onset =  mat$onsets2[[1]][[2]]
  duration = mat$durations2[[1]][[2]]
  modulator = as_tibble(mat$modulators2[[2]][[2]]) #watchout it's reversed here !
  odor_neu = cbind(onset, duration, modulator)
  write.table(odor_neu, paste(s, "/odor_neu.txt", sep=""), row.names = F, sep="\t", col.names = F)
  
  modulator$condition = "neutral"; mod_neu = cbind(modulator, onset); #bind all vertically
  
  ##control
  onset =  mat$onsets2[[1]][[3]]
  duration = mat$durations2[[1]][[3]]
  modulator = as_tibble(mat$modulators2[[2]][[3]]) #watchout it's reversed here !
  odor_con = cbind(onset, duration, modulator)
  write.table(odor_con, paste(s, "/odor_con.txt", sep=""), row.names = F, sep="\t", col.names = F)
  
  modulator$condition = "control"; mod_con = cbind(modulator, onset); #bind all vertically
  
  #behavioral
  df = rbind (mod_rew, mod_neu, mod_con); #stack them 
  df = df[order(df$onset),] #order by onset
  df$trial = 1:54 #trial number vector
  df$id = str_sub(s,-3,-1) #extract the last 3 characters of the string (i.e. the subject ID)
  df$group = str_sub(s,-10,-4)  #extract characters -10 to -4 of the string (i.e. the group)
  
  data = rbind(data,df) #stack them
}

6) The end



There you are, you finished this list of exercises. Good job! You deserve a pat on the shoulder and a good cup (jar?) of coffee.

I hope you learned things that will be useful for your research/career, and let me know what you liked/disliked.