Jeff Gui

Biomedical Informatics, Undergraduate
Zhejiang University-University of Edinburgh Institute

Data science review

A brief review of the material taught in the data science course.

ADS2 Review

Methods

# Pairwise absolute differences, split into within-group and between-group.
#
# x: data frame (or matrix) whose first n columns are grouping variables and
#    whose last column holds the numeric value.
# Returns a list with two numeric vectors:
#   same_group_diff    - |value_i - value_j| for every row pair (i < j) that
#                        agrees on all grouping columns
#   between_group_diff - the same quantity for all remaining pairs
# Pairs are ordered (1,2), (1,3), ..., (2,3), ... An input with fewer than two
# rows, or without pairs of one kind, yields an empty numeric vector.
group_dif <- function(x) {
  n_rows <- nrow(x)
  n_groups <- ncol(x) - 1 # every column except the last is a grouping column
  if (n_rows < 2) {
    # No pairs to compare; `1:(n_rows - 1)` would count downwards here.
    return(list("same_group_diff" = numeric(0),
                "between_group_diff" = numeric(0)))
  }

  # Enumerate every unordered row pair (i < j) without a nested loop.
  # Row i is paired with the n_rows - i rows that follow it.
  per_row <- rev(seq_len(n_rows - 1))
  first <- rep(seq_len(n_rows - 1), times = per_row)
  second <- first + sequence(per_row)

  values <- x[, n_groups + 1]
  diffs <- abs(values[first] - values[second])

  # A pair counts as "same group" only if it matches on all grouping columns.
  same <- rep(TRUE, length(first))
  for (k in seq_len(n_groups)) {
    labels <- x[, k]
    same <- same & (labels[first] == labels[second])
  }
  same <- !is.na(same) & same # a missing label never counts as a match

  list("same_group_diff" = diffs[same],
       "between_group_diff" = diffs[!same])
}

# Bootstrap the standard error and the mean of a sample and regress one on
# the other (t-test assumption check: the mean and its standard error should
# be independent).
#
# x: numeric vector of observations
# n: number of bootstrap replicates (at least 1)
# Returns a list with
#   sampling_errors - standard error, sd / sqrt(length), of each resample
#   sampling_means  - mean of each resample
#   lmfit           - lm fit of sampling_errors ~ sampling_means
sample_Se_Mean <- function(x, n) {
  # `1:n` would silently run two replicates for n = 0, so reject it up front.
  stopifnot(is.numeric(n), length(n) == 1, n >= 1)

  # One resample per replicate; both statistics come from the same resample.
  draws <- vapply(seq_len(n), function(b) {
    resample <- sample(x, size = length(x), replace = TRUE)
    c(sd(resample) / sqrt(length(resample)), mean(resample))
  }, numeric(2))

  # Keep these names: they become the term names of the fitted model.
  sampling_errors <- draws[1, ]
  sampling_means <- draws[2, ]
  lmfit <- lm(sampling_errors ~ sampling_means)

  list("sampling_errors" = sampling_errors,
       "sampling_means" = sampling_means,
       "lmfit" = lmfit)
}

# Bootstrap confidence interval for a binary proportion (case resampling).
#
# x: count of category 1 (e.g. infected)
# y: total count
# n: number of bootstrap replicates (at least 1)
# Returns a list with the 2.5% and 97.5% quantiles of the resampled
# proportion of category 1 (resampled count divided by y), as low_CI and
# up_CI.
bt_bin_CI <- function(x, y, n) {
  # `1:n` with n = 0 would leave a stale 0 among the resampled counts.
  stopifnot(is.numeric(n), length(n) == 1, n >= 1)

  # Observed sample: x members of category 1 followed by y - x of category 2.
  is_cat1 <- rep(c(TRUE, FALSE), times = c(x, y - x))

  # Each replicate resamples y cases and counts the category-1 members.
  counts <- vapply(seq_len(n), function(b) {
    sum(sample(is_cat1, y, replace = TRUE))
  }, integer(1))

  lower_ci <- quantile(counts, 0.025) / y
  upper_ci <- quantile(counts, 0.975) / y
  list("low_CI" = lower_ci, "up_CI" = upper_ci)
}
#bt_bin_CI(18,80796,100)

# Bootstrap confidence interval of the mean for each group (case resampling).
#
# x: data frame with the group label in column 1 and the numeric value in
#    column 2
# n: number of bootstrap replicates (at least 1)
# Returns a list named by group; each element is a list holding the 2.5% and
# 97.5% quantiles of the bootstrapped group mean (low_CI, up_CI).
bt_num_CI <- function(x, n) {
  stopifnot(is.numeric(n), length(n) == 1, n >= 1)

  # Resample within each group. Drawing from the pooled value column would
  # give every group an interval around the overall mean instead of its own.
  by_group <- split(x[, 2], x[, 1], drop = TRUE)

  lapply(by_group, function(values) {
    size <- length(values)
    means <- vapply(seq_len(n), function(b) {
      # Draw by index: sample() on a single number k would sample from 1:k.
      mean(values[sample.int(size, size, replace = TRUE)])
    }, numeric(1))
    list("low_CI" = quantile(means, 0.025),
         "up_CI" = quantile(means, 0.975))
  })
}
#bt_num_CI(data.frame("Country"=c(rep("A",100),rep("B",100)),"Value"=rnorm(200)),100)

Collection: Hypothesis testing assumptions

T test (problem set 9)

Independent random sampling on continuous variable.
Normality of the sampling distribution.
Independence of mean and variation (standard error).

# Load the barley measurements as a numeric vector (scan() reads numbers by
# default).
# NOTE(review): the path is absolute and specific to the author's machine;
# point it at your own copy of barley.txt before re-running.
data = scan("/Users/jefft/Library/Mobile Documents/com~apple~CloudDocs/Year 2/ADS/Week 9 T-test/barley.txt")

# Independent random sampling on continuous variable.
# Print the first few values to confirm the measurements are continuous.
head(data)

## [1] 41.03 45.99 50.01 51.44 49.53 47.47

# Normality of the sampling distribution.
# Bootstrap the mean: draw 100 resamples of the data (same size, with
# replacement) and record the mean of each one.
sampling_means<-vector() 
for (replicate in 1:100){
  barley_sample = sample(data, size = length(data), replace = TRUE)
  sampling_means = c(sampling_means, mean(barley_sample))
}
# Visual check: histogram of the bootstrapped means.
hist(sampling_means, xlab = "Sample means", main = "")

# Formal check: Shapiro-Wilk normality test on the bootstrapped means.
shapiro.test(sampling_means)

## 
##  Shapiro-Wilk normality test
## 
## data:  sampling_means
## W = 0.99381, p-value = 0.9317

# Independence of mean and variation (standard error).
# For each of 100 bootstrap resamples, record both the mean and the standard
# error (sd / sqrt(n)) so the two can be plotted against each other.
sampling_errors = vector() 
sampling_means = vector()
for (replicate in 1:100){
  barley_sample = sample(data, size = length(data), replace = TRUE) 
  standard_error = sd(barley_sample)/sqrt(length(barley_sample)) 
  sampling_errors = c(sampling_errors, standard_error) 
  sampling_means = c(sampling_means, mean(barley_sample))
}
# Scatter plot of standard error against mean, with the fitted regression
# line drawn in red on top of it.
plot(sampling_means, sampling_errors, xlab = "Sample mean", ylab = "Standard error") 
lmfit = lm(sampling_errors~sampling_means)
abline(lmfit, col = 'red')

Chi square test

Discrete, categorical data.
Every expected cell frequency >= 1.
Fewer than 20% of cells have an expected count < 5.

# One way sample test
# Goodness of fit: compare the observed counts (84, 82, 34) with the
# expected proportions supplied in p.
chisq.test(c(84,82,34),p=c(0.45,0.43,0.12))

## 
##  Chi-squared test for given probabilities
## 
## data:  c(84, 82, 34)
## X-squared = 4.7527, df = 2, p-value = 0.09289

# Test on 3 way data (test of homogeneity)
# Build a 2 x 2 x 2 table of counts. array() fills the first dimension
# fastest, so the values run status (Alive, Dead) within sex (Male, Female)
# within Genotype (WT, KO).
ThreeWayData = array(data=c(40,9,34,7,20,15,25,20), dim = c(2,2,2), 
      dimnames =
      list("status"=c("Alive","Dead"),"sex"=c("Male","Female"),
           "Genotype"=c("WT","KO")))
# Print the table to check the layout.
ThreeWayData

## , , Genotype = WT
## 
##        sex
## status  Male Female
##   Alive   40     34
##   Dead     9      7
## 
## , , Genotype = KO
## 
##        sex
## status  Male Female
##   Alive   20     25
##   Dead    15     20

# summary() of a table reports a chi-squared test for independence of all
# factors.
summary(as.table(ThreeWayData))

## Number of cases in table: 170 
## Number of factors: 3 
## Test for independence of all factors:
##  Chisq = 15.765, df = 4, p-value = 0.003351

Fisher’s exact test

Discrete, categorical count data
Small data.

# 2 x 2 table of counts: rows are the phenotype (Alive/Dead), columns the
# genotype (WT/KO).
KO = data.frame("WT"=c(7,3),"KO"=c(2,7),row.names = c("Alive","Dead"))
# Reshape to long format (one row per cell) for plotting.
KO_plot = as.data.frame(as.table(as.matrix(KO)))
names(KO_plot) = c("Phenotype","Genotype","Freq")
# Bubble plot: point size and colour both encode the cell count.
# NOTE(review): requires ggplot2 to be attached; no library(ggplot2) call is
# visible in this section.
ggplot(data=KO_plot, aes(x=Genotype,y=Phenotype)) +
  geom_point(aes(size=Freq,color=Freq))

# Fisher's exact test on the 2 x 2 table.
fisher.test(KO)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  KO
## p-value = 0.06978
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##    0.7520079 113.4668907
## sample estimates:
## odds ratio 
##   7.166282

Correlation

Continuous data.
Linearity
Absence of outliers.
Normal distribution of the data.

# Continuous data
# NOTE(review): from here on `data` is used as a data frame with Height and
# Weight columns; the step that loads it is not shown, and earlier in the
# page `data` held the barley vector.
summary(data)

##      Height          Weight     
##  Min.   :63.43   Min.   : 97.9  
##  1st Qu.:66.52   1st Qu.:119.9  
##  Median :67.94   Median :127.9  
##  Mean   :67.95   Mean   :127.2  
##  3rd Qu.:69.20   3rd Qu.:136.1  
##  Max.   :73.90   Max.   :159.0

  # "A quick summary shows that both variables are continuous and numerical."

# Linearity
# Scatter plot of weight against height.
plot(data$Height, data$Weight, xlab = "Height", ylab = "Weight")

  # Judge linearity by looking at the plot.

# Absence of outliers
  # Judged from the plot: row 200 was identified as an outlier.
# Drop row 200 by position.
data = data[-200,]

# Normal distribution of the data.
# Shapiro-Wilk normality test on Height.
shapiro.test(data$Height)

## 
##  Shapiro-Wilk normality test
## 
## data:  data$Height
## W = 0.99415, p-value = 0.627

# Shapiro-Wilk normality test on Weight.
shapiro.test(data$Weight)

## 
##  Shapiro-Wilk normality test
## 
## data:  data$Weight
## W = 0.99398, p-value = 0.6013

# Test
# Correlation test between height and weight (cor.test defaults to Pearson).
cor.test(data$Height, data$Weight)

## 
##  Pearson's product-moment correlation
## 
## data:  data$Height and data$Weight
## t = 9.508, df = 197, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4574401 0.6492864
## sample estimates:
##      cor 
## 0.560846

Linear Regression (problem set 29)

Continuous data.
Linearity.
Absence of outliers.
Normal distribution of the data.
Equality of variance.

# Continuous data. (same)
# Linearity. (same)
# Absence of outliers. (same)
# Normality distribution of data. (same)

# Equality of variance
# Plot the residuals of the weight ~ height fit against height; the vertical
# spread should look roughly constant across the range of heights.
fit = lm(data$Weight~data$Height)
plot(data$Height, resid(fit), xlab = "Height", ylab = "Residuals")
# NOTE(review): least-squares residuals are uncorrelated with the predictor
# by construction, so this fitted line is flat at zero for any data. It
# serves as a reference line, not as evidence of equal variance.
fit2 = lm(resid(fit)~data$Height)
abline(fit2, col = 'red', lwd = 3)

# Test
# Refit the same model (identical to `fit` above) and draw the regression
# line over the scatter plot.
fit<-lm(data$Weight~data$Height)
plot(data$Height, data$Weight, xlab = "Height", ylab = "Weight")
abline(fit, col = 'firebrick3', lwd = 3)

ANOVA (coding challenge 1)

Independent random sampling on continuous variable.
Normality of the sampling distribution.
Homoscedasticity (equality of variances).

# Additive two-factor ANOVA: Time explained by Gender and Origin, with no
# interaction term.
# NOTE(review): `marathon` is not defined in the visible code; presumably a
# data frame loaded earlier.
model <- aov(Time ~ Gender+Origin, data=marathon)

# Independent random sampling on continuous variable.
  # Guaranteed.

# Normality of the sampling distribution.
# Histogram of the model residuals.
hist(resid(model))

# Homoscedasticity (equality of variances).
# Diagnostic plot 1 of the fitted model: residuals against fitted values.
plot(model,1)

# Test
# ANOVA table with F tests for Gender and Origin.
summary(model)

##              Df Sum Sq Mean Sq F value  Pr(>F)   
## Gender        1   5.69   5.689   7.681 0.00621 **
## Origin        1   2.82   2.821   3.809 0.05265 . 
## Residuals   167 123.70   0.741                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2020 5

2020

Journal club - 2D or 3D culture?

10 minute read

Chromosome missegregation and the consequent aneuploidy are lethal to normal cells; paradoxically, however, aneuploidy enhances the fitness of cancer cells.

Journal club - Permutations of Immunoblotting

10 minute read

Sister chromatid separation is triggered by cohesin cleavage mediated by separase. Therefore, it is important to keep separase in check. There are two previo...

Journal club - Modeling the cell cycle – Just Timing

11 minute read

The cell cycle is an orchestration of colossal molecules in the elaborate cell machine. Analogous to the central dogma, genetic code to protein expression and ...

Markdown demo

2 minute read

This is a collection of basic Markdown syntax, adapted from the Markdown Guide.