Data science review

A brief review of the material taught in the data science course.

ADS2 Review

Methods

# Pairwise absolute differences within and between groups.
#
# Args:
#   x: data frame (or matrix) whose first n columns are grouping variables and
#      whose last column holds the numeric value.
#
# Returns:
#   A list with two numeric vectors, both in row-pair order
#   (1,2), (1,3), ..., (2,3), ...:
#     same_group_diff    - |value_i - value_j| for pairs of rows that agree on
#                          every grouping column
#     between_group_diff - the same quantity for all remaining pairs
#   A vector is zero-length when no pair falls in its category (this includes
#   inputs with fewer than two rows).
group_dif <- function(x) {
  x <- as.data.frame(x)
  n_group <- ncol(x) - 1L # number of grouping columns
  if (n_group < 1L) {
    stop("x must have at least one group column followed by a value column",
         call. = FALSE)
  }
  if (nrow(x) < 2L) {
    # No pair of rows exists, so there is nothing to compare.
    return(list("same_group_diff" = numeric(0),
                "between_group_diff" = numeric(0)))
  }

  # All unordered row pairs (i < j), generated in the same order as a nested
  # i/j loop would visit them.
  pairs <- utils::combn(nrow(x), 2L)
  first <- pairs[1L, ]
  second <- pairs[2L, ]

  value <- x[[n_group + 1L]]
  diffs <- abs(value[first] - value[second])

  # Only pairs matching on ALL grouping columns count as "same group".
  same <- rep(TRUE, length(diffs))
  for (g in seq_len(n_group)) {
    column <- x[[g]]
    same <- same & (column[first] == column[second])
  }
  # A missing group label cannot be shown to match, so treat it as different.
  same[is.na(same)] <- FALSE

  list("same_group_diff" = diffs[same],
       "between_group_diff" = diffs[!same])
}

# Bootstrap standard errors and means (for checking the t test assumption that
# the mean and its standard error are independent).
#
# Args:
#   x: vector of observations.
#   n: number of bootstrap replicates (a single number >= 1).
#
# Returns:
#   A list with
#     sampling_errors - standard error (sd / sqrt(size)) of each resample
#     sampling_means  - mean of each resample
#     lmfit           - lm fit of sampling_errors on sampling_means
sample_Se_Mean <- function(x, n) {
  if (!is.numeric(n) || length(n) != 1L || is.na(n) || n < 1) {
    stop("n must be a single number >= 1", call. = FALSE)
  }
  size <- length(x)
  # Preallocate instead of growing the vectors on every replicate.
  sampling_errors <- numeric(n)
  sampling_means <- numeric(n)
  for (b in seq_len(n)) {
    bs_sample <- sample(x, size = size, replace = TRUE)
    sampling_errors[b] <- sd(bs_sample) / sqrt(length(bs_sample))
    sampling_means[b] <- mean(bs_sample)
  }
  # A slope near zero suggests the standard error does not depend on the mean.
  lmfit <- lm(sampling_errors ~ sampling_means)
  list("sampling_errors" = sampling_errors,
       "sampling_means" = sampling_means,
       "lmfit" = lmfit)
}

# Bootstrap confidence interval for a proportion from binary count data.
#
# Args:
#   x: count of category 1 (e.g. infected), with 0 <= x <= y.
#   y: total count (>= 1).
#   n: number of bootstrap replicates (>= 1).
#
# Method: case resampling. Each replicate draws y cases with replacement from
# the observed sample and counts how many fall in category 1.
#
# Returns:
#   A list with low_CI and up_CI: the 2.5% and 97.5% quantiles of the
#   bootstrapped counts, divided by y (i.e. expressed as proportions).
bt_bin_CI <- function(x, y, n) {
  if (!is.numeric(n) || length(n) != 1L || is.na(n) || n < 1) {
    stop("n must be a single number >= 1", call. = FALSE)
  }
  if (length(x) != 1L || length(y) != 1L || is.na(x) || is.na(y) ||
      y < 1 || x < 0 || x > y) {
    stop("x and y must be single counts with 0 <= x <= y and y >= 1",
         call. = FALSE)
  }
  bt <- integer(n)
  for (b in seq_len(n)) {
    # Resample case indices; indices 1..x stand for the category-1 cases, so
    # there is no need to materialise and string-compare a length-y vector.
    bt[b] <- sum(sample.int(y, y, replace = TRUE) <= x)
  }
  lower_ci <- quantile(bt, 0.025) / y
  upper_ci <- quantile(bt, 0.975) / y
  list("low_CI" = lower_ci, "up_CI" = upper_ci)
}
#bt_bin_CI(18,80796,100)

# Bootstrap confidence intervals of the mean for numeric data, per group.
#
# Args:
#   x: data frame with the group label in column 1 and the numeric value in
#      column 2.
#   n: number of bootstrap replicates (>= 1).
#
# Method: case resampling. For each group, its own observations are resampled
# with replacement (keeping the group size) and the mean is recorded.
#
# Returns:
#   A list named by group; each element is a list with low_CI and up_CI, the
#   2.5% and 97.5% quantiles of that group's bootstrapped means.
bt_num_CI <- function(x, n) {
  if (!is.numeric(n) || length(n) != 1L || is.na(n) || n < 1) {
    stop("n must be a single number >= 1", call. = FALSE)
  }
  # One vector of values per observed group, in factor-level order.
  groups <- split(x[, 2], as.factor(x[, 1]), drop = TRUE)
  lapply(groups, function(values) {
    size <- length(values)
    means <- vapply(seq_len(n), function(i) {
      # Resample by index: sample() on a single number k would draw from 1:k,
      # which would be wrong for a group with one observation.
      mean(values[sample.int(size, size, replace = TRUE)])
    }, numeric(1))
    list("low_CI" = quantile(means, 0.025),
         "up_CI" = quantile(means, 0.975))
  })
}
#bt_num_CI(data.frame("Country"=c(rep("A",100),rep("B",100)),"Value"=rnorm(200)),100)

Collection: Hypothesis testing assumptions

T test (problem set 9)

  • Independent random sampling on continuous variable.
  • Normality of the sampling distribution.
  • Independence of mean and variation (standard error).
# Worked example: checking the three t test assumptions on the barley data.
# scan() reads the file into a plain vector.
# NOTE(review): the path is absolute and machine-specific, so this chunk only
# runs on the author's computer. No set.seed() is visible here either, so the
# recorded output below will not reproduce exactly.
data = scan("/Users/jefft/Library/Mobile Documents/com~apple~CloudDocs/Year 2/ADS/Week 9 T-test/barley.txt")

# Independent random sampling on continuous variable.
# Inspect the first values to see what kind of measurements they are.
head(data)
## [1] 41.03 45.99 50.01 51.44 49.53 47.47
# Normality of the sampling distribution.
# Bootstrap: resample the data with replacement 100 times, record the mean of
# each resample, then inspect the distribution of those means.
sampling_means<-vector() 
for (replicate in 1:100){
  barley_sample = sample(data, size = length(data), replace = TRUE)
  sampling_means = c(sampling_means, mean(barley_sample))
}
hist(sampling_means, xlab = "Sample means", main = "")

# Shapiro-Wilk test on the bootstrapped means (null hypothesis: normality).
shapiro.test(sampling_means)
## 
##  Shapiro-Wilk normality test
## 
## data:  sampling_means
## W = 0.99381, p-value = 0.9317
# Independence of mean and variation (standard error).
# Bootstrap again, this time recording both the standard error and the mean of
# each resample (the same computation as sample_Se_Mean above, with n = 100).
sampling_errors = vector() 
sampling_means = vector()
for (replicate in 1:100){
  barley_sample = sample(data, size = length(data), replace = TRUE) 
  standard_error = sd(barley_sample)/sqrt(length(barley_sample)) 
  sampling_errors = c(sampling_errors, standard_error) 
  sampling_means = c(sampling_means, mean(barley_sample))
}
# Plot standard error against mean and overlay the fitted regression line.
plot(sampling_means, sampling_errors, xlab = "Sample mean", ylab = "Standard error") 
lmfit = lm(sampling_errors~sampling_means)
abline(lmfit, col = 'red')

Chi square test

  • Discrete, categorical data.
  • Expected cell frequency >= 1.
  • Fewer than 20% of cells have an expected count < 5.
# One-way (goodness-of-fit) test: compare three observed counts with the
# expected proportions given in p.
chisq.test(c(84,82,34),p=c(0.45,0.43,0.12))
## 
##  Chi-squared test for given probabilities
## 
## data:  c(84, 82, 34)
## X-squared = 4.7527, df = 2, p-value = 0.09289
# Test on 3-way data (test of homogeneity).
# Build a 2 x 2 x 2 array of counts (status x sex x Genotype). The data vector
# fills the array with status varying fastest, then sex, then Genotype, as the
# printed table below shows.
ThreeWayData = array(data=c(40,9,34,7,20,15,25,20), dim = c(2,2,2), 
      dimnames =
      list("status"=c("Alive","Dead"),"sex"=c("Male","Female"),
           "Genotype"=c("WT","KO")))
ThreeWayData
## , , Genotype = WT
## 
##        sex
## status  Male Female
##   Alive   40     34
##   Dead     9      7
## 
## , , Genotype = KO
## 
##        sex
## status  Male Female
##   Alive   20     25
##   Dead    15     20
# summary() on a table reports a chi-squared test for independence of all
# factors (see the recorded output).
summary(as.table(ThreeWayData))
## Number of cases in table: 170 
## Number of factors: 3 
## Test for independence of all factors:
##  Chisq = 15.765, df = 4, p-value = 0.003351

Fisher’s exact test

  • Discrete, categorical count data
  • Small data.
# 2 x 2 table of counts: rows are phenotype (Alive/Dead), columns are genotype
# (WT/KO).
KO = data.frame("WT"=c(7,3),"KO"=c(2,7),row.names = c("Alive","Dead"))
# Reshape to long format (one row per cell) for plotting.
KO_plot = as.data.frame(as.table(as.matrix(KO)))
names(KO_plot) = c("Phenotype","Genotype","Freq")
# Bubble plot: point size and colour both encode the cell count.
# NOTE(review): ggplot() comes from ggplot2, which is not loaded anywhere in
# the visible code; presumably it is attached earlier. Confirm.
ggplot(data=KO_plot, aes(x=Genotype,y=Phenotype)) +
  geom_point(aes(size=Freq,color=Freq))

# Fisher's exact test on the 2 x 2 table.
fisher.test(KO)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  KO
## p-value = 0.06978
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##    0.7520079 113.4668907
## sample estimates:
## odds ratio 
##   7.166282

Correlation

  • Continuous data.
  • Linearity
  • Absence of outliers.
  • Normal distribution of the data.
# Continuous data
# NOTE(review): `data` here is a data frame with Height and Weight columns,
# not the barley vector used in the t test section. Where it is loaded is not
# visible in this file; confirm.
summary(data) 
##      Height          Weight     
##  Min.   :63.43   Min.   : 97.9  
##  1st Qu.:66.52   1st Qu.:119.9  
##  Median :67.94   Median :127.9  
##  Mean   :67.95   Mean   :127.2  
##  3rd Qu.:69.20   3rd Qu.:136.1  
##  Max.   :73.90   Max.   :159.0
  # "A quick summary shows that both variables are continuous and numerical."

# Linearity
# Scatter plot of Weight against Height.
plot(data$Height, data$Weight, xlab = "Height", ylab = "Weight")

  # Judged by eye from the plot.

# Absence of outliers
  # Judged by eye from the plot (row 200 was identified as an outlier).
# Drop row 200 from the data frame.
data = data[-200,]

# Normal distribution of the data.
# Shapiro-Wilk test on each variable separately.
shapiro.test(data$Height)
## 
##  Shapiro-Wilk normality test
## 
## data:  data$Height
## W = 0.99415, p-value = 0.627
shapiro.test(data$Weight)
## 
##  Shapiro-Wilk normality test
## 
## data:  data$Weight
## W = 0.99398, p-value = 0.6013
# Test
# Pearson correlation (the cor.test default, as the output header shows).
cor.test(data$Height, data$Weight)
## 
##  Pearson's product-moment correlation
## 
## data:  data$Height and data$Weight
## t = 9.508, df = 197, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4574401 0.6492864
## sample estimates:
##      cor 
## 0.560846

Linear Regression (problem set 29)

  • Continuous data.
  • Linearity.
  • Absence of outliers.
  • Normal distribution of the data.
  • Equality of variance.
# The first four assumptions are checked exactly as in the Correlation section.
# Continuous data. (same)
# Linearity. (same)
# Absence of outliers. (same)
# Normal distribution of the data. (same)

# Equality of variance
# Fit Weight on Height, then plot the residuals against the predictor.
fit = lm(data$Weight~data$Height)
plot(data$Height, resid(fit), xlab = "Height", ylab = "Residuals")
# Overlay a regression line of the residuals on Height.
# NOTE(review): least-squares residuals are uncorrelated with the predictor by
# construction, so this line is always flat. Equal variance has to be judged
# from the vertical spread of the points, not from the line.
fit2 = lm(resid(fit)~data$Height)
abline(fit2, col = 'red', lwd = 3)

# Test
# Refit the same model (identical to `fit` above) and draw it over the data.
fit<-lm(data$Weight~data$Height)
plot(data$Height, data$Weight, xlab = "Height", ylab = "Weight")
abline(fit, col = 'firebrick3', lwd = 3)

ANOVA (coding challenge 1)

  • Independent random sampling on continuous variable.
  • Normality of the sampling distribution.
  • Homoscedasticity (equality of variances).
# Two-way additive ANOVA (no interaction term): Time explained by Gender and
# Origin.
# NOTE(review): `marathon` is not defined in the visible code; presumably it is
# loaded earlier. Confirm.
model <- aov(Time ~ Gender+Origin, data=marathon)

# Independent random sampling on continuous variable.
  # Guaranteed (taken as given; not checked in code).

# Normality of the sampling distribution.
# Histogram of the model residuals.
hist(resid(model))

# Homoscedasticity (equality of variances).
# First diagnostic plot of the fitted model (residuals against fitted values).
plot(model,1)

# Test
summary(model)
##              Df Sum Sq Mean Sq F value  Pr(>F)   
## Gender        1   5.69   5.689   7.681 0.00621 **
## Origin        1   2.82   2.821   3.809 0.05265 . 
## Residuals   167 123.70   0.741                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2020

Journal club - 2D or 3D culture?

10 minute read

Chromosome missegregation and the consequent aneuploidy are lethal to normal cells; paradoxically, however, aneuploidy enhances the fitness of cancer cells.

Markdown demo

2 minute read

This is a collection of basic Markdown syntax, adapted from the Markdown Guide.

Back to top ↑