30-03-2015

## R project

R is a free, open-source statistical analysis package

You can read in all the standard data formats, csv, xls, xlsx, mat, npy, and probably many more

## Assignment of values and types of objects

Values can be assigned to variables in three different ways that result in (almost) the same:

animal <- 'cat'                        ## leftwards arrow: the conventional style
other_animal = 'also a cat'            ## "=" also assigns at the top level
'a very big cat' -> yet_another_animal ## rightwards arrow: legal but rarely seen

## Concatenations and lists

c()    ## when using "c" values have to be of the same kind
list() ## "lists" are containers that can contain almost anything
integers <- as.integer(c(1, 2))
characters <- c('cat', 'dog')
logical_values <- c(TRUE, FALSE)
numbers <- c(1.5, 3.7)
## FIX: in factor(x, levels) the second positional argument is the set of
## levels, so factor('Male', 'Female') yields a single NA (the value 'Male'
## is not among the levels 'Female'). Wrap both values in c() to build the
## intended two-level factor.
factors <- factor(c('Male', 'Female'))
List <- list(numbers, integers)
mat <- matrix(data=c(1, 2, 3, 4, 5, 6), nrow=2, ncol=3)

The following won't work:

integers <- 1, 2
characters <- 'spam', 'eggs'

## Sequences

The colon can be used to make sequences:

1:10
##  [1]  1  2  3  4  5  6  7  8  9 10

or the function seq() can be used

seq(from=2, to=12, by=3) ## either by size of steps ...
## [1]  2  5  8 11
seq(from=2, to=12, length.out=5) ## or by number of steps
## [1]  2.0  4.5  7.0  9.5 12.0

## Slicing vectors (square brackets)

n <- 7:42 ## a sequence with 36 elements (42 - 7 + 1)
n[1] ## first element -- R indexing starts at 1, not 0
## [1] 7
n[3:4] ## third and fourth element (slice with a range)
## [1]  9 10
n[c(5, 10)] ## fifth and tenth element (slice with an index vector)
## [1] 11 16

## Slicing matrices

(mat <- matrix(11:30, ncol=10))
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]   11   13   15   17   19   21   23   25   27    29
## [2,]   12   14   16   18   20   22   24   26   28    30
mat[1,1] ## that value
## [1] 11
mat[1, ]; mat[, 1] ## row or column
##  [1] 11 13 15 17 19 21 23 25 27 29
## [1] 11 12

## Logical indexing

n[n > 35]
## [1] 36 37 38 39 40 41 42
n[n == 7]
## [1] 7
n[n == 4] ## results in an empty integer object
## integer(0)

## Logical Operators

<: lesser than
>: greater than
!=: not equal to
==: equal to
<=: lesser than or equal to
>=: greater than or equal to
|: or
&: and
!: not

## Flow Control (for and if)

## Print each value 1..5; for the first three, also print ten times the value.
for (i in 1:5) {
  print(i)
  if (i <= 3) {
    print(i * 10)
  }
}
## [1] 1
## [1] 10
## [1] 2
## [1] 20
## [1] 3
## [1] 30
## [1] 4
## [1] 5

## Flow Control (while and if)

## Print the even numbers 2..10, stopping via a flag once i exceeds 10.
do_again <- TRUE
i <- 2
while (do_again) {
  print(i)
  i <- i + 2
  if (i > 10) {
    do_again <- FALSE
  }
}
## [1] 2
## [1] 4
## [1] 6
## [1] 8
## [1] 10

## Data frames

Another very important class of objects is the data frame:

## cars is a dataset included in R, head shows the first n rows
head(cars, n=10)
##    speed dist
## 1      4    2
## 2      4   10
## 3      7    4
## 4      7   22
## 5      8   16
## 6      9   10
## 7     10   18
## 8     10   26
## 9     10   34
## 10    11   17

## Summarizing data frames

## summary statistics of "cars" speed (mph), dist (braking distance in feet)
summary(cars) 
##      speed           dist
##  Min.   : 4.0   Min.   :  2.00
##  1st Qu.:12.0   1st Qu.: 26.00
##  Median :15.0   Median : 36.00
##  Mean   :15.4   Mean   : 42.98
##  3rd Qu.:19.0   3rd Qu.: 56.00
##  Max.   :25.0   Max.   :120.00
str(cars) ## the structure of "cars"
## 'data.frame':    50 obs. of  2 variables:
##  $ speed: num  4 4 7 7 8 9 10 10 10 11 ...
##  $ dist : num  2 10 4 22 16 10 18 26 34 17 ...

## Accessing columns

Columns in data frames can be accessed via the dollar sign:

cars$speed
##  [1]  4  4  7  7  8  9 10 10 10 11 11 12 12 12 12 13 13 13 13 14 14 14 14
## [24] 15 15 15 16 16 17 17 17 18 18 18 18 19 19 19 20 20 20 20 20 22 23 24
## [47] 24 24 24 25
cars$dist
##  [1]   2  10   4  22  16  10  18  26  34  17  28  14  20  24  28  26  34
## [18]  34  46  26  36  60  80  20  26  54  32  40  32  40  50  42  56  76
## [35]  84  36  46  68  32  48  52  56  64  66  54  70  92  93 120  85

## Plotting

And can be plotted rather easily:

## the x ~ y syntax means "x as a function of y"
plot(cars$dist ~ cars$speed) 

## Assessing normality

par(font=2, font.lab=2, font.axis=2, lwd=3, mfrow=c(1, 2))
hist(cars$speed)
qqnorm(cars$speed)
qqline(cars$speed)

## Reading in data

read.csv()
read.xls()  ## in package "gdata"
readMat()   ## in package "R.matlab"
npyLoad()   ## in package "RcppCNPy"

## Installing packages

install.packages() ## e.g. install.packages('R.matlab')

## Linear models and ANOVAs

Introducing formulae:

formula <- y ~ x ## y as a function of x
y ~ 1 ## model the intercept for "y"
y ~ x ## model the main effect of x and the intercept for y
y ~ x + 1 ## the same as above (+ 1 is implicit)
y ~ x + 0 ## model the main effect of x and no intercept
y ~ x - 1 ## the same as above
y ~ 0 ## doesn't model anything
y ~ x + z ## model the main effects x and z (and an intercept)
y ~ x:z ## model the interaction of x and z
y ~ x * z ## model the main effects x and z and their interaction
y ~ x + z + x:z ## the same as above

## Linear model

model <- lm(dist ~ speed, data=cars)
print(model)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Coefficients:
## (Intercept)        speed
##     -17.579        3.932

The parameters can be interpreted as follows. The braking distance when
riding 0 mph (!) is -17.6 feet, and the increase in braking distance per
mph is 3.9 feet. (These are the maximally likely parameters for a flat
prior)

## Plotting the model

Let's plot the linear model on top of the individual points!

plot(dist ~ speed, data=cars)
abline(model)

## Significance testing (1)

The model can be tested for significance by running the "anova" command:

anova(model)
## Analysis of Variance Table
##
## Response: dist
##           Df Sum Sq Mean Sq F value   Pr(>F)
## speed      1  21186 21185.5  89.567 1.49e-12 ***
## Residuals 48  11354   236.5
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Significance testing (2)

summary(model)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -29.069  -9.525  -2.272   9.215  43.201
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12

## Binomial and Poisson variables

Note that "glm" can be used if the dependent variable is binomially
distributed or Poisson distributed:

glm(y ~ x, data=data, family='poisson')
glm(y ~ x, data=data, family='binomial')

## mtcars (data)

Let's look at another of the datasets that come with R, "mtcars", which
has more data about cars:

[, 1]  mpg   Miles/(US) gallon
[, 2]  cyl   Number of cylinders
[, 3]  disp  Displacement (cu.in.)
[, 4]  hp    Gross horsepower
[, 5]  drat  Rear axle ratio
[, 6]  wt    Weight (lb/1000)
[, 7]  qsec  1/4 mile time
[, 8]  vs    V/S
[, 9]  am    Transmission (0 = automatic, 1 = manual)
[,10]  gear  Number of forward gears
[,11]  carb  Number of carburetors

## Miles per gallon and weight

plot(mpg ~ wt, data=mtcars, xlab='weight (lb/1000)')

Now let's create some models!

linear <- lm(mpg ~ wt, data=mtcars)
quadratic <- lm(mpg ~ I(wt^2) + wt, data=mtcars)
cubic <- lm(mpg ~ I(wt^3) + I(wt^2) + wt, data=mtcars)

## Plotting, linear

## 'log Lik.' -80.01471 (df=3)

## Plotting, quadratic

## 'log Lik.' -75.0242 (df=4)

## Plotting, cubic

## 'log Lik.' -75.01827 (df=5)

## Comparing models (log likelihood test)

Linear vs Quadratic:

log.diff <- as.numeric(-2*logLik(linear) + 2*logLik(quadratic))
df.diff <- df.residual(linear) - df.residual(quadratic)
(p.value <- 1 - pchisq(q=log.diff, df=df.diff))
## [1] 0.001581615

Quadratic vs Cubic:

log.diff <- as.numeric(-2*logLik(quadratic) + 2*logLik(cubic))
df.diff <- df.residual(quadratic) - df.residual(cubic)
(p.value <- 1 - pchisq(q=log.diff, df=df.diff))
## [1] 0.9132424

(The package "epicalc" has the function "lrtest" for a likelihood ratio
test)

## Factors vs numeric variables

str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

## A weird model

## "am" is numeric (0/1) at this point, so lm() treats transmission as a
## continuous predictor and plot() draws a scatter plot -- hence "weird".
weird.model <- lm(mpg ~ am, data=mtcars)
plot(mpg ~ am, data=mtcars, xlab='transmission')
abline(weird.model)

## Factors vs numeric variables

## Convert "am" to a factor so that models and plots treat transmission
## as a categorical variable rather than a number.
mtcars$am <- factor(mtcars$am)
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

## A sensible model

sensible.model <- lm(mpg ~ am, data=mtcars)
## Note how the plot command gives a boxplot this time!
plot(mpg ~ am, data=mtcars, xlab='transmission (0=automatic, 1=manual)')

## Comparing models (no difference)

anova(weird.model)
## Analysis of Variance Table
##
## Response: mpg
##           Df Sum Sq Mean Sq F value   Pr(>F)
## am         1 405.15  405.15   16.86 0.000285 ***
## Residuals 30 720.90   24.03
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(sensible.model)
## Analysis of Variance Table
##
## Response: mpg
##           Df Sum Sq Mean Sq F value   Pr(>F)
## am         1 405.15  405.15   16.86 0.000285 ***
## Residuals 30 720.90   24.03
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Comparing models (makes a difference)

weird.model <- lm(mpg ~ gear, data=mtcars)
sensible.model <- lm(mpg ~ factor(gear), data=mtcars)
anova(weird.model)
## Analysis of Variance Table
##
## Response: mpg
##           Df Sum Sq Mean Sq F value   Pr(>F)
## gear       1 259.75 259.749  8.9951 0.005401 **
## Residuals 30 866.30  28.877
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(sensible.model)
## Analysis of Variance Table
##
## Response: mpg
##              Df Sum Sq Mean Sq F value    Pr(>F)
## factor(gear)  2 483.24 241.622  10.901 0.0002948 ***
## Residuals    29 642.80  22.166
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Comparing models

summary(weird.model)
##
## Call:
## lm(formula = mpg ~ gear, data = mtcars)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -10.240  -2.793  -0.205   2.126  12.583
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    5.623      4.916   1.144   0.2618
## gear           3.923      1.308   2.999   0.0054 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.374 on 30 degrees of freedom
## Multiple R-squared:  0.2307, Adjusted R-squared:  0.205
## F-statistic: 8.995 on 1 and 30 DF,  p-value: 0.005401

## Comparing models

summary(sensible.model)
##
## Call:
## lm(formula = mpg ~ factor(gear), data = mtcars)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -6.7333 -3.2333 -0.9067  2.8483  9.3667
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)     16.107      1.216  13.250 7.87e-14 ***
## factor(gear)4    8.427      1.823   4.621 7.26e-05 ***
## factor(gear)5    5.273      2.431   2.169   0.0384 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.708 on 29 degrees of freedom
## Multiple R-squared:  0.4292, Adjusted R-squared:  0.3898
## F-statistic: 10.9 on 2 and 29 DF,  p-value: 0.0002948

## Plotting mpg ~ gear

## More than one effect

additive.model <- lm(mpg ~ wt + factor(gear), data=mtcars)
anova(additive.model)
## Analysis of Variance Table
##
## Response: mpg
##              Df Sum Sq Mean Sq F value    Pr(>F)
## wt            1 847.73  847.73 99.7532 9.891e-11 ***
## factor(gear)  2  40.37   20.19  2.3753    0.1115
## Residuals    28 237.95    8.50
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

So gear does apparently not explain anything.

## Interaction

But the interaction seems to explain something:

interaction.model <- lm(mpg ~ wt * factor(gear), data=mtcars)
anova(interaction.model)
## Analysis of Variance Table
##
## Response: mpg
##                 Df Sum Sq Mean Sq  F value    Pr(>F)
## wt               1 847.73  847.73 133.6587 9.529e-12 ***
## factor(gear)     2  40.37   20.19   3.1826  0.058026 .
## wt:factor(gear)  2  73.05   36.52   5.7585  0.008505 **
## Residuals       26 164.90    6.34
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Plot Interaction Model Fit

## Complex summary

summary(interaction.model)
##
## Call:
## lm(formula = mpg ~ wt * factor(gear), data = mtcars)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -3.7695 -1.7659 -0.2816  1.1631  5.0069
##
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)        28.395      3.212   8.841 2.58e-09 ***
## wt                 -3.157      0.808  -3.907 0.000596 ***
## factor(gear)4      14.098      4.551   3.098 0.004633 **
## factor(gear)5      14.168      5.289   2.679 0.012639 *
## wt:factor(gear)4   -3.707      1.447  -2.562 0.016557 *
## wt:factor(gear)5   -4.889      1.737  -2.815 0.009180 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.518 on 26 degrees of freedom
## Multiple R-squared:  0.8536, Adjusted R-squared:  0.8254
## F-statistic: 30.31 on 5 and 26 DF,  p-value: 4.629e-10

## Binomial Example

binomial.model <- glm(am ~ wt, data=mtcars, family=binomial)
newdata <- data.frame('wt'=seq(0, max(mtcars$wt), length.out=length(mtcars$am)))
predictions.logit <- predict(binomial.model, newdata)
print(predictions.logit)
##           1           2           3           4           5           6
## 12.04036966 11.33630472 10.63223979  9.92817486  9.22410992  8.52004499
##           7           8           9          10          11          12
##  7.81598005  7.11191512  6.40785019  5.70378525  4.99972032  4.29565538
##          13          14          15          16          17          18
##  3.59159045  2.88752552  2.18346058  1.47939565  0.77533071  0.07126578
##          19          20          21          22          23          24
## -0.63279915 -1.33686409 -2.04092902 -2.74499396 -3.44905889 -4.15312382
##          25          26          27          28          29          30
## -4.85718876 -5.56125369 -6.26531863 -6.96938356 -7.67344850 -8.37751343
##          31          32
## -9.08157836 -9.78564330

## Responses

predictions.response <- predict(binomial.model, newdata, type='response')
print(predictions.response)
##             1             2             3             4             5
## 0.99999409892 0.99998806836 0.99997587505 0.99995122163 0.99990137725
##             6             7             8             9            10
## 0.99980060931 0.99959692311 0.99918533219 0.99835414849 0.99667775006
##            11            12            13            14            15
## 0.99330528948 0.98655557827 0.97318441766 0.94722632233 0.89875440232
##            16            17            18            19            20
## 0.81448127953 0.68467290443 0.51780890822 0.34687610926 0.20802623109
##            21            22            23            24            25
## 0.11497216687 0.06036999610 0.03079693764 0.01547210016 0.00771236022
##            26            27            28            29            30
## 0.00382923483 0.00189750023 0.00093934912 0.00046479533 0.00022992822
##            31            32
## 0.00011372900 0.00005625028

## Plot

plot(newdata$wt, predictions.response, type='l', lwd=3,
xlab='weight (lb/1000)',
ylab='Probability of having manual transmission')

## Centring

mtcars$wt.centred <- mtcars$wt - mean(mtcars$wt)
centred.model <- glm(am ~ wt.centred, data=mtcars, family=binomial)
newdata <- data.frame('wt.centred'=seq(min(mtcars$wt.centred),
max(mtcars$wt.centred),
length.out=length(mtcars$am)))
predictions.centred <- predict(centred.model, newdata, type='response')

## Plot centred

plot(newdata\$wt.centred, predictions.centred, type='l', lwd=3,
xlab='Difference from mean weight (lb/1000)',
ylab='Probability of having manual transmission')