30-03-2015

## R project

R is a free, open-source statistical analysis package

You can read in all the standard data formats, csv, xls, xlsx, mat, npy, and probably many more

## Assignment of values and types of objects

Values can be assigned to variables in three different ways that result in (almost) the same:

animal <- 'cat'                        ## leftwards arrow: the conventional style
other_animal = 'also a cat'            ## "=" also assigns at the top level
'a very big cat' -> yet_another_animal ## rightwards arrow: legal but rarely seen

## Concatenations and lists

c()    ## when using "c" values have to be of the same kind
list() ## "lists" are containers that can contain almost anything
integers <- as.integer(c(1, 2))
characters <- c('cat', 'dog')
logical_values <- c(TRUE, FALSE)
numbers <- c(1.5, 3.7)
## FIX: in factor(x, levels) the second positional argument is the set of
## levels, so factor('Male', 'Female') yields a single NA (the value 'Male'
## is not among the levels 'Female'). Wrap both values in c() to build the
## intended two-level factor.
factors <- factor(c('Male', 'Female'))
List <- list(numbers, integers)
mat <- matrix(data=c(1, 2, 3, 4, 5, 6), nrow=2, ncol=3)

The following won't work:

integers <- 1, 2
characters <- 'spam', 'eggs'

## Sequences

The colon can be used to make sequences:

1:10
##  [1]  1  2  3  4  5  6  7  8  9 10

or the function seq() can be used

seq(from=2, to=12, by=3) ## either by size of steps ...
## [1]  2  5  8 11
seq(from=2, to=12, length.out=5) ## or by number of steps
## [1]  2.0  4.5  7.0  9.5 12.0

## Slicing vectors (square brackets)

n <- 7:42 ## a sequence with 36 elements (42 - 7 + 1)
n[1] ## first element -- R indexing starts at 1, not 0
## [1] 7
n[3:4] ## third and fourth element (slice with a range)
## [1]  9 10
n[c(5, 10)] ## fifth and tenth element (slice with an index vector)
## [1] 11 16

## Slicing matrices

(mat <- matrix(11:30, ncol=10))
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]   11   13   15   17   19   21   23   25   27    29
## [2,]   12   14   16   18   20   22   24   26   28    30
mat[1,1] ## that value
## [1] 11
mat[1, ]; mat[, 1] ## row or column
##  [1] 11 13 15 17 19 21 23 25 27 29
## [1] 11 12

## Logical indexing

n[n > 35]
## [1] 36 37 38 39 40 41 42
n[n == 7]
## [1] 7
n[n == 4] ## results in an empty integer object
## integer(0)

## Logical Operators

<: lesser than
>: greater than
!=: not equal to
==: equal to
<=: lesser than or equal to
>=: greater than or equal to
|: or
&: and
!: not

## Flow Control (for and if)

## Print each value 1..5; for the first three, also print ten times the value.
for (i in 1:5) {
  print(i)
  if (i <= 3) {
    print(i * 10)
  }
}
## [1] 1
## [1] 10
## [1] 2
## [1] 20
## [1] 3
## [1] 30
## [1] 4
## [1] 5

## Flow Control (while and if)

## Print the even numbers 2..10, stopping via a flag once i exceeds 10.
do_again <- TRUE
i <- 2
while (do_again) {
  print(i)
  i <- i + 2
  if (i > 10) {
    do_again <- FALSE
  }
}
## [1] 2
## [1] 4
## [1] 6
## [1] 8
## [1] 10

## Data frames

Another very important class of objects is the data frame:

## cars is a dataset included in R, head shows the first n rows
head(cars, n=10)
##    speed dist
## 1      4    2
## 2      4   10
## 3      7    4
## 4      7   22
## 5      8   16
## 6      9   10
## 7     10   18
## 8     10   26
## 9     10   34
## 10    11   17

## Summarizing data frames

## summary statistics of "cars" speed (mph), dist (braking distance in feet)
summary(cars) 
##      speed           dist
##  Min.   : 4.0   Min.   :  2.00
##  1st Qu.:12.0   1st Qu.: 26.00
##  Median :15.0   Median : 36.00
##  Mean   :15.4   Mean   : 42.98
##  3rd Qu.:19.0   3rd Qu.: 56.00
##  Max.   :25.0   Max.   :120.00
str(cars) ## the structure of "cars"
## 'data.frame':    50 obs. of  2 variables:
##  $ speed: num  4 4 7 7 8 9 10 10 10 11 ...
##  $ dist : num  2 10 4 22 16 10 18 26 34 17 ...

## Accessing columns

Columns in data frames can be accessed via the dollar sign:

cars$speed
##  [1]  4  4  7  7  8  9 10 10 10 11 11 12 12 12 12 13 13 13 13 14 14 14 14
## [24] 15 15 15 16 16 17 17 17 18 18 18 18 19 19 19 20 20 20 20 20 22 23 24
## [47] 24 24 24 25
cars$dist
##  [1]   2  10   4  22  16  10  18  26  34  17  28  14  20  24  28  26  34
## [18]  34  46  26  36  60  80  20  26  54  32  40  32  40  50  42  56  76
## [35]  84  36  46  68  32  48  52  56  64  66  54  70  92  93 120  85

## Plotting

And can be plotted rather easily:

## the x ~ y syntax means "x as a function of y"
plot(cars$dist ~ cars$speed) 

## Assessing normality

par(font=2, font.lab=2, font.axis=2, lwd=3, mfrow=c(1, 2))
hist(cars$speed)
qqnorm(cars$speed)
qqline(cars$speed)

## Reading in data

read.csv()
read.xls()  ## in package "gdata"
readMat()   ## in package "R.matlab"
npyLoad()   ## in package "RcppCNPy"

## Installing packages

install.packages() ## e.g. install.packages('R.matlab')

## Linear models and ANOVAs

Introducing formulae:

formula <- y ~ x ## y as a function of x
y ~ 1 ## model the intercept for "y"
y ~ x ## model the main effect of x and the intercept for y
y ~ x + 1 ## the same as above (+ 1 is implicit)
y ~ x + 0 ## model the main effect of x and no intercept
y ~ x - 1 ## the same as above
y ~ 0 ## doesn't model anything
y ~ x + z ## model the main effects x and z (and an intercept)
y ~ x:z ## model the interaction of x and z
y ~ x * z ## model the main effects x and z and their interaction
y ~ x + z + x:z ## the same as above

## Linear model

model <- lm(dist ~ speed, data=cars)
print(model)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Coefficients:
## (Intercept)        speed
##     -17.579        3.932

The parameters can be interpreted as follows. The braking distance when
riding 0 mph (!) is -17.6 feet, and the increase in braking distance per
mph is 3.9 feet. (These are the maximally likely parameters for a flat
prior)

## Plotting the model

Let's plot the linear model on top of the individual points!

plot(dist ~ speed, data=cars)
abline(model)

## Significance testing (1)

The model can be tested for significance by running the "anova" command:

anova(model)
## Analysis of Variance Table
##
## Response: dist
##           Df Sum Sq Mean Sq F value   Pr(>F)
## speed      1  21186 21185.5  89.567 1.49e-12 ***
## Residuals 48  11354   236.5
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Significance testing (2)

summary(model)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -29.069  -9.525  -2.272   9.215  43.201
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12

## Binomial and Poisson variables

Note that "glm" can be used if the dependent variable is binomially
distributed or Poisson distributed:

glm(y ~ x, data=data, family='poisson')
glm(y ~ x, data=data, family='binomial')

## mtcars (data)

Let's look at another of the datasets that come with R, "mtcars", which
has more data about cars:

[, 1]  mpg   Miles/(US) gallon
[, 2]  cyl   Number of cylinders
[, 3]  disp  Displacement (cu.in.)
[, 4]  hp    Gross horsepower
[, 5]  drat  Rear axle ratio
[, 6]  wt    Weight (lb/1000)
[, 7]  qsec  1/4 mile time
[, 8]  vs    V/S
[, 9]  am    Transmission (0 = automatic, 1 = manual)
[,10]  gear  Number of forward gears
[,11]  carb  Number of carburetors

## Miles per gallon and weight

plot(mpg ~ wt, data=mtcars, xlab='weight (lb/1000)')

Now let's create some models!

linear <- lm(mpg ~ wt, data=mtcars)
quadratic <- lm(mpg ~ I(wt^2) + wt, data=mtcars)
cubic <- lm(mpg ~ I(wt^3) + I(wt^2) + wt, data=mtcars)

## Plotting, linear

## 'log Lik.' -80.01471 (df=3)

## Plotting, quadratic

## 'log Lik.' -75.0242 (df=4)

## Plotting, cubic

## 'log Lik.' -75.01827 (df=5)

## Comparing models (log likelihood test)

Linear vs Quadratic:

log.diff <- as.numeric(-2*logLik(linear) + 2*logLik(quadratic))
df.diff <- df.residual(linear) - df.residual(quadratic)
(p.value <- 1 - pchisq(q=log.diff, df=df.diff))
## [1] 0.001581615

Quadratic vs Cubic:

log.diff <- as.numeric(-2*logLik(quadratic) + 2*logLik(cubic))
df.diff <- df.residual(quadratic) - df.residual(cubic)
(p.value <- 1 - pchisq(q=log.diff, df=df.diff))
## [1] 0.9132424

(The package "epicalc" has the function "lrtest" for a likelihood ratio
test)

## Factors vs numeric variables

str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

## A weird model

## "am" is numeric (0/1) at this point, so lm() treats transmission as a
## continuous predictor and plot() draws a scatter plot -- hence "weird".
weird.model <- lm(mpg ~ am, data=mtcars)
plot(mpg ~ am, data=mtcars, xlab='transmission')
abline(weird.model)

## Factors vs numeric variables

## Convert "am" to a factor so that models and plots treat transmission
## as a categorical variable rather than a number.
mtcars$am <- factor(mtcars$am)
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

## A sensible model

sensible.model <- lm(mpg ~ am, data=mtcars)
## Note how the plot command gives a boxplot this time!
plot(mpg ~ am, data=mtcars, xlab='transmission (0=automatic, 1=manual)')

## Comparing models (no difference)

anova(weird.model)
## Analysis of Variance Table
##
## Response: mpg
##           Df Sum Sq Mean Sq F value   Pr(>F)
## am         1 405.15  405.15   16.86 0.000285 ***
## Residuals 30 720.90   24.03
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(sensible.model)
## Analysis of Variance Table
##
## Response: mpg
##           Df Sum Sq Mean Sq F value   Pr(>F)
## am         1 405.15  405.15   16.86 0.000285 ***
## Residuals 30 720.90   24.03
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Comparing models (makes a difference)

weird.model <- lm(mpg ~ gear, data=mtcars)
sensible.model <- lm(mpg ~ factor(gear), data=mtcars)
anova(weird.model)
## Analysis of Variance Table
##
## Response: mpg
##           Df Sum Sq Mean Sq F value   Pr(>F)
## gear       1 259.75 259.749  8.9951 0.005401 **
## Residuals 30 866.30  28.877
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(sensible.model)
## Analysis of Variance Table
##
## Response: mpg
##              Df Sum Sq Mean Sq F value    Pr(>F)
## factor(gear)  2 483.24 241.622  10.901 0.0002948 ***
## Residuals    29 642.80  22.166
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Comparing models

summary(weird.model)
##
## Call:
## lm(formula = mpg ~ gear, data = mtcars)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -10.240  -2.793  -0.205   2.126  12.583
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    5.623      4.916   1.144   0.2618
## gear           3.923      1.308   2.999   0.0054 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.374 on 30 degrees of freedom
## Multiple R-squared:  0.2307, Adjusted R-squared:  0.205
## F-statistic: 8.995 on 1 and 30 DF,  p-value: 0.005401

## Comparing models

summary(sensible.model)
##
## Call:
## lm(formula = mpg ~ factor(gear), data = mtcars)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -6.7333 -3.2333 -0.9067  2.8483  9.3667
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)     16.107      1.216  13.250 7.87e-14 ***
## factor(gear)4    8.427      1.823   4.621 7.26e-05 ***
## factor(gear)5    5.273      2.431   2.169   0.0384 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.708 on 29 degrees of freedom
## Multiple R-squared:  0.4292, Adjusted R-squared:  0.3898
## F-statistic: 10.9 on 2 and 29 DF,  p-value: 0.0002948

## Plotting mpg ~ gear

## More than one effect

additive.model <- lm(mpg ~ wt + factor(gear), data=mtcars)
anova(additive.model)
## Analysis of Variance Table
##
## Response: mpg
##              Df Sum Sq Mean Sq F value    Pr(>F)
## wt            1 847.73  847.73 99.7532 9.891e-11 ***
## factor(gear)  2  40.37   20.19  2.3753    0.1115
## Residuals    28 237.95    8.50
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

So gear does apparently not explain anything.

## Interaction

But the interaction seems to explain something:

interaction.model <- lm(mpg ~ wt * factor(gear), data=mtcars)
anova(interaction.model)
## Analysis of Variance Table
##
## Response: mpg
##                 Df Sum Sq Mean Sq  F value    Pr(>F)
## wt               1 847.73  847.73 133.6587 9.529e-12 ***
## factor(gear)     2  40.37   20.19   3.1826  0.058026 .
## wt:factor(gear)  2  73.05   36.52   5.7585  0.008505 **
## Residuals       26 164.90    6.34
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Plot Interaction Model Fit

## Complex summary

summary(interaction.model)
##
## Call:
## lm(formula = mpg ~ wt * factor(gear), data = mtcars)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -3.7695 -1.7659 -0.2816  1.1631  5.0069
##
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)        28.395      3.212   8.841 2.58e-09 ***
## wt                 -3.157      0.808  -3.907 0.000596 ***
## factor(gear)4      14.098      4.551   3.098 0.004633 **
## factor(gear)5      14.168      5.289   2.679 0.012639 *
## wt:factor(gear)4   -3.707      1.447  -2.562 0.016557 *
## wt:factor(gear)5   -4.889      1.737  -2.815 0.009180 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.518 on 26 degrees of freedom
## Multiple R-squared:  0.8536, Adjusted R-squared:  0.8254
## F-statistic: 30.31 on 5 and 26 DF,  p-value: 4.629e-10

## Binomial Example

binomial.model <- glm(am ~ wt, data=mtcars, family=binomial)
newdata <- data.frame('wt'=seq(0, max(mtcars$wt), length.out=length(mtcars$am)))
predictions.logit <- predict(binomial.model, newdata)
print(predictions.logit)
##           1           2           3           4           5           6
## 12.04036966 11.33630472 10.63223979  9.92817486  9.22410992  8.52004499
##           7           8           9          10          11          12
##  7.81598005  7.11191512  6.40785019  5.70378525  4.99972032  4.29565538
##          13          14          15          16          17          18
##  3.59159045  2.88752552  2.18346058  1.47939565  0.77533071  0.07126578
##          19          20          21          22          23          24
## -0.63279915 -1.33686409 -2.04092902 -2.74499396 -3.44905889 -4.15312382
##          25          26          27          28          29          30
## -4.85718876 -5.56125369 -6.26531863 -6.96938356 -7.67344850 -8.37751343
##          31          32
## -9.08157836 -9.78564330

## Responses

predictions.response <- predict(binomial.model, newdata, type='response')
print(predictions.response)
##             1             2             3             4             5
## 0.99999409892 0.99998806836 0.99997587505 0.99995122163 0.99990137725
##             6             7             8             9            10
## 0.99980060931 0.99959692311 0.99918533219 0.99835414849 0.99667775006
##            11            12            13            14            15
## 0.99330528948 0.98655557827 0.97318441766 0.94722632233 0.89875440232
##            16            17            18            19            20
## 0.81448127953 0.68467290443 0.51780890822 0.34687610926 0.20802623109
##            21            22            23            24            25
## 0.11497216687 0.06036999610 0.03079693764 0.01547210016 0.00771236022
##            26            27            28            29            30
## 0.00382923483 0.00189750023 0.00093934912 0.00046479533 0.00022992822
##            31            32
## 0.00011372900 0.00005625028

## Plot

plot(newdata$wt, predictions.response, type='l', lwd=3,
xlab='weight (lb/1000)',
ylab='Probability of having manual transmission')

## Centring

mtcars$wt.centred <- mtcars$wt - mean(mtcars$wt)
centred.model <- glm(am ~ wt.centred, data=mtcars, family=binomial)
newdata <- data.frame('wt.centred'=seq(min(mtcars$wt.centred),
max(mtcars$wt.centred),
length.out=length(mtcars$am)))
predictions.centred <- predict(centred.model, newdata, type='response')

## Plot centred

plot(newdata\$wt.centred, predictions.centred, type='l', lwd=3,
xlab='Difference from mean weight (lb/1000)',
ylab='Probability of having manual transmission')