# A Crash Course in R

This code has been kindly contributed by Robin Edwards (from UCL CASA).

There are many useful introductory guides out there to R, but below is the kind of thing I now wish I’d been given when I first started using it – something with simple logically-progressive examples and minimal explanatory text. Copy the text below into a new script in R and run line-by-line to give a quick intro to many of R’s most basic principles and functionality. You can also download a text file with it here. It is by no means comprehensive, even at the most basic level, but still I hope someone finds it useful. You may want to look at RStudio as it is more user-friendly.

## A CRASH COURSE IN [R] PROGRAMMING

## Robin Edwards (geotheory.co.uk), March 2013

## In RStudio run through line-by-line using Ctrl + Enter

# basic R environmental functions

# create some objects. In RStudio they'll appear in 'Workspace'
x <- 3.14159
y <- 'hello world'
z <- TRUE

ls() # list the objects in the Workspace

print(y) # print information to R 'Console'

rm(y) # remove an object

rm(list = ls()) # remove all

getwd() # find current working directory

# set working directory as preferred (this example path only exists on the
# author's machine - substitute a directory of your own or setwd() will fail)
setwd("/Users/robinedwards/Documents")

print ( "R ignores the 'white-space' in command syntax" )

# use '?' for help on any R function (if its library is loaded in the session)

?max

??csv # search for a text string in R documentation library

library(help = utils) # get help on a particular package (list its functions)

# 'str' is a powerful tool for investigating the underlying structure of any R object

str(max)

# CREATING AND MANIPULATING R OBJECTS

# assigning values to variables
# (the '<-' lines were stripped from this copy of the script; restored here)

n = 5  # is possible but
n <- 5 # is the preferred form
5 -> n # right-hand assignment also works

rm(n)

# R objects can be of various data types, but probably most common are 'numeric' and 'character'
# (wrapping an assignment in parentheses also prints the result)

( num <- 3.14159 ) # value reconstructed - any number will do
( char <- 'any text string' )

# create a VECTOR (array) using the 'c()' concatenate function
# (the values on the two 'vec' lines were lost from this copy; these are
# reconstructions chosen so that every example below returns something useful)

( vec <- c(1, 2, 3, 4, 5, 6) )

# a vector series

( vec <- seq(5, 50, by = 5) )

# R vectors can be accessed in various ways using [ ] brackets

vec[3]

vec[3:6]

vec[ c(1, 3, 8) ]

vec[vec > 15]

# check a vector contains a value

5 %in% vec

12 %in% vec

# finding first index position of a matching value/string
# (note the number 3 is coerced to the string '3': a vector has one data type)

( x <- c('one', 'five', 'two', 3, 'two') )

match('two', x)

match(c('two', 'five'), x)

# a MATRIX is a 2D vector (essentially a vector of vectors) of matching data type

( matrx <- matrix(1:15, 3, 5) )

# the same matrix can be made by giving a plain vector a 'dim' attribute
# (right-hand sides reconstructed - they were stripped from this copy)

( matrx <- 1:15 )
dim(matrx) <- c(3, 5)
print(matrx)

t(matrx) # a matrix can be easily transposed

# an ARRAY is a generic vector but with more flexibility. A 1D array is the same as a normal vector,
# and a 2D array is like a matrix. But arrays can store data with 'n' dimensions:
# (the dimensions below are reconstructed; they satisfy every index used in this section)

( arry <- array(1:24, dim = c(3, 4, 2)) )

# Using square brackets on arrays

arry[12] # a single criterion (argument) selects the array's n'th record

arry[3, 1, 2] # or use multiple arguments that reflect the array's dimensionality

arry[, , 2] # leaving an argument empty selects everything along that dimension

arry[, 1, ]

# a DATA.FRAME is like a matrix, but accommodates fields (columns) with different data types
# (the contents were lost from this copy; the values are reconstructed. Only the
# column name 'name' and the renamed columns 'person' and 'years' are known from
# the rest of the script. stringsAsFactors = TRUE keeps the text column a factor,
# as the factor-editing section further down expects; since R 4.0.0 the default is FALSE)

(df <- data.frame(name = c('Matt', 'Jane', 'Bill'),
                  age = c(31, 28, 45),
                  stringsAsFactors = TRUE))

# They can be viewed easily (the viewer needs an interactive session)

if (interactive()) View(df)

# examine their internal structure

str(df)

# data interrogation with square brackets

df[1, ]

df[2:3, ]

df[, 1]

df[2, 1]

# data.frame and matrix objects can have field (column) and record (row) names

dimnames(df)

colnames(df)

names(df) # not for matrix objects

row.names(df)

# interrogate data.frames by field name using the '$' operator. the result is a simple vector

df$name

df$name[2]

# names can be reassigned

names(df) <- c('person', 'years')
row.names(df) <- c('first', 'second', 'third')
print(df)

# check dimensions of vector/matrix/array/data.frame objects
# ('vec', 'df' and 'arry' are the objects created earlier in this script)

length(vec) # number of elements in a vector

dim(df) # rows then columns

dim(arry) # one entry per dimension of the array

nrow(df) # number of records (rows)

ncol(df) # number of fields (columns)

# R has various inbuilt data.frame datasets used to illustrate how functions operate e.g.

data()

InsectSprays # this guide makes use of these datasets

warpbreaks

# examine contents

head(InsectSprays) # list the top records of a vector / matrix / d.f.

tail(InsectSprays, n = 3) # the bottom 3

summary(InsectSprays) # summarise a data vector

# aggregate() is a powerful function for summarising categorical data

aggregate(InsectSprays$count, by = list(InsectSprays$spray), FUN = mean)

# keep the per-spray totals; the plotting section reads them as
# sumInsects$group and sumInsects$sum
# (right-hand sides reconstructed - they were stripped from this copy)

sumInsects <- aggregate(InsectSprays$count, by = list(InsectSprays$spray), FUN = sum)
names(sumInsects) <- c('group', 'sum')
print(sumInsects)

# subset/apply filter to a data.frame

warpbreaks[warpbreaks$wool == 'A', ] # by 1 condition

warpbreaks[warpbreaks$tension %in% c('L', 'M'), ] # multiple conditions

# adding entries is possible (if a bit tricky)
# (both statements are reconstructed - only the names 'newrow' and 'warpbreaks'
# survived in this copy. The new record must use existing factor levels)

(newrow <- data.frame(breaks = 42, wool = 'B', tension = 'M'))
(warpbreaks <- rbind(warpbreaks, newrow))

# but LISTS are better at this

lst <- list()

# ways to assign/add items

lst[1] <- "one"

lst[[2]] <- "two"

lst[length(lst) + 1] <- "three"

print(lst)

# data retrieval

lst[[1]] # double brackets means the object returned is of the data class of the list item

lst[2:3] # selecting more than 1 list item is possible with single brackets..

lst[c(1, 3)] # but the object returned (from single bracket interrogation) is a list

# delete list items (assigning NULL removes them)
# (right-hand sides reconstructed - they were stripped from this copy)

lst[[3]] <- NULL
lst[1:2] <- NULL
lst

# entries can be any object type (like python), including other lists (double bracketing)
# (contents of the nested list reconstructed)

lst[[1]] <- list('item1a', 'item1b')
lst[[2]] <- 'item2'

lst

lst[[1]][[1]]

# Data in lists can also be stored and recalled by key word/number (like Python's dictionary class)
# (values reconstructed; the keys 'mon', 'tues' and 'wed' are the ones used below)

dict <- list(mon = 'Monday', tues = 'Tuesday')
dict['wed'] <- 'Wednesday'
print(dict)

dict[['tues']]

dict[c('mon', 'wed')]

# reorder a vector with 'sort'
# (the unsorted values were lost from this copy; these are reconstructed)

vec <- c(7, 3, 12, 1, 9)
sort(vec)

# or a dataframe with 'order'
# ('df' is the data.frame built earlier in this script; order() returns the
# row positions that would sort the 'years' column)

df[order(df$years), ]

# LOGICAL objects (booleans) are binary true/false objects that facilitate conditional data processing
# (both right-hand sides reconstructed - they were stripped from this copy)

(bool <- TRUE)
(bool <- 5 > 10) # a comparison returns a logical

# query an object's data/structure type with 'class()'
# ('num', 'char' and 'df' are the objects created earlier in this script)

class(bool)

class(num) # numeric is the default data type for number objects

class(as.integer(num)) # integer class exists but is not default

class(char) # character class

class('237') # numbers aren't always numeric type

as.numeric('237') # but can be converted

as.character(237) # and vice versa

# Child-objects are often of different class to parents

class(df)

class(df[, 2])

class(df[, 1])

# FACTOR objects are vectors of items that have been categorised by unique values
# (the example values were lost from this copy; these are reconstructed)

factr <- factor(c(10, 25, 10, 40, 25))
str(factr)

levels(factr)

table(factr)

# you may encounter problems converting a factor of numeric data to numeric type

as.numeric(factr) # gives the internal level codes, not the original numbers

# instead do this

as.numeric(as.character(factr))

# editing factors can be tricky
# ('df' is the data.frame built earlier in this script)

print(df)

# if 'person' is a factor this only warns and stores NA, because 'Matthew' is
# not an existing level
df$person[1] <- 'Matthew'

# instead convert to character or numeric etc
# (the conversion calls are reconstructed - they were stripped from this copy)

df$person <- as.character(df$person)
df$person[1] <- 'Matthew'

df$person <- as.factor(df$person)
levels(df$person)

# LOGICAL OPERATIONS

2 + 2 == 4 # '==' denotes value equality

3 <= 2 # less than or equal to

3 >= 2 # greater than or equal to

'string' == "string" # single and double quotes are interchangeable

'b' >= 'a' # strings can be ranked

3 != 3 # NOT operator

c(4, 2, 6) == c(4, 2, 8) # vector comparisons return logical vectors

TRUE == T # 'T' and 'F' default as boolean shortcuts (until overwritten)

TRUE & TRUE # AND operator

TRUE | FALSE # OR operator

F | F

# IF/ELSE statement (used in most logical procedures)
# (the value assigned to x was lost from this copy; 3 is a reconstruction)

x <- 3
if (x < 5) {
  print('x is less than 5')
} else {
  print('x is not less than 5')
}

# inside if() prefer the scalar operators '||' and '&&'
if (TRUE || FALSE) print('single liners can dispense with curly brackets')

if (TRUE && FALSE) print('') else print("but then 'else' only works on the same line")

# LOOPING FUNCTIONS - very useful for handling repetitive operations

# 'FOR' loop

for (i in 1:10) {
  print(paste('number ', i))
}

# WHILE loop (be careful to include safeguards to prevent infinite loops)

i <- 30

while (i > 0) {
  print(paste('number ', i))
  i <- i - 3 # plain ASCII minus; the typographic dash in this copy does not parse
}

# creating a function
# (the function header was stripped from this copy; the two-argument signature
# is reconstructed from the call multiply(3,5) below)

# multiply: returns the product of its two arguments a and b
multiply <- function(a, b) {
  tot <- a * b # 'tot' only exists inside the function
  return(tot)
}

multiply(3, 5)

# note 'tot' wasn't remembered outside the function - functions are contained environments

# if required use '<<-' for global assignment but be careful not to overwrite R's internal objects

# it's generally better to do this:

newVar <- multiply(3, 5)

# handling 'NA' values

print(x <- 1:5)

x[8] <- 8 # assigning past the end pads the gap (positions 6 and 7) with NA

x[3] <- NA

print(x) # sometimes functions will fail because of NA values

na.omit(x) # returns the values with the NAs dropped

# indexes x by its own non-NA values; this gives the non-NA values here only
# because each remaining value equals its own position
x[na.omit(x)]

is.na(x) # alternatively, build a logical mask of the missing entries

x[!is.na(x)] # and keep everything that is not missing

# useful basic math functions
# ('vec' is the numeric vector created earlier in this script)

seq(-2, 2, by = 0.2) # sequence of equal difference

seq(from = -5, by = 0.2, length = 10) # with range defined by vector length

rnorm(20, mean = 0, sd = 1) # random normal distribution

runif(20, min = 0, max = 100) # array of random numbers

sample(0:100, 20, replace = TRUE) # array of random integers

table(warpbreaks[, 2:3]) # array summary stats (powerful summary tool)

# summary statistics of a numeric vector
min(vec)
max(vec)
range(vec)
mean(vec)
median(vec)
sum(vec)
prod(vec)

abs(-5) # magnitude

sd(rnorm(10)) # standard deviation

4^2 # square

sqrt(16) # square root

5 %% 3 # modulo (remainder after subtraction of any multiple)

6 %% 2

# useful for running an operation every n'th cycle
for (i in 1:100) {
  if (i %% 20 == 0) {
    print(i)
  }
}

# Importing and exporting data using comma-separated file
# ('df' is the data.frame built earlier in this script)

write.csv(df, 'example.csv') # save to csv file (written to the working directory)

rm(df)

# read it back in (call reconstructed - it was stripped from this copy).
# write.csv() also saved the row names, so they come back as an extra first column
(df <- read.csv('example.csv'))

# PLOTTING IN R

# some basic functionality

plot(1:10)

plot(sort(rnorm(100)), pch = 16, cex = 0.5) # specifying point and size respectively

plot(x = 1:25, y = 25:1, pch = 1:25) # x & y inputs, and showing the available point symbols

plot(sin, -pi, 2 * pi)

hist(rnorm(1000), breaks = 50)

# 'sumInsects' is the aggregated InsectSprays table built earlier in this script
barplot(sumInsects$sum, names.arg = sumInsects$group)

pie(sumInsects$sum, labels = sumInsects$group)

# plots with more visual components are built up incrementally
# (the data assigned to x was lost from this copy; 1:10 is a reconstruction)

x <- 1:10
plot(x, pch = 17)

lines(x, col = '#00FF00') # add a line to the existing plot

points(x + 5, pch = 16, col = 'red') # add a second, shifted series of points

# stacking charts

warpbreaks

# total breaks for every wool/tension combination
# (the aggregate call, the column names and the matrix construction are
# reconstructed - only the object names survived in this copy)

sumWB <- aggregate(warpbreaks$breaks,
                   by = list(warpbreaks$wool, warpbreaks$tension), FUN = sum)
names(sumWB) <- c('wool', 'tension', 'breaks')
sumWB

# barplot() wants a matrix: one row per stack segment (tension L, M, H) and one
# column per bar (wool A, B). sumWB lists wool A then B within each tension, so
# the values are filled in row by row
(data <- matrix(sumWB$breaks, nrow = 3, byrow = TRUE))

barplot(data, names.arg = c('Group A', 'Group B'),
        legend.text = c('L', 'M', 'H'), args.legend = list(x = "right"))

barplot(data, names.arg = c('Group A', 'Group B'), beside = TRUE,
        legend.text = c('L', 'M', 'H'), args.legend = list(x = "topright"))

# 'symbols()' is a good way to represent a 3rd data dimension (use square root for area proportionality)
# (the start of this statement was lost from this copy. A 'city' column is needed
# by the text() call below; the city names are inferred from the coordinates and
# should be checked against the original script)

(cities <- data.frame(city = c('London', 'Bristol', 'Manchester', 'Leeds'),
                      lon = c(-0.1, -2.6, -2.2, -1.5),
                      lat = c(51.5, 51.4, 53.5, 53.8),
                      pop = c(8, 1, 2.7, 0.8)))

symbols(x = cities$lon, y = cities$lat, circles = sqrt(cities$pop), inches = 0.3,
        bg = 'red', fg = NULL, asp = TRUE, xlab = 'Longitude', ylab = 'Latitude')

# light grid lines drawn over the existing plot
abline(h = (seq(51, 53, 1)), col = "lightgray", lty = 1)

abline(v = (seq(-4, 1, 1)), col = "lightgray", lty = 1)

# label each circle slightly above its centre
text(x = cities$lon, y = cities$lat + 0.2, labels = cities$city)

# But for much easier and more elegant data visualisation use GGPLOT2

# END OF SCRIPT

You may want to check the text-as-it-appeared against the original — a number of brackets there and the second assignment operator won’t work.

Thanks- will do that.

Excellent. I was just wishing aloud for a crash course in R when this popped up in my reader. Many thanks.

While certainly more elegant, and powerful, ggplot2 wouldn’t be characterized as much easier, even by Hadley.

Fair point, ggplot does have an initial learning curve. The somewhat misleading ‘much easier’ refers to functional elegance!

Thanks, but there is something wrong with the code

probably copy-n-paste error. there’s something missing with the part introducing vectors:

====

# R objects can be of various data types, but probably most common are ‘numeric’ and ‘character’

( num ( char <- ‘any text string’ )

# create a VECTOR (array) using the ‘c()’ concatenate function

( vec = c(1,2,3,4,5,6) )

# a vector series

( vec = list( vec, vec, vec, vec) )

=====

??