R basics

getting help

  • help about function
help(functionName) or ?functionName  
  • searching the help documentation for a given character string
 help.search(string) or ??string
  • list all functions whose names contain a given string
apropos("str",mode = "function")
apropos("plot",mode="function")
##  [1] "assocplot"           "barplot"             "barplot.default"    
##  [4] "biplot"              "boxplot"             "boxplot.default"    
##  [7] "boxplot.matrix"      "boxplot.stats"       "cdplot"             
## [10] "coplot"              "fourfoldplot"        "interaction.plot"   
## [13] "lag.plot"            "matplot"             "monthplot"          
## [16] "mosaicplot"          "plot"                "plot.default"       
## [19] "plot.design"         "plot.ecdf"           "plot.function"      
## [22] "plot.new"            "plot.spec.coherency" "plot.spec.phase"    
## [25] "plot.stepfun"        "plot.ts"             "plot.window"        
## [28] "plot.xy"             "preplot"             "qqplot"             
## [31] "recordPlot"          "replayPlot"          "savePlot"           
## [34] "screeplot"           "spineplot"           "sunflowerplot"      
## [37] "termplot"            "ts.plot"

Variable assignment

  • "<-" (recommended)
a <- "hello"
print(a)
## [1] "hello"
"hi" -> b
print(b)
## [1] "hi"
  • "=" also works
c = 10; d = 11
print(c+d)
## [1] 21
  • Assign a value to a variable whose name is given as a character string
assign("aString",1)
aString
## [1] 1
# Create the variables v1 ... v5: build each name as a string, then assign to it.
for (i in 1:5){
  varName = paste0("v",i) # paste0() concatenates with no separator
  assign(varName,i)
}
  • clear the workspace
ls() ## list all the variables in the workspace
##  [1] "a"       "aString" "b"       "c"       "d"       "i"       "v1"     
##  [8] "v2"      "v3"      "v4"      "v5"      "varName"
rm(variableName)
rm(list=ls()) # clear the environment; It's a good habit to put it at top of your script

comment

One line

  • add "#" before comment

Multi lines

  • have to "#" each line
  • Rstudio "Ctrl+Shift+C" / "Command+Shift+C"
  • "if (FALSE)" may be a helpful trick
# The condition is always FALSE, so the body (a multi-line string) is never
# evaluated. FALSE is spelled out because F is an ordinary variable that can
# be reassigned.
if (FALSE) {
"
  ...
  ...
      "
}

Packages

Install package

  • install.packages("packageName")
install.packages("ISLR")
  • "Packages" pane in Rstudio

Load package

  • library / require both work
library("ISLR")
require("ISLR")

Data Structures

  • Vector
  • Matrix
  • Array
  • Data Frame
  • Factor
  • List

vector

  • one dimension
  • all elements in a vector need to be same data type
# a vector of numeric values
a = 1:9
length(a)
## [1] 9
a
## [1] 1 2 3 4 5 6 7 8 9
b = c(1,3,4,10,11)
b
## [1]  1  3  4 10 11
# character strings
chVec = c("hello","hi","a","abc")
chVec
## [1] "hello" "hi"    "a"     "abc"
is.vector(chVec)
## [1] TRUE
# logical values
logicVec = c(TRUE,TRUE,FALSE,TRUE,FALSE)
logicVec
## [1]  TRUE  TRUE FALSE  TRUE FALSE

matrix

  • two dimensions
  • all elements should be same data type

  • Syntax

matrix(vector,nrow,ncol,byrow,dimnames)

  • Examples

m = matrix(1:12,4,3)
m
##      [,1] [,2] [,3]
## [1,]    1    5    9
## [2,]    2    6   10
## [3,]    3    7   11
## [4,]    4    8   12
is.matrix(m)
## [1] TRUE
m[3,2] # element at 3rd row, 2nd column
## [1] 7
m[2,] # the 2nd row
## [1]  2  6 10
m[,2:3] # the 2nd,3rd columns
##      [,1] [,2]
## [1,]    5    9
## [2,]    6   10
## [3,]    7   11
## [4,]    8   12
m[2:4,c(1,3)]
##      [,1] [,2]
## [1,]    2   10
## [2,]    3   11
## [3,]    4   12
rbind(m,m) # combine by rows
##      [,1] [,2] [,3]
## [1,]    1    5    9
## [2,]    2    6   10
## [3,]    3    7   11
## [4,]    4    8   12
## [5,]    1    5    9
## [6,]    2    6   10
## [7,]    3    7   11
## [8,]    4    8   12
cbind(m,m) # combine by columns
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    5    9    1    5    9
## [2,]    2    6   10    2    6   10
## [3,]    3    7   11    3    7   11
## [4,]    4    8   12    4    8   12
  • Matrix Computation
m = matrix(c(3,2,-2,2,5,2,2,8,4),3,3) # create a square matrix
m
##      [,1] [,2] [,3]
## [1,]    3    2    2
## [2,]    2    5    8
## [3,]   -2    2    4
t(m) # transpose
##      [,1] [,2] [,3]
## [1,]    3    2   -2
## [2,]    2    5    2
## [3,]    2    8    4
solve(m) # inverse
##       [,1]  [,2]   [,3]
## [1,] -0.50  0.50 -0.750
## [2,]  3.00 -2.00  2.500
## [3,] -1.75  1.25 -1.375
det(m)  # determinant
## [1] -8
## * and %*% are different
m * m
##      [,1] [,2] [,3]
## [1,]    9    4    4
## [2,]    4   25   64
## [3,]    4    4   16
m %*% m # this is the matrix multiplication !!
##      [,1] [,2] [,3]
## [1,]    9   20   30
## [2,]    0   45   76
## [3,]  -10   14   28

array

  • array can have more than two dimensions
  • Syntax
array(vector, dimensions, dimnames)
array(1:6) # a vector
## [1] 1 2 3 4 5 6
array(1:6,dim=c(2,3)) # a matrix
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
array(1:24, dim=c(2,3,4)) # 3 dimensions
## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]    7    9   11
## [2,]    8   10   12
## 
## , , 3
## 
##      [,1] [,2] [,3]
## [1,]   13   15   17
## [2,]   14   16   18
## 
## , , 4
## 
##      [,1] [,2] [,3]
## [1,]   19   21   23
## [2,]   20   22   24
array(1:24, dim=c(2,3,4))[1,,]
##      [,1] [,2] [,3] [,4]
## [1,]    1    7   13   19
## [2,]    3    9   15   21
## [3,]    5   11   17   23

Data Frame

  • different columns can contain different types of data
  • each column can only have one data type
# create a data.frame with an integer, a character and a logical column
# (TRUE/FALSE are spelled out: T and F are ordinary variables that can be reassigned)
testDF = data.frame(col1 = c(1:5), col2 = letters[1:5], col3=c(TRUE,TRUE,FALSE,FALSE,TRUE))
testDF
##   col1 col2  col3
## 1    1    a  TRUE
## 2    2    b  TRUE
## 3    3    c FALSE
## 4    4    d FALSE
## 5    5    e  TRUE
# change the colnames and rownames
colnames(testDF) = c("number","character","logic")
rownames(testDF) = paste("row",1:5,sep="")
testDF
##      number character logic
## row1      1         a  TRUE
## row2      2         b  TRUE
## row3      3         c FALSE
## row4      4         d FALSE
## row5      5         e  TRUE
testDF$number
## [1] 1 2 3 4 5
testDF[c("row1","row3"),c("number","character")]
##      number character
## row1      1         a
## row3      3         c
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00
  • a data frame can't be used directly in matrix multiplication
    • as.matrix() first

Factor

  • categorical data
  • stored as a vector of character strings and a vector of integers
a = c(0,1,0,2,1,0,1,2,1)
factor(a)
## [1] 0 1 0 2 1 0 1 2 1
## Levels: 0 1 2
str(factor(a))
##  Factor w/ 3 levels "0","1","2": 1 2 1 3 2 1 2 3 2

List

  • most complex
  • gather a variety of objects under one name
  • Syntax
list(name1 = object1, name2 = object2,...)
  • Example
testList =  list(n = c(2, 3, 5),
                char = c("aa", "bb", "cc", "dd", "ee"),
                bool = c(TRUE, FALSE, TRUE, FALSE, FALSE),
                m = matrix(1:9,3,3),
                alist = list(name=c("a","b"),gender=c("male","female")))

testList
## $n
## [1] 2 3 5
## 
## $char
## [1] "aa" "bb" "cc" "dd" "ee"
## 
## $bool
## [1]  TRUE FALSE  TRUE FALSE FALSE
## 
## $m
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
## 
## $alist
## $alist$name
## [1] "a" "b"
## 
## $alist$gender
## [1] "male"   "female"
testList[[5]]
## $name
## [1] "a" "b"
## 
## $gender
## [1] "male"   "female"
testList[["m"]]
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
testList$char
## [1] "aa" "bb" "cc" "dd" "ee"
  • useful for functions with multiple outputs

Import/Export data

Working Directory

  • read or write files to a specific location
  • avoid using full filepath

  • get working directory

getwd()
  • set working directory
    • setwd()
    • Rstudio: Session -> set working directory
setwd("~")
getwd()

Import/Export data

  • Import
    • read.table
    • read.csv , read.delim
  • Example

http://www-bcf.usc.edu/~gareth/ISL/data.html

# read the comma-separated file Auto.csv from the working directory;
# the first line holds the column names, and quote="" disables quote handling
autoData = read.table("Auto.csv",header=TRUE,sep=",",quote="")
head(autoData)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
  • Export
    • write.table
# write autoData to Auto2.txt using " ; " as separator, without quoting and without row names
write.table(autoData,file="Auto2.txt",quote=FALSE,sep=" ; ",row.names=FALSE)
  • For small data sets
    • fix() or edit()
# empty data frame with typed columns; name stays character instead of becoming a factor
students = data.frame(name=character(),age=numeric(),grade=numeric(),stringsAsFactors = FALSE)
fix(students)

Control Flow

if

  • Syntax
if (condition) {
   statement1
} else {
   statement2
}
  • Example
x <- 0
if (x < 0) {
   print("Negative number")
} else if (x > 0) {
   print("Positive number")
} else {
   print("Zero")
}
## [1] "Zero"
  • Logical Operators

  • Example

year = 2017
# A leap year is divisible by 4 but not by 100, or divisible by 400.
# && and || are the scalar, short-circuiting operators meant for if() conditions;
# & and | work element-wise on vectors.
if ((year %% 4 == 0 && year %% 100 != 0) || year %% 400 == 0) {
  print(paste(year,"is a leap year"))
} else {
  print("no")
}
## [1] "no"

Loops

  • for (most common)
  • while
  • repeat
  • break and next

for (most common)

  • Syntax

for (var in range) {
    statement
}
  • Example: Find the leap years
# print every leap year between 2000 and 2020
for (i in 2000:2020){
  # divisible by 4 but not by 100, or divisible by 400;
  # && / || are the scalar, short-circuiting operators meant for if()
  if ((i %% 4 == 0 && i %% 100 != 0) || i %% 400 == 0){
    print(paste(i,"is a leap year"))
  }
}
## [1] "2000 is a leap year"
## [1] "2004 is a leap year"
## [1] "2008 is a leap year"
## [1] "2012 is a leap year"
## [1] "2016 is a leap year"
## [1] "2020 is a leap year"

while

  • Syntax
while (condition){
  statement
}
  • Example
x = 5
while(x <= 20){
  print(x)
  x = x+5
}
## [1] 5
## [1] 10
## [1] 15
## [1] 20
  • Be careful to avoid infinite loops

repeat

  • repeat {statement}
  • need to use break
x = 5
repeat{
  print(x)
  x = x+5
  if (x > 20) break
}
## [1] 5
## [1] 10
## [1] 15
## [1] 20

"break" and "next"

  • "break": break the current loop
for (i in 1:6){
  if (i==5){
    break
  }
  print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
  • "next": skip to next iteration
for (i in 1:6){
  if (i==5){
    next
  }
  print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 6

Create your Functions

User-defined function

  • Syntax

testFunc = function(arg1,arg2,...){
  statements
  return(something)
}

  • Example

# Return the sum of the squares of x and y.
# Works element-wise, so x and y may be single numbers or numeric vectors.
sumSquare = function(x,y){
    return(x^2 + y^2)
}

sumSquare(3,4)
## [1] 25

Exercise: create a function for finding leap years

  • input: startYear, endYear
  • output: return a vector of all the leap years between startYear and endYear
  • Use while or repeat for looping

Some Useful Functions for Statistics

Arithmetic

  • Arithmetic Operators

  • Mathematic Functions

Vectorized Arithmetic

testVect = c(1,3,5,2,9,10,7,8,6)

min(testVect) # minimum
## [1] 1
max(testVect) # maximum
## [1] 10
mean(testVect) # mean
## [1] 5.666667
median(testVect) # median
## [1] 6
quantile(testVect) # quantile
##   0%  25%  50%  75% 100% 
##    1    3    6    8   10
var(testVect) #variance
## [1] 10
sd(testVect) # standard deviation
## [1] 3.162278
vect1 = cars$speed
vect2 = cars$dist

cov(vect1,vect2) # covariance
## [1] 109.9469
cor(vect1,vect2) # correlation coefficient
## [1] 0.8068949

Probability Distributions

  • key words
    • d : density (returns the height of the pdf)
    • p : distribution function (returns the cdf)
    • q : quantile function (returns the inverse cdf)
    • r : random generation
  • distributions
    • binom : Binomial Distribution
    • pois : Poisson Distribution
    • unif : Uniform Distribution
    • exp : Exponential Distribution
    • norm : Normal Distribution
    • chisq : Chi-Squared Distribution
    • t : t Distribution
    • f : F Distribution
  • Examples
# binomial
dbinom(2, size=10, prob=0.2) 
## [1] 0.3019899
dbinom(0, size=10, prob=0.2) + dbinom(1, size=10, prob=0.2) + dbinom(2, size=10, prob=0.2) 
## [1] 0.6777995
pbinom(2,size=10,prob=0.2)
## [1] 0.6777995
runif(6,min=1,max=2)
## [1] 1.609645 1.247878 1.108959 1.249730 1.623926 1.522166
qt(c(.025, .975), df=4)
## [1] -2.776445  2.776445
qf(.95, df1=3, df2=4) 
## [1] 6.591382
normSamples = rnorm(1000,mean=5,sd = 3)
hist(normSamples)

  • set.seed() makes the results reproducible even when using random number generation
set.seed(100)
rnorm(5)
## [1] -0.50219235  0.13153117 -0.07891709  0.88678481  0.11697127
rnorm(5)
## [1]  0.3186301 -0.5817907  0.7145327 -0.8252594 -0.3598621
set.seed(100) # reproduce the results
rnorm(5)
## [1] -0.50219235  0.13153117 -0.07891709  0.88678481  0.11697127
rnorm(5)
## [1]  0.3186301 -0.5817907  0.7145327 -0.8252594 -0.3598621

some packages (for fun)

  • sqldf
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
library(ISLR)

sqlCars = sqldf(" select name,origin,year,cylinders,horsepower
                  from Auto 
                  where cylinders = 8 and horsepower > 200
                  order by horsepower
                ")
## Loading required package: tcltk
sqlCars
##                            name origin year cylinders horsepower
## 1               mercury marquis      1   72         8        208
## 2                    dodge d200      1   70         8        210
## 3             plymouth fury iii      1   70         8        215
## 4                     ford f250      1   70         8        215
## 5  chrysler new yorker brougham      1   73         8        215
## 6              chevrolet impala      1   70         8        220
## 7              pontiac catalina      1   70         8        225
## 8       buick estate wagon (sw)      1   70         8        225
## 9      buick electra 225 custom      1   73         8        225
## 10           pontiac grand prix      1   73         8        230
  • quantmod
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
getSymbols("AAPL",src="yahoo",from = Sys.Date()-500)
##     As of 0.4-0, 'getSymbols' uses env=parent.frame() and
##  auto.assign=TRUE by default.
## 
##  This  behavior  will be  phased out in 0.5-0  when the call  will
##  default to use auto.assign=FALSE. getOption("getSymbols.env") and 
##  getOptions("getSymbols.auto.assign") are now checked for alternate defaults
## 
##  This message is shown once per session and may be disabled by setting 
##  options("getSymbols.warning4.0"=FALSE). See ?getSymbols for more details.
## [1] "AAPL"
tail(AAPL)
##            AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume
## 2017-01-24    119.55    120.10   119.50     119.97    23150200
## 2017-01-25    120.42    122.10   120.28     121.88    32377600
## 2017-01-26    121.67    122.44   121.60     121.94    26282000
## 2017-01-27    122.14    122.35   121.60     121.95    20437400
## 2017-01-30    120.93    121.63   120.66     121.63    30278800
## 2017-01-31    121.15    121.39   120.62     121.35    41610600
##            AAPL.Adjusted
## 2017-01-24        119.97
## 2017-01-25        121.88
## 2017-01-26        121.94
## 2017-01-27        121.95
## 2017-01-30        121.63
## 2017-01-31        121.35
barChart(AAPL)

getQuote("AAPL",src="yahoo")
##               Trade Time     Last Change % Change  Open   High    Low
## AAPL 2017-02-01 04:00:00 128.7908 7.4408 +6.1317% 127.1 130.49 127.01
##        Volume
## AAPL 11590137