- k-Nearest Neighbors (kNN)
- R exercise: create a simple kNN classifier
- knn() function from the class library
2017-01-24
Algorithm
- Classify a new point by a majority vote among its k nearest training points (e.g. by Euclidean distance).
- Small k (e.g. k = 1): 0 training error, sensitive to "noise", overfitting problem
- Larger k gives smoother decision boundaries, but too large a k leads to an underfitting problem (see the sketch below)
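A minimal sketch of the effect of k, using knn() from the class library (introduced later in these notes) on a made-up two-class data set; the data and variable names here are purely illustrative:
library(class)
set.seed(1)
## toy data: two overlapping classes (illustrative only)
x = rbind(matrix(rnorm(100, mean = 0), ncol = 2),
          matrix(rnorm(100, mean = 1), ncol = 2))
y = factor(rep(c("A", "B"), each = 50))
for (k in c(1, 15)) {
  pred = knn(x, x, y, k = k)   ## predict the training points themselves
  cat("k =", k, " training error =", mean(pred != y), "\n")
}
## k = 1 memorises the training labels (0 training error, overfits);
## a larger k smooths the boundary, so the training error typically rises.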
set.seed(99)
## prepare the data set
train = data.frame(x1 = runif(20,-1,1), x2 = runif(20,-1,1), label = rep(NA,20))
## assign labels based on x, y coordinates
train[train$x1>0 & train$x2 >0,"label"] = "red" # Quadrant 1
train[train$x1<0 & train$x2 >0,"label"] = "green" # Quadrant 2
train[train$x1<0 & train$x2 <0,"label"] = "orange" # Quadrant 3
train[train$x1>0 & train$x2 <0,"label"] = "blue" # Quadrant 4
## plot the points
plot(train[,1:2],col=train$label,pch=19,asp=1,xlim=c(-1,1),ylim=c(-1,1))
abline(h=0,lty=3)
abline(v=0,lty=3)
## add a point we want to predict
points(0.5,0.5,pch=10)
## knn classifier
p = c(0.5,0.5) # test data
## dist function
distFunc = function(p, data){
  val = c()
  for (i in 1:nrow(data)){
    val = c(val, sqrt(sum((p - data[i,])^2)))   ## Euclidean distance from p to row i
  }
  return(val)
}
#as.matrix(dist(rbind(p,train[,1:2])))[,1] # use the built in dist function
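As a quick sanity check (not part of the original exercise), distFunc() should agree with the built-in dist(); the [-1, 1] index simply drops p's zero distance to itself:
d1 = distFunc(p, train[,1:2])
d2 = as.matrix(dist(rbind(p, train[,1:2])))[-1, 1]  ## drop p's distance to itself
all.equal(as.numeric(d1), as.numeric(d2))           ## should be TRUE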
myKnn = function(p, trainData, labels, k){
  distV = distFunc(p, trainData)        ## calculate distances to all training points
  klabels = labels[order(distV)][1:k]   ## labels of the k nearest neighbours
  ## majority vote and the k smallest distances
  return(list(vote = sort(table(klabels), decreasing = T),
              dist = distV[order(distV)][1:k]))
}
p = c(0.1,-0.5) # test data
myKnn(p,train[,1:2],train[,3],k=3)
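The first entry of the returned vote table is the majority class, so an illustrative one-liner (not part of the original exercise) to get the predicted label is:
pred = names(myKnn(p, train[,1:2], train[,3], k = 3)$vote)[1]  ## majority label
pred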
knn(train, test, cl, k, ...)
- train, test: matrices or data frames of training / test set cases
- cl: factor of true classifications of the training set
- k: number of neighbours considered
library(class)
knn(train[,1:2], p, train[,3], k = 5, prob = T)
## kNN on the Smarket data (ISLR): predict Direction from Lag1 and Lag2
library(class)
library(ISLR)
## ?Smarket
attach(Smarket)
train = (Year < 2005)   ## note: reuses the name `train`, overwriting the toy data frame above
train.X = cbind(Lag1, Lag2)[train,]
test.X = cbind(Lag1, Lag2)[!train,]
train.Direction = Direction[train]
Direction.2005 = Direction[!train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.2005)
mean(knn.pred == Direction.2005)
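As a follow-up sketch (not in the original notes), a common next step is to repeat the fit for several values of k and compare test-set accuracy:
for (k in c(1, 3, 5, 10)) {
  set.seed(1)  ## only affects how ties are broken
  pred = knn(train.X, test.X, train.Direction, k = k)
  cat("k =", k, " accuracy =", mean(pred == Direction.2005), "\n")
}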