diff --git a/index.html b/index.html new file mode 100644 index 0000000..08e7fa0 --- /dev/null +++ b/index.html @@ -0,0 +1,313 @@ + + + + +
+ + + + + + + + + +First, we download the training and test datasets and load them in through the read.csv function. During my exploratory data analysis, I saw that blank values, “NA”, and “#DIV/0!” often show up in data columns so I have decided to treat all of these values as NA.
# load packag
+library(caret)
+# download data
+if(!file.exists("pml-training.csv")){
+ download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",
+ destfile = "pml-training.csv", method = "curl")
+}
+if(!file.exists("pml-testing.csv")){
+ download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv",
+ destfile = "pml-testing.csv", method = "curl")
+}
+# load data
+train <- read.csv("pml-training.csv", header = TRUE, na.strings=c("","NA", "#DIV/0!"))
+test <- read.csv("pml-testing.csv", header = TRUE, na.strings=c("","NA", "#DIV/0!"))
+In order to run the machine learning algorithms, the features used cannot contain any NA values. To see which variables/features should be used, I calculated the percentage of NA’s for each column.
# see error percentage
+NAPercent <- round(colMeans(is.na(train)), 2)
+table(NAPercent)
+## NAPercent
+## 0 0.98 1
+## 60 94 6
+From above, we can see that only 60 variables have complete data so those are the variables we will use to build the prediction algorithm. I removed the first variable here because it is the row index from the csv file and not a true variable.
+# find index of the complete columns minus the first
+index <- which(NAPercent==0)[-1]
+# subset the data
+train <- train[, index]
+test <- test[, index]
+# looking at the structure of the data for the first 10 columns
+str(train[, 1:10])
+## 'data.frame': 19622 obs. of 10 variables:
+## $ user_name : Factor w/ 6 levels "adelmo","carlitos",..: 2 2 2 2 2 2 2 2 2 2 ...
+## $ raw_timestamp_part_1: int 1323084231 1323084231 1323084231 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 ...
+## $ raw_timestamp_part_2: int 788290 808298 820366 120339 196328 304277 368296 440390 484323 484434 ...
+## $ cvtd_timestamp : Factor w/ 20 levels "02/12/2011 13:32",..: 9 9 9 9 9 9 9 9 9 9 ...
+## $ new_window : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
+## $ num_window : int 11 11 11 12 12 12 12 12 12 12 ...
+## $ roll_belt : num 1.41 1.41 1.42 1.48 1.48 1.45 1.42 1.42 1.43 1.45 ...
+## $ pitch_belt : num 8.07 8.07 8.07 8.05 8.07 8.06 8.09 8.13 8.16 8.17 ...
+## $ yaw_belt : num -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...
+## $ total_accel_belt : int 3 3 3 3 3 3 3 3 3 3 ...
+From the structure of the data, we can see that the first 6 variables user_name, raw_timestamp_part_1, raw_timestamp_part_2, cvtd_timestamp, new_window, num_window are simply administrative parameters and are unlikely to help us predict the activity the subjects are performing. Therefore, we are going to leave those 6 columns out before we build the algorithm. In addition, to make the columns easier to deal with, we will go ahead and convert all features to numeric class.
# subset the data
+train <- train[, -(1:6)]
+test <- test[, -(1:6)]
+# convert all numerical data to numeric class
+for(i in 1:(length(train)-1)){
+ train[,i] <- as.numeric(train[,i])
+ test[,i] <- as.numeric(test[,i])
+}
+Forthis project, we will focus on using the two most widely-used, most accurate prediction algorithms,
+We set test set aside and split the train data into two sections for cross validation. We will allocate 80% of the data to train the model and 20% to validate it.
We expect that the out-of-bag (OOB) error rates returned by the models should be good estimate for the out of sample error rate. We will get actual estimates of error rates from the accuracies achieved by the models.
+# split train data set
+inTrain <- createDataPartition(y=train$classe,p=0.8, list=FALSE)
+trainData <- train[inTrain,]
+validation <- train[-inTrain,]
+# print out the dimentions of the 3 data sets
+rbind(trainData = dim(trainData), validation = dim(validation), test = dim(test))
+## [,1] [,2]
+## trainData 15699 53
+## validation 3923 53
+## test 20 53
+First, We will use random forest to build the first model. Because the algorithm is computationally intensive, we will leverage parallel processing using multiple cores through the doMC package
# load doMC package
+library(doMC)
+# set my cores
+registerDoMC(cores = 8)
+# load randomForest package
+library(randomForest)
+# run the random forest algorithm on the training data set
+rfFit <- randomForest(classe~., data = trainData, method ="rf", prox = TRUE)
+rfFit
+##
+## Call:
+## randomForest(formula = classe ~ ., data = trainData, method = "rf", prox = TRUE)
+## Type of random forest: classification
+## Number of trees: 500
+## No. of variables tried at each split: 7
+##
+## OOB estimate of error rate: 0.43%
+## Confusion matrix:
+## A B C D E class.error
+## A 4462 2 0 0 0 0.0004480287
+## B 16 3018 4 0 0 0.0065832785
+## C 0 8 2725 5 0 0.0047479912
+## D 0 0 24 2546 3 0.0104935873
+## E 0 0 1 4 2881 0.0017325017
+# use model to predict on validation data set
+rfPred <- predict(rfFit, validation)
+# predicted result
+confusionMatrix(rfPred, validation$classe)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction A B C D E
+## A 1116 2 0 0 0
+## B 0 756 3 0 0
+## C 0 1 681 6 0
+## D 0 0 0 637 1
+## E 0 0 0 0 720
+##
+## Overall Statistics
+##
+## Accuracy : 0.9967
+## 95% CI : (0.9943, 0.9982)
+## No Information Rate : 0.2845
+## P-Value [Acc > NIR] : < 2.2e-16
+##
+## Kappa : 0.9958
+## Mcnemar's Test P-Value : NA
+##
+## Statistics by Class:
+##
+## Class: A Class: B Class: C Class: D Class: E
+## Sensitivity 1.0000 0.9960 0.9956 0.9907 0.9986
+## Specificity 0.9993 0.9991 0.9978 0.9997 1.0000
+## Pos Pred Value 0.9982 0.9960 0.9898 0.9984 1.0000
+## Neg Pred Value 1.0000 0.9991 0.9991 0.9982 0.9997
+## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
+## Detection Rate 0.2845 0.1927 0.1736 0.1624 0.1835
+## Detection Prevalence 0.2850 0.1935 0.1754 0.1626 0.1835
+## Balanced Accuracy 0.9996 0.9975 0.9967 0.9952 0.9993
+Next, we will try the Generalized Boosted Regression Models.
+# run the generalized boosted regression model
+gbmFit <- train(classe~., data = trainData, method ="gbm", verbose = FALSE)
+gbmFit
+## Stochastic Gradient Boosting
+##
+## 15699 samples
+## 52 predictor
+## 5 classes: 'A', 'B', 'C', 'D', 'E'
+##
+## No pre-processing
+## Resampling: Bootstrapped (25 reps)
+##
+## Summary of sample sizes: 15699, 15699, 15699, 15699, 15699, 15699, ...
+##
+## Resampling results across tuning parameters:
+##
+## interaction.depth n.trees Accuracy Kappa Accuracy SD
+## 1 50 0.7481717 0.6807386 0.007517094
+## 1 100 0.8152710 0.7662110 0.004916848
+## 1 150 0.8500706 0.8102767 0.004760952
+## 2 50 0.8519370 0.8124713 0.005877966
+## 2 100 0.9061600 0.8812564 0.004217447
+## 2 150 0.9304904 0.9120490 0.003690489
+## 3 50 0.8956435 0.8678982 0.004276404
+## 3 100 0.9404469 0.9246541 0.004019593
+## 3 150 0.9594901 0.9487596 0.003121579
+## Kappa SD
+## 0.009517534
+## 0.006225865
+## 0.006044945
+## 0.007440065
+## 0.005345304
+## 0.004675461
+## 0.005406066
+## 0.005087412
+## 0.003945901
+##
+## Tuning parameter 'shrinkage' was held constant at a value of 0.1
+## Accuracy was used to select the optimal model using the largest value.
+## The final values used for the model were n.trees = 150,
+## interaction.depth = 3 and shrinkage = 0.1.
+# use model to predict on validation data set
+gbmPred <- predict(gbmFit, validation)
+# predicted result
+confusionMatrix(gbmPred, validation$classe)
+## Confusion Matrix and Statistics
+##
+## Reference
+## Prediction A B C D E
+## A 1095 25 0 3 2
+## B 17 713 24 6 9
+## C 2 21 652 26 6
+## D 2 0 7 606 3
+## E 0 0 1 2 701
+##
+## Overall Statistics
+##
+## Accuracy : 0.9602
+## 95% CI : (0.9536, 0.9661)
+## No Information Rate : 0.2845
+## P-Value [Acc > NIR] : < 2.2e-16
+##
+## Kappa : 0.9497
+## Mcnemar's Test P-Value : 9.725e-05
+##
+## Statistics by Class:
+##
+## Class: A Class: B Class: C Class: D Class: E
+## Sensitivity 0.9812 0.9394 0.9532 0.9425 0.9723
+## Specificity 0.9893 0.9823 0.9830 0.9963 0.9991
+## Pos Pred Value 0.9733 0.9272 0.9222 0.9806 0.9957
+## Neg Pred Value 0.9925 0.9854 0.9900 0.9888 0.9938
+## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
+## Detection Rate 0.2791 0.1817 0.1662 0.1545 0.1787
+## Detection Prevalence 0.2868 0.1960 0.1802 0.1575 0.1795
+## Balanced Accuracy 0.9852 0.9608 0.9681 0.9694 0.9857
+From the above, we can see that randomForest is the better performing algorithm with 0.43% out-of-bag (OOB) error rate, which is what we expect the out of sample error rate to be. When applied to the validation set for cross validation, the model achieved an accuracy of 99.7%, which indicates the actual error rate is 0.3%, where as GBM has an accuracy of 96.0% with error rate of 4.0%.
+We can apply the randomForest model to the 20 given test set for the predictions. The results were all correct.
+# apply random forest model to test set
+predict(rfFit, test)
+## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+## B A B A A E D B A A B C B A E E A B B B
+## Levels: A B C D E
+