# R Solution to Homework 2, problem 3
#
# Author: Stephen OConnell
###############################################################################
## rm(list=ls())
## R PACKAGE WITH NAIVE BAYES
## Load e1071 with library(): unlike require(), it stops right here with a
## clear error when the package is missing, instead of returning FALSE and
## letting the script fail later at the naiveBayes() call.
library(e1071)
## SETUP THE WORKING DIRECTORY
## NOTE(review): this absolute path is machine-specific; the relative data file
## names used later in this script are resolved against it.
setwd("/Users/oconste/Downloads/MachineLearning/materials/HW_2_Data")
getwd()
## THE naiveBayes FUNCTION NEEDS THE DATA IN A SPECIFIC FORMAT
#' Read a label/feature file and build a data frame for naiveBayes().
#'
#' The file must contain at least two lines. Each of the first two lines is a
#' comma-separated list of numbers that may contain '[' , ']' and spaces:
#'   line 1: one class label per sample (0 is mapped to "spam",
#'           1 is mapped to "non_spam"; any other value is kept as text),
#'   line 2: the feature matrix flattened row by row, i.e. exactly
#'           num_samp * num_features values.
#'
#' @param input Path of the file to read (passed to readLines()).
#' @param num_features Number of feature columns per sample.
#' @return A data frame whose first column `spam_ind` is a factor of class
#'   labels, followed by `num_features` numeric columns named V1, V2, ....
get_frame <- function(input, num_features=1448) {
  ## Strip ']', '[' and ' ' from one line, split on ',' and convert to numeric.
  parse_number_line <- function(line) {
    cleaned <- gsub("\\[|\\]| ", "", line)
    as.numeric(strsplit(cleaned, ",")[[1]])
  }

  in_line <- readLines(input)
  ## Without this check a missing second line becomes NA and would silently
  ## turn into a feature matrix filled with NA.
  if (length(in_line) < 2) {
    stop(sprintf("'%s': expected a label line and a feature line, found %d line(s)",
                 input, length(in_line)),
         call. = FALSE)
  }

  label_values <- parse_number_line(in_line[1])
  num_samp <- length(label_values)
  feature_values <- parse_number_line(in_line[2])

  ## matrix() recycles (or truncates) its data when the length does not match
  ## nrow * ncol, which would silently misalign features and samples.
  if (length(feature_values) != num_samp * num_features) {
    stop(sprintf("'%s': found %d feature values, expected %d samples x %d features = %d",
                 input, length(feature_values), num_samp, num_features,
                 num_samp * num_features),
         call. = FALSE)
  }

  ## Translate the numeric labels into class names.
  spam_ind <- as.character(label_values)
  spam_ind[label_values == 0] <- "spam"
  spam_ind[label_values == 1] <- "non_spam"

  ## One row per sample; byrow = TRUE because the file stores the matrix
  ## sample after sample.
  features <- as.data.frame(
    matrix(feature_values, nrow = num_samp, ncol = num_features, byrow = TRUE)
  )

  ## The class column comes first and is a factor.
  cbind(data.frame(spam_ind = factor(spam_ind)), features)
}
## GET THE TEST DATA ( OUTPUT OF MIKE B. python program)
test_spam <- get_frame("MATRIX.TEST.csv")
## GET THE TRAINING DATA SET ( OUTPUT OF MIKE B. python program)
spam_train <- get_frame("MATRIX.TRAIN.1400.csv")
## FIT NAIVE BAYES: spam_ind IS THE CLASS, EVERY OTHER COLUMN IS A PREDICTOR
model <- naiveBayes(spam_ind ~ ., data = spam_train)
## PREDICT FROM THE TEST FEATURES ONLY (COLUMN 1 IS THE LABEL);
## drop = FALSE KEEPS A DATA FRAME EVEN IF ONLY ONE FEATURE COLUMN REMAINS
pred <- predict(model, test_spam[, -1, drop = FALSE])
### CHECK THE RESULTS
## print() so the confusion table also shows up when the file is source()d
print(table(pred, test_spam$spam_ind))
answer <- pred
solution <- test_spam$spam_ind
## COUNT THE MISCLASSIFIED SAMPLES IN ONE VECTORIZED STEP. Labels are compared
## as character because `!=` on two factors fails when their level sets differ
## (e.g. a test file that contains only one of the classes).
error <- sum(as.character(answer) != as.character(solution))
## PRINT THE ERROR RATE
print(error / length(solution))
## ********************************* RESULTS WITH 1400 TRAINING ****************************
## > ### CHECK THE RESULTS
## > table(pred, test_spam$spam_ind)
##
## pred       non_spam spam
##   non_spam      353   50
##   spam           47  350
## > ## PRINT THE ERROR RATE
## > error/length(test_spam$spam_ind)
## [1] 0.12125
##
## (Footer of the wiki page this script was copied from; not part of the script.)
## Comments (0)
## You don't have permission to comment on this page.