| 
  • If you are a citizen of a European Union member nation, you may not use this service unless you are at least 16 years old.

  • You already know Dokkio is an AI-powered assistant to organize & manage your digital files & messages. Very soon, Dokkio will support Outlook as well as One Drive. Check it out today!

View
 

cs229_homework_2_3

Page history last edited by Stephen O'Connell 13 years, 10 months ago

# R Solution to Homework 2, problem 3

# Author: Stephen OConnell

###############################################################################

## rm(list=ls())

 

## R PACKAGE WITH NAIVE BAYES
## library() stops right here if e1071 is not installed; require() would only
## return FALSE and the script would fail later at the naiveBayes() call.
library(e1071)

## SETUP THE WORKING DIRECTORY
## The data files are opened further down via relative file names, so the
## working directory has to point at the homework data. The original
## machine-specific path stays the default; set the HW2_DATA_DIR environment
## variable to run the script against a different location.
data_dir <- Sys.getenv("HW2_DATA_DIR")
if (!nzchar(data_dir)) {
  data_dir <- "/Users/oconste/Downloads/MachineLearning/materials/HW_2_Data"
}
setwd(data_dir)
getwd()

 

## THE naiveBayes FUNCTION NEEDS THE DATA IN A SPECIFIC FORMAT
##
## get_frame() reads a two-line text file and turns it into a data frame:
##   line 1: the class labels, comma separated, optionally wrapped in '[' ']'
##   line 2: the feature values, comma separated, optionally wrapped in
##           '[' ']', laid out row by row (num_features values per sample)
##
## Arguments:
##   input         file name (or connection) handed to readLines()
##   num_features  number of feature columns per sample (default 1448)
##
## Returns a data frame with one row per label: a factor column `spam_ind`
## followed by the numeric feature columns V1 .. V<num_features>.
##
## Stops with an error when the file has fewer than two lines, when a value
## cannot be parsed as a number, when there are no labels, or when the number
## of feature values is not exactly (number of labels) * num_features.
get_frame <- function(input, num_features=1448) {

  ## REMOVE THE ']', '[' AND ' ' FROM ONE LINE, SPLIT IT BY ',' AND CONVERT
  ## THE RESULT TO NUMERIC
  parse_line <- function(line) {
    cleaned <- gsub("\\[|\\]| ", "", line)
    as.numeric(strsplit(cleaned, ",", fixed = TRUE)[[1]])
  }

  ## READ THE DATA IN
  in_line <- readLines(input)
  if (length(in_line) < 2) {
    stop("expected 2 lines (labels, then features) but found ",
         length(in_line), call. = FALSE)
  }

  ## FIRST LINE: THE CLASS LABELS
  labels <- parse_line(in_line[1])
  num_samp <- length(labels)
  if (num_samp == 0) {
    stop("no class labels found on line 1", call. = FALSE)
  }
  if (anyNA(labels)) {
    stop("line 1 contains values that are not numeric", call. = FALSE)
  }

  ## SECOND LINE: THE FEATURE VALUES
  features <- parse_line(in_line[2])
  if (anyNA(features)) {
    stop("line 2 contains values that are not numeric", call. = FALSE)
  }
  ## matrix() would recycle or truncate a mis-sized vector and only warn,
  ## producing a frame whose rows no longer line up with the labels.
  if (length(features) != num_samp * num_features) {
    stop("expected ", num_samp, " x ", num_features, " = ",
         num_samp * num_features, " feature values but found ",
         length(features), call. = FALSE)
  }

  ## NOTE(review): 0 is labelled 'spam' and 1 is labelled 'non_spam'; confirm
  ## this matches the encoding written by the program that produced the file.
  ## Any other label value is kept as its own level, as before.
  spam_ind <- as.character(labels)
  spam_ind[labels == 0] <- "spam"
  spam_ind[labels == 1] <- "non_spam"

  M <- matrix(features, nrow = num_samp, ncol = num_features, byrow = TRUE)
  M <- as.data.frame(M)

  ## LABEL COLUMN FIRST, THEN THE FEATURES
  cbind(data.frame(spam_ind = factor(spam_ind)), M)
}

 

 

## GET THE TEST DATA ( OUTPUT OF MIKE B. python program)
test_spam <- get_frame("MATRIX.TEST.csv")

## GET THE TRAINING DATA SET ( OUTPUT OF MIKE B. python program)
spam_train <- get_frame("MATRIX.TRAIN.1400.csv")

## FIT NAIVE BAYES ON THE TRAINING SET AND PREDICT THE TEST SET
## (column 1 of the frame is the label, so it is dropped before predicting)
model <- naiveBayes(spam_ind ~ ., data = spam_train)
pred <- predict(model, test_spam[, -1])

### CHECK THE RESULTS
table(pred, test_spam$spam_ind)

answer <- pred
solution <- test_spam$spam_ind

## COUNT THE MISCLASSIFIED SAMPLES
## Vectorized instead of looping over 1:length(...), which would iterate over
## c(1, 0) on an empty test set. The labels are compared as character so that
## factors with different level sets cannot raise an error.
error <- sum(as.character(answer) != as.character(solution))

## PRINT THE ERROR RATE
error / length(solution)

 

 

********************************* RESULTS WITH 1400 TRAINING ****************************

> ### CHECK THE RESULTS

> table(pred, test_spam$spam_ind)

 

pred       non_spam spam

  non_spam      353   50

  spam           47  350

 

> ## PRINT THE ERROR RATE

> error/length(test_spam$spam_ind)

[1] 0.12125

 

 

 

Comments (0)

You don't have permission to comment on this page.