| 
  • If you are a citizen of a European Union member nation, you may not use this service unless you are at least 16 years old.

  • Whenever you search in PBworks or on the Web, Dokkio Sidebar (from the makers of PBworks) will run the same search in your Drive, Dropbox, OneDrive, Gmail, Slack, and browsed web pages. Now you can find what you're looking for wherever it lives. Try Dokkio Sidebar for free.

View
 

cs229_homework_2_3

Page history last edited by Stephen O'Connell 12 years, 10 months ago

# R Solution to Homework 2, problem 3

# Author: Stephen OConnell

###############################################################################

## rm(list=ls())

 

## R PACKAGE WITH NAIVE BAYES
## library() is used instead of require(): it stops right here when e1071 is
## not installed, rather than returning FALSE and letting the script fail
## later at the naiveBayes() call.
library(e1071)

## SETUP THE WORKING DIRECTORY
## The data files below are opened by relative name, so they are resolved
## against this directory.
## NOTE(review): machine-specific absolute path; adjust before running
## elsewhere.
setwd("/Users/oconste/Downloads/MachineLearning/materials/HW_2_Data")

## Echo the directory that is now in effect.
getwd()

 

## THE naiveBayes FUNCTION NEEDS THE DATA IN A SPECIFIC FORMAT

## Read a two-line label/feature dump and build the data frame that is
## passed to naiveBayes() / predict().
##
## Arguments:
##   input        - file name or connection handed to readLines(). Line 1
##                  holds the labels and line 2 the feature values, each as a
##                  comma-separated list of numbers. Any '[', ']' and ' '
##                  characters are stripped before parsing.
##   num_features - number of feature columns per sample (default 1448).
##
## Returns:
##   A data frame with one row per label: a factor column 'spam_ind'
##   followed by num_features numeric columns (V1, V2, ...). The feature line
##   is read row by row, i.e. the first num_features values belong to the
##   first sample.
##
## Errors:
##   Stops when the input has fewer than two lines, or when the number of
##   feature values is not (number of labels) * num_features. Without that
##   check matrix() would recycle or truncate the values.
get_frame <- function(input, num_features = 1448) {
  ## Strip '[', ']' and ' ' from one line, split on ',' and convert to numeric.
  parse_numeric_line <- function(line) {
    cleaned <- gsub("[][ ]", "", line)
    as.numeric(strsplit(cleaned, ",")[[1]])
  }

  ## READ THE DATA IN
  in_line <- readLines(input)
  if (length(in_line) < 2) {
    stop("input must contain a label line followed by a feature line",
         call. = FALSE)
  }

  ## LINE 1: LABELS
  labels <- parse_numeric_line(in_line[1])
  num_samp <- length(labels)

  ## Map the numeric codes to class names; any other value is kept as its
  ## character form.
  ## NOTE(review): 0 is labelled 'spam' and 1 'non_spam' -- confirm this
  ## matches the encoding used by the program that writes these files.
  label_text <- as.character(labels)
  label_text[labels == 0] <- "spam"
  label_text[labels == 1] <- "non_spam"

  ## LINE 2: FEATURES, ONE SAMPLE AFTER ANOTHER
  features <- parse_numeric_line(in_line[2])
  if (length(features) != num_samp * num_features) {
    stop("expected ", num_samp * num_features, " feature values (",
         num_samp, " samples x ", num_features, " features) but found ",
         length(features), call. = FALSE)
  }
  feature_frame <- as.data.frame(
    matrix(features, nrow = num_samp, ncol = num_features, byrow = TRUE)
  )

  ## LABEL COLUMN FIRST, THEN THE FEATURES
  out_frame <- cbind(
    data.frame(spam_ind = label_text, stringsAsFactors = FALSE),
    feature_frame
  )
  out_frame$spam_ind <- as.factor(out_frame$spam_ind)

  out_frame
}

 

 

## GET THE TEST DATA (OUTPUT OF MIKE B.'S PYTHON PROGRAM)
test_spam <- get_frame("MATRIX.TEST.csv")

## GET THE TRAINING DATA SET (OUTPUT OF MIKE B.'S PYTHON PROGRAM)
spam_train <- get_frame("MATRIX.TRAIN.1400.csv")

## FIT NAIVE BAYES ON THE TRAINING SET, USING EVERY COLUMN EXCEPT spam_ind
## AS A PREDICTOR
model <- naiveBayes(spam_ind ~ ., data = spam_train)

## PREDICT ON THE TEST SET; COLUMN 1 (spam_ind) IS DROPPED SO ONLY THE
## FEATURES ARE PASSED IN
pred <- predict(model, test_spam[, -1])

### CHECK THE RESULTS
## Confusion matrix: rows are predictions, columns are the true labels.
table(pred, test_spam$spam_ind)

answer <- pred
solution <- test_spam$spam_ind

## COUNT THE MISCLASSIFIED TEST SAMPLES
## Vectorised comparison instead of an index loop; this also avoids the
## 1:length(x) pitfall when the test set is empty.
error <- sum(answer != solution)

## PRINT THE ERROR RATE
error / length(solution)

 

 

********************************* RESULTS WITH 1400 TRAINING ****************************

> ### CHECK THE RESULTS

> table(pred, test_spam$spam_ind)

 

pred       non_spam spam

  non_spam      353   50

  spam           47  350

 

> ## PRINT THE ERROR RATE

> error/length(test_spam$spam_ind)

[1] 0.12125

 

 

 

Comments (0)

You don't have permission to comment on this page.