나이브 베이즈 python code 인공지능 및 기계학습 개론Ⅰ : edwith

babibubebo 2019.06.26

안녕하세요. 교수님 매트랩 코드를 파이썬으로 만들어 보았습니다.

저는 컴공과 출신이 아니라서 이 코드는 많이 부족할 거예요.

그래도 올바르게 만들어진 코드인지 궁금해서 올립니다.

가감없는 조언 댓글 부탁드리겠습니다.

저는 72%의 test_accuracy가 나왔습니다. np.random.shuffle로 데이터를 무작위로 섞기 때문에 실행할 때마다 약간씩 변동이 있을 것으로 예상됩니다.

naivebayesclassifier.py

import scipy.io
import numpy as np
import pandas as pd

#numword = len(nb_data['word'])
# numtesting = 50

class NBClassifier:
    """Two-class Bernoulli naive Bayes classifier for bag-of-words data.

    Each word is reduced to a binary feature ("occurs at least once in the
    document" or not). Labels must be 0 or 1.

    Attributes set by ``fit``:
        probXbyY: array of shape (numword, 2); ``probXbyY[j, k]`` is the
            smoothed estimate of P(word j present | class k).
        probY: array of shape (2,); ``probY[k]`` is the estimate of P(class k).
    """

    # Pseudo-count added to every (word, class) cell so that no conditional
    # probability is exactly 0 or 1 (either would make a log term infinite).
    _SMOOTHING = 1e-3

    def __init__(self, numword=29717, numtesting=50, probXbyY=None, probY=None):
        """Store the configuration and, optionally, pre-computed parameters.

        Args:
            numword: number of leading columns of X to use as features.
                ``None`` means "use every column of the data".
            numtesting: stored on the instance; not used by any method here.
            probXbyY: optional pre-computed P(word present | class) table.
            probY: optional pre-computed class priors.
        """
        self.numword = numword
        self.numtesting = numtesting
        self.probXbyY = probXbyY
        self.probY = probY

    @staticmethod
    def _presence(X, numword):
        """Return a boolean (documents x numword) matrix: word count >= 1."""
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D (documents x words) array")
        if numword is None:
            numword = X.shape[1]
        if X.shape[1] < numword:
            raise ValueError(
                "X has {} columns but numword is {}".format(X.shape[1], numword)
            )
        return X[:, :numword] >= 1

    def fit(self, X, y):
        """Estimate ``probXbyY`` and ``probY`` from training data.

        Args:
            X: (documents x words) array of word counts.
            y: labels (0 or 1), shaped (documents,) or (documents, 1).

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if X is not 2-D or has fewer than ``numword`` columns,
                the training set is empty, X and y disagree in length, or a
                label is not 0/1.
        """
        present = self._presence(X, self.numword)
        labels = np.asarray(y).ravel()
        if labels.size == 0:
            raise ValueError("cannot fit on an empty training set")
        if labels.size != present.shape[0]:
            raise ValueError("X and y must contain the same number of documents")
        if not np.isin(labels, (0, 1)).all():
            raise ValueError("labels must be 0 or 1")
        labels = labels.astype(int)

        alpha = self._SMOOTHING
        # Number of training documents in each class.
        cntY = np.bincount(labels, minlength=2).astype(float)
        # cntXbyY[j, k]: number of class-k documents that contain word j.
        cntXbyY = np.column_stack(
            [present[labels == k].sum(axis=0) for k in (0, 1)]
        )

        # The pseudo-count goes into the denominator too (once per outcome:
        # present / absent). Without it, a word that occurs in every document
        # of a class gets a "probability" above 1, and log(1 - p) in predict()
        # turns into NaN.
        self.probXbyY = (cntXbyY + alpha) / (cntY + 2 * alpha)
        self.probY = cntY / cntY.sum()
        return self

    def predict(self, X):
        """Predict the class of every row of X.

        Args:
            X: (documents x words) array of word counts.

        Returns:
            Float array of shape (documents, 1) holding 0.0 or 1.0. A row is
            labelled 0 only when class 0 has the strictly larger log score;
            ties go to class 1.

        Raises:
            ValueError: if the model has no parameters yet, or X is not 2-D
                or has fewer than ``numword`` columns.
        """
        if self.probXbyY is None or self.probY is None:
            raise ValueError("model is not fitted; call fit() first")
        probXbyY = np.asarray(self.probXbyY, dtype=float)
        probY = np.asarray(self.probY, dtype=float)
        numword = probXbyY.shape[0] if self.numword is None else self.numword
        present = self._presence(X, numword)
        probXbyY = probXbyY[:numword]

        log_present = np.log(probXbyY)
        log_absent = np.log(1 - probXbyY)
        log_prior = np.log(probY)

        # Bernoulli log-likelihood: log p for a present word, log(1 - p) for
        # an absent one. No log(count) term is added: it would be identical
        # for both classes and so cannot influence the decision.
        logprobsenti = np.empty((present.shape[0], 2))
        for k in (0, 1):
            per_word = np.where(present, log_present[:, k], log_absent[:, k])
            logprobsenti[:, k] = per_word.sum(axis=1) + log_prior[k]

        estsenti = np.where(logprobsenti[:, 0] > logprobsenti[:, 1], 0.0, 1.0)
        return estsenti.reshape(-1, 1)

    def get_acc(self, estsentiment, y):
        """Return the fraction of entries of y matched by estsentiment.

        Args:
            estsentiment: predicted labels, shaped (n,) or (n, 1).
            y: true labels, shaped (n,) or (n, 1).

        Returns:
            Accuracy as a float in [0, 1].

        Raises:
            ValueError: if there are fewer predictions than labels.
            ZeroDivisionError: if y is empty.
        """
        truth = np.asarray(y).ravel()
        guess = np.asarray(estsentiment).ravel()
        if guess.size < truth.size:
            raise ValueError("fewer predictions than labels")
        cntCorrect = int(np.count_nonzero(guess[:truth.size] == truth))
        return cntCorrect / truth.size

# Load the sentiment dataset from the working directory.
# Keys, as described by the original author's notes:
#   nb_data['word']:      corpus
#   nb_data['bagofword']: X, one feature vector per document
#   nb_data['sentiment']: Y
nb_data = scipy.io.loadmat('sentimentdataset.mat')

# Hold out the last `numtesting` documents of a random permutation as the
# test set; everything before them is used for training.
numtesting = 50
indx = np.arange(len(nb_data['sentiment']))
np.random.shuffle(indx)
train_ind = indx[0:-numtesting]
test_ind = indx[-numtesting:]

X_total = nb_data['bagofword']
y_total = nb_data['sentiment']
X_train = X_total[train_ind]
y_train = y_total[train_ind]
X_test = X_total[test_ind]
y_test = y_total[test_ind]

# Take the vocabulary size from the data instead of relying on the
# classifier's hard-coded default, so a different dataset also works.
nb_ex = NBClassifier(numword=X_total.shape[1], numtesting=numtesting)
nb_ex = nb_ex.fit(X_train, y_train)
y_pred = nb_ex.predict(X_test)

print('The accuracy of Naive Bayes Classifier is {:.2f}%.'.format(nb_ex.get_acc(y_pred, y_test)*100))

인공지능 및 기계학습 개론Ⅰ

comment