"""Bag-of-words sentiment classifier for restaurant reviews.

Pipeline:
1. Load ``Restaurant_Reviews.tsv`` (tab-separated; text in the ``Review``
   column, label in the second column).
2. Clean each review: keep letters only, lowercase, drop English stop
   words, and Porter-stem the remaining words.
3. Vectorize the cleaned corpus with ``CountVectorizer`` (at most 1500
   features).
4. Train a ``GaussianNB`` classifier on 80% of the rows and print the
   predictions, the true labels, and the accuracy on the held-out 20%.

Requires the NLTK stop-word corpus; if it is missing, run
``import nltk; nltk.download('stopwords')`` once beforehand.
"""

import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are kept as
# ordinary text instead of being interpreted as field delimiters.
dataset = pd.read_csv("Restaurant_Reviews.tsv", delimiter="\t", quoting=3)

ps = PorterStemmer()

# Loop invariants, built once: the original rebuilt the stop-word set for
# every single word of every review.
non_letters = re.compile(r'[^a-zA-Z]')
english_stopwords = set(stopwords.words('english'))

# Iterate over the actual column rather than a hard-coded range(0, 1000), so
# the corpus always has exactly one entry per dataset row (and therefore
# matches the length of Y below).
corpus = []
for raw_review in dataset['Review']:
    review = non_letters.sub(' ', raw_review)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

# Bag-of-words features: one column per retained vocabulary term.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
Y = dataset.iloc[:, 1].values

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=0
)

classifier = GaussianNB()
classifier.fit(X_train, Y_train)

Y_pred = classifier.predict(X_test)
print(Y_pred)
print(Y_test)

# Accuracy: number of matching predictions over the test-set size. The
# element-wise comparison replaces the manual index loop; int() keeps
# ``count`` a plain Python integer so the printed ratio is unchanged.
count = int(np.sum(Y_pred == Y_test))
print(count / len(Y_pred))