In [2]:
import numpy as np
import pandas as pd

In [3]:
df=pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

df=df.head(10000)

In [4]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [5]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
df.duplicated().sum()

418

In [7]:
df=df.drop_duplicates()

In [8]:
df.duplicated().sum()

0

# **Removing HTML Tags**

In [9]:
import re
def remove_tags(text):
    return re.sub(re.compile('<.*?>'),'',text)

In [10]:
df['review']=df['review'].apply(remove_tags)

In [11]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# **Lowercase**

In [12]:
df['review']=df['review'].apply(lambda x:x.lower())

In [13]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# **Removing Stopwords**

In [14]:
from nltk.corpus import stopwords

In [15]:
sw_list=stopwords.words('english')

In [16]:
df['review']=df['review'].apply(lambda x:[item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

In [17]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


# **Removing Numbers**

In [18]:
df['review']=df['review'].apply(lambda x:' '.join([i for i in x.split() if not i.isdigit()]))

In [19]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode ho...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


# **Removing Punctuation**

In [20]:
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

In [21]:
df['review']=df['review'].apply(remove_punctuation)

In [22]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode ho...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive


# **Removing Contractions**

In [23]:
!pip install contractions



In [24]:
import contractions
def remove_contractions(text):
    return contractions.fix(text)


In [25]:
df['review']=df['review'].apply(remove_contractions)

In [26]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode ho...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there is family little boy jake thin...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
49995,thought movie right good job creative original...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary schools n...,negative
49998,i am going disagree previous comment side malt...,negative


In [27]:
x=df.drop(columns='sentiment')
y=df['sentiment']

In [28]:
x

Unnamed: 0,review
0,one reviewers mentioned watching oz episode ho...
1,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...
3,basically there is family little boy jake thin...
4,petter matteis love time money visually stunni...
...,...
49995,thought movie right good job creative original...
49996,bad plot bad dialogue bad acting idiotic direc...
49997,catholic taught parochial elementary schools n...
49998,i am going disagree previous comment side malt...


In [29]:
y

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 49582, dtype: object

In [30]:
from sklearn.preprocessing import LabelEncoder

In [31]:
y=LabelEncoder().fit_transform(y)

In [32]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [33]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=3,stratify=y)

In [34]:
print(x_train.shape,x_test.shape)

(39665, 1) (9917, 1)


# Bag of Word

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
cv=CountVectorizer(max_features=10000)

In [37]:
x_train

Unnamed: 0,review
17185,watching avalon which decent nice digital fx s...
12989,rarely denzil washington make bad movie come t...
31628,think movie reasonbaly good kind of weird olse...
12399,movie is horrible wonderful time first saw yea...
33230,watching the bodyguard last night felt compell...
...,...
31515,good cast with one major exception pushes way ...
19133,seldom see short comments written imdb filmgoe...
47930,say without shadow doubt going overboard singl...
35145,wife watched dvring encore action past week wo...


In [38]:
x_train=cv.fit_transform(x_train['review']).toarray()
x_test=cv.transform(x_test['review']).toarray()

In [39]:
x_train.shape

(39665, 10000)

# Applying NaiveBayes

In [40]:
from sklearn.naive_bayes import GaussianNB
gnb=GaussianNB()
gnb.fit(x_train,y_train)

In [41]:
y_pred=gnb.predict(x_test)

In [42]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [43]:
accuracy_score(y_test,y_pred)

0.7354038519713623

In [44]:
confusion_matrix(y_test,y_pred)

array([[4276,  664],
       [1960, 3017]])

In [45]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)
accuracy_score(y_test,y_pred)

0.8426943632146818

In [46]:
confusion_matrix(y_test,y_pred)

array([[4152,  788],
       [ 772, 4205]])

# N_Grams

In [47]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=3,stratify=y)

In [48]:
cv=CountVectorizer(ngram_range=(1,2),max_features=10000)


In [49]:
x_train=cv.fit_transform(x_train['review']).toarray()
x_test=cv.transform(x_test['review']).toarray()

In [50]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)
accuracy_score(y_test,y_pred)

0.846324493294343

In [51]:
confusion_matrix(y_test,y_pred)

array([[4178,  762],
       [ 762, 4215]])

# Saving and Loading

In [60]:
import pickle

# save the iris classification model as a pickle file
model_pkl_file = "Sentimental_Analysis1.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(rf, file)

In [61]:
with open(model_pkl_file, 'rb') as file:  
    rf = pickle.load(file)
y_pred=rf.predict(x_test)
accuracy_score(y_test,y_pred)

0.844711102147827

In [54]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=3,stratify=y)

# TF_IDF

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
tfidf=TfidfVectorizer(max_features=10000)

In [57]:
x_train=tfidf.fit_transform(x_train['review']).toarray()
x_test=tfidf.transform(x_test['review'])

In [58]:
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)
accuracy_score(y_test,y_pred)

0.844711102147827

In [59]:
confusion_matrix(y_test,y_pred)

array([[4182,  758],
       [ 782, 4195]])

In [62]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=3,stratify=y)

# Word2Vec

In [64]:
import gensim

In [65]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [66]:
story=[]
for doc in df['review']:
    raw_sent=sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [67]:
model=gensim.models.Word2Vec(
window=10,min_count=2)

In [68]:
model.build_vocab(story)

In [69]:
model.train(story,total_examples=model.corpus_count,epochs=model.epochs)

(28382867, 30062525)

In [70]:
len(model.wv.index_to_key)

79870

In [71]:
def dec_vector(doc):
    doc=[word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc],axis=0)

In [72]:
from tqdm import tqdm

In [74]:
X=[]
for doc in tqdm(df['review'].values):
    X.append(dec_vector(doc))
                

100%|██████████| 49582/49582 [19:48<00:00, 41.72it/s]


In [75]:
X=np.array(X)

In [76]:
X.shape

(49582, 100)

In [77]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [78]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=3,stratify=y)

In [79]:
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)
accuracy_score(y_test,y_pred)

0.8395684178683069

In [80]:
confusion_matrix(y_test,y_pred)

array([[4011,  929],
       [ 662, 4315]])

In [81]:
model_pkl_file = "Sentimental_Analysis_Word2Vec.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(rf, file)