{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:16.293152Z",
"iopub.status.busy": "2024-05-28T14:52:16.292540Z",
"iopub.status.idle": "2024-05-28T14:52:17.111429Z",
"shell.execute_reply": "2024-05-28T14:52:17.110616Z",
"shell.execute_reply.started": "2024-05-28T14:52:16.293118Z"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:17.113325Z",
"iopub.status.busy": "2024-05-28T14:52:17.112978Z",
"iopub.status.idle": "2024-05-28T14:52:18.294078Z",
"shell.execute_reply": "2024-05-28T14:52:18.293195Z",
"shell.execute_reply.started": "2024-05-28T14:52:17.113302Z"
}
},
"outputs": [],
"source": [
"df=pd.read_csv(\"/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:25:43.527134Z",
"iopub.status.busy": "2024-05-28T14:25:43.526863Z",
"iopub.status.idle": "2024-05-28T14:25:43.531421Z",
"shell.execute_reply": "2024-05-28T14:25:43.530388Z",
"shell.execute_reply.started": "2024-05-28T14:25:43.527112Z"
}
},
"source": [
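"*Optional (currently disabled):* to iterate faster on a subset, run `df = df.head(10000)` in a code cell before the steps below."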
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:18.295520Z",
"iopub.status.busy": "2024-05-28T14:52:18.295184Z",
"iopub.status.idle": "2024-05-28T14:52:18.320913Z",
"shell.execute_reply": "2024-05-28T14:52:18.320001Z",
"shell.execute_reply.started": "2024-05-28T14:52:18.295492Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"sentiment\n",
"positive 25000\n",
"negative 25000\n",
"Name: count, dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['sentiment'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:18.324117Z",
"iopub.status.busy": "2024-05-28T14:52:18.323673Z",
"iopub.status.idle": "2024-05-28T14:52:18.340689Z",
"shell.execute_reply": "2024-05-28T14:52:18.339777Z",
"shell.execute_reply.started": "2024-05-28T14:52:18.324090Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"review 0\n",
"sentiment 0\n",
"dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:18.342289Z",
"iopub.status.busy": "2024-05-28T14:52:18.341941Z",
"iopub.status.idle": "2024-05-28T14:52:18.511426Z",
"shell.execute_reply": "2024-05-28T14:52:18.510510Z",
"shell.execute_reply.started": "2024-05-28T14:52:18.342257Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"418"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.duplicated().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:18.513001Z",
"iopub.status.busy": "2024-05-28T14:52:18.512633Z",
"iopub.status.idle": "2024-05-28T14:52:18.662976Z",
"shell.execute_reply": "2024-05-28T14:52:18.662258Z",
"shell.execute_reply.started": "2024-05-28T14:52:18.512969Z"
}
},
"outputs": [],
"source": [
"df=df.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:18.664321Z",
"iopub.status.busy": "2024-05-28T14:52:18.664037Z",
"iopub.status.idle": "2024-05-28T14:52:18.811830Z",
"shell.execute_reply": "2024-05-28T14:52:18.810966Z",
"shell.execute_reply.started": "2024-05-28T14:52:18.664297Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.duplicated().sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# **Removing HTML Tags**"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:18.813219Z",
"iopub.status.busy": "2024-05-28T14:52:18.812944Z",
"iopub.status.idle": "2024-05-28T14:52:18.817734Z",
"shell.execute_reply": "2024-05-28T14:52:18.816871Z",
"shell.execute_reply.started": "2024-05-28T14:52:18.813195Z"
}
},
"outputs": [],
"source": [
"import re\n",
"\n",
"# Compiled once at definition time; DOTALL lets a tag that spans a line break still match.\n",
"_TAG_RE = re.compile(r'<.*?>', re.DOTALL)\n",
"\n",
"def remove_tags(text):\n",
"    \"\"\"Replace every HTML tag in `text` with a single space.\n",
"\n",
"    A space (rather than an empty string) is substituted so that words separated\n",
"    only by markup, e.g. 'great.<br /><br />But', are not glued together into a\n",
"    single token. The extra whitespace is harmless because the later cleaning\n",
"    steps split on whitespace and re-join with single spaces.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    text : str\n",
"        Raw review text, possibly containing HTML markup.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        The text with each '<...>' span replaced by one space.\n",
"    \"\"\"\n",
"    return _TAG_RE.sub(' ', text)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:18.819205Z",
"iopub.status.busy": "2024-05-28T14:52:18.818864Z",
"iopub.status.idle": "2024-05-28T14:52:19.094654Z",
"shell.execute_reply": "2024-05-28T14:52:19.093668Z",
"shell.execute_reply.started": "2024-05-28T14:52:18.819173Z"
}
},
"outputs": [],
"source": [
"df['review']=df['review'].apply(remove_tags)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:19.099305Z",
"iopub.status.busy": "2024-05-28T14:52:19.099017Z",
"iopub.status.idle": "2024-05-28T14:52:19.112472Z",
"shell.execute_reply": "2024-05-28T14:52:19.111531Z",
"shell.execute_reply.started": "2024-05-28T14:52:19.099280Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" review | \n",
" sentiment | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" One of the other reviewers has mentioned that ... | \n",
" positive | \n",
"
\n",
" \n",
" | 1 | \n",
" A wonderful little production. The filming tec... | \n",
" positive | \n",
"
\n",
" \n",
" | 2 | \n",
" I thought this was a wonderful way to spend ti... | \n",
" positive | \n",
"
\n",
" \n",
" | 3 | \n",
" Basically there's a family where a little boy ... | \n",
" negative | \n",
"
\n",
" \n",
" | 4 | \n",
" Petter Mattei's \"Love in the Time of Money\" is... | \n",
" positive | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" review sentiment\n",
"0 One of the other reviewers has mentioned that ... positive\n",
"1 A wonderful little production. The filming tec... positive\n",
"2 I thought this was a wonderful way to spend ti... positive\n",
"3 Basically there's a family where a little boy ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... positive"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# **Lowercase**"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:19.113977Z",
"iopub.status.busy": "2024-05-28T14:52:19.113619Z",
"iopub.status.idle": "2024-05-28T14:52:19.299489Z",
"shell.execute_reply": "2024-05-28T14:52:19.298706Z",
"shell.execute_reply.started": "2024-05-28T14:52:19.113944Z"
}
},
"outputs": [],
"source": [
"# Lower-case every review using pandas' vectorised string accessor.\n",
"df['review']=df['review'].str.lower()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:19.300853Z",
"iopub.status.busy": "2024-05-28T14:52:19.300570Z",
"iopub.status.idle": "2024-05-28T14:52:19.311359Z",
"shell.execute_reply": "2024-05-28T14:52:19.310381Z",
"shell.execute_reply.started": "2024-05-28T14:52:19.300827Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" review | \n",
" sentiment | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" one of the other reviewers has mentioned that ... | \n",
" positive | \n",
"
\n",
" \n",
" | 1 | \n",
" a wonderful little production. the filming tec... | \n",
" positive | \n",
"
\n",
" \n",
" | 2 | \n",
" i thought this was a wonderful way to spend ti... | \n",
" positive | \n",
"
\n",
" \n",
" | 3 | \n",
" basically there's a family where a little boy ... | \n",
" negative | \n",
"
\n",
" \n",
" | 4 | \n",
" petter mattei's \"love in the time of money\" is... | \n",
" positive | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" review sentiment\n",
"0 one of the other reviewers has mentioned that ... positive\n",
"1 a wonderful little production. the filming tec... positive\n",
"2 i thought this was a wonderful way to spend ti... positive\n",
"3 basically there's a family where a little boy ... negative\n",
"4 petter mattei's \"love in the time of money\" is... positive"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# **Removing Stopwords**"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:19.313147Z",
"iopub.status.busy": "2024-05-28T14:52:19.312754Z",
"iopub.status.idle": "2024-05-28T14:52:20.687218Z",
"shell.execute_reply": "2024-05-28T14:52:20.686234Z",
"shell.execute_reply.started": "2024-05-28T14:52:19.313113Z"
}
},
"outputs": [],
"source": [
"from nltk.corpus import stopwords"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:20.688722Z",
"iopub.status.busy": "2024-05-28T14:52:20.688422Z",
"iopub.status.idle": "2024-05-28T14:52:20.695776Z",
"shell.execute_reply": "2024-05-28T14:52:20.694869Z",
"shell.execute_reply.started": "2024-05-28T14:52:20.688690Z"
}
},
"outputs": [],
"source": [
"sw_list=stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:20.697662Z",
"iopub.status.busy": "2024-05-28T14:52:20.696910Z",
"iopub.status.idle": "2024-05-28T14:52:41.103845Z",
"shell.execute_reply": "2024-05-28T14:52:41.103006Z",
"shell.execute_reply.started": "2024-05-28T14:52:20.697626Z"
}
},
"outputs": [],
"source": [
"# Build the lookup set once: testing membership against the list is a linear scan per token,\n",
"# while a set lookup is constant time. The filtered tokens are re-joined with single spaces.\n",
"sw_set=set(sw_list)\n",
"df['review']=df['review'].apply(lambda x:\" \".join(item for item in x.split() if item not in sw_set))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:41.105206Z",
"iopub.status.busy": "2024-05-28T14:52:41.104945Z",
"iopub.status.idle": "2024-05-28T14:52:41.115077Z",
"shell.execute_reply": "2024-05-28T14:52:41.114149Z",
"shell.execute_reply.started": "2024-05-28T14:52:41.105183Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" review | \n",
" sentiment | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" one reviewers mentioned watching 1 oz episode ... | \n",
" positive | \n",
"
\n",
" \n",
" | 1 | \n",
" wonderful little production. filming technique... | \n",
" positive | \n",
"
\n",
" \n",
" | 2 | \n",
" thought wonderful way spend time hot summer we... | \n",
" positive | \n",
"
\n",
" \n",
" | 3 | \n",
" basically there's family little boy (jake) thi... | \n",
" negative | \n",
"
\n",
" \n",
" | 4 | \n",
" petter mattei's \"love time money\" visually stu... | \n",
" positive | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" review sentiment\n",
"0 one reviewers mentioned watching 1 oz episode ... positive\n",
"1 wonderful little production. filming technique... positive\n",
"2 thought wonderful way spend time hot summer we... positive\n",
"3 basically there's family little boy (jake) thi... negative\n",
"4 petter mattei's \"love time money\" visually stu... positive"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# **Removing Numbers**"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:41.116693Z",
"iopub.status.busy": "2024-05-28T14:52:41.116370Z",
"iopub.status.idle": "2024-05-28T14:52:42.350128Z",
"shell.execute_reply": "2024-05-28T14:52:42.349082Z",
"shell.execute_reply.started": "2024-05-28T14:52:41.116666Z"
}
},
"outputs": [],
"source": [
"# Keep every whitespace-separated token except those made up solely of digits;\n",
"# tokens that mix digits with other characters are left untouched.\n",
"df['review']=df['review'].apply(lambda text:' '.join(filter(lambda tok:not tok.isdigit(),text.split())))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:42.352200Z",
"iopub.status.busy": "2024-05-28T14:52:42.351476Z",
"iopub.status.idle": "2024-05-28T14:52:42.361695Z",
"shell.execute_reply": "2024-05-28T14:52:42.360686Z",
"shell.execute_reply.started": "2024-05-28T14:52:42.352154Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" review | \n",
" sentiment | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" one reviewers mentioned watching oz episode ho... | \n",
" positive | \n",
"
\n",
" \n",
" | 1 | \n",
" wonderful little production. filming technique... | \n",
" positive | \n",
"
\n",
" \n",
" | 2 | \n",
" thought wonderful way spend time hot summer we... | \n",
" positive | \n",
"
\n",
" \n",
" | 3 | \n",
" basically there's family little boy (jake) thi... | \n",
" negative | \n",
"
\n",
" \n",
" | 4 | \n",
" petter mattei's \"love time money\" visually stu... | \n",
" positive | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" review sentiment\n",
"0 one reviewers mentioned watching oz episode ho... positive\n",
"1 wonderful little production. filming technique... positive\n",
"2 thought wonderful way spend time hot summer we... positive\n",
"3 basically there's family little boy (jake) thi... negative\n",
"4 petter mattei's \"love time money\" visually stu... positive"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# **Removing Punctuation**"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:42.363205Z",
"iopub.status.busy": "2024-05-28T14:52:42.362921Z",
"iopub.status.idle": "2024-05-28T14:52:42.369773Z",
"shell.execute_reply": "2024-05-28T14:52:42.368884Z",
"shell.execute_reply.started": "2024-05-28T14:52:42.363181Z"
}
},
"outputs": [],
"source": [
"import string\n",
"PUNCT_TO_REMOVE = string.punctuation\n",
"# Translation table built once instead of on every call; it maps each character\n",
"# in PUNCT_TO_REMOVE to None, which str.translate treats as 'delete'.\n",
"_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)\n",
"\n",
"def remove_punctuation(text):\n",
"    \"\"\"Return `text` with every character in `string.punctuation` deleted.\n",
"\n",
"    Characters are removed, not replaced by a space, so 'well-made' becomes\n",
"    'wellmade'.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    text : str\n",
"        Text to strip of ASCII punctuation.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        The text without punctuation characters.\n",
"    \"\"\"\n",
"    return text.translate(_PUNCT_TABLE)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:42.371097Z",
"iopub.status.busy": "2024-05-28T14:52:42.370761Z",
"iopub.status.idle": "2024-05-28T14:52:43.206495Z",
"shell.execute_reply": "2024-05-28T14:52:43.205666Z",
"shell.execute_reply.started": "2024-05-28T14:52:42.371071Z"
}
},
"outputs": [],
"source": [
"df['review']=df['review'].apply(remove_punctuation)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:43.207858Z",
"iopub.status.busy": "2024-05-28T14:52:43.207590Z",
"iopub.status.idle": "2024-05-28T14:52:43.217082Z",
"shell.execute_reply": "2024-05-28T14:52:43.216131Z",
"shell.execute_reply.started": "2024-05-28T14:52:43.207835Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" review | \n",
" sentiment | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" one reviewers mentioned watching oz episode ho... | \n",
" positive | \n",
"
\n",
" \n",
" | 1 | \n",
" wonderful little production filming technique ... | \n",
" positive | \n",
"
\n",
" \n",
" | 2 | \n",
" thought wonderful way spend time hot summer we... | \n",
" positive | \n",
"
\n",
" \n",
" | 3 | \n",
" basically theres family little boy jake thinks... | \n",
" negative | \n",
"
\n",
" \n",
" | 4 | \n",
" petter matteis love time money visually stunni... | \n",
" positive | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" review sentiment\n",
"0 one reviewers mentioned watching oz episode ho... positive\n",
"1 wonderful little production filming technique ... positive\n",
"2 thought wonderful way spend time hot summer we... positive\n",
"3 basically theres family little boy jake thinks... negative\n",
"4 petter matteis love time money visually stunni... positive"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# **Expanding Contractions**"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:43.218467Z",
"iopub.status.busy": "2024-05-28T14:52:43.218186Z",
"iopub.status.idle": "2024-05-28T14:52:56.917289Z",
"shell.execute_reply": "2024-05-28T14:52:56.916286Z",
"shell.execute_reply.started": "2024-05-28T14:52:43.218441Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: contractions in /opt/conda/lib/python3.10/site-packages (0.1.73)\n",
"Requirement already satisfied: textsearch>=0.0.21 in /opt/conda/lib/python3.10/site-packages (from contractions) (0.0.24)\n",
"Requirement already satisfied: anyascii in /opt/conda/lib/python3.10/site-packages (from textsearch>=0.0.21->contractions) (0.3.2)\n",
"Requirement already satisfied: pyahocorasick in /opt/conda/lib/python3.10/site-packages (from textsearch>=0.0.21->contractions) (2.1.0)\n"
]
}
],
"source": [
"%pip install -q contractions==0.1.73"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:56.919161Z",
"iopub.status.busy": "2024-05-28T14:52:56.918839Z",
"iopub.status.idle": "2024-05-28T14:52:56.944515Z",
"shell.execute_reply": "2024-05-28T14:52:56.943830Z",
"shell.execute_reply.started": "2024-05-28T14:52:56.919130Z"
}
},
"outputs": [],
"source": [
"import contractions\n",
"def remove_contractions(text):\n",
" \"\"\"Return `text` after passing it through `contractions.fix`.\n",
"\n",
" Despite the name, nothing is removed: in this notebook's recorded output\n",
" 'theres' comes back as 'there is', i.e. contractions are expanded.\n",
" \"\"\"\n",
" return contractions.fix(text)\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:52:56.945783Z",
"iopub.status.busy": "2024-05-28T14:52:56.945518Z",
"iopub.status.idle": "2024-05-28T14:53:00.977406Z",
"shell.execute_reply": "2024-05-28T14:53:00.976592Z",
"shell.execute_reply.started": "2024-05-28T14:52:56.945760Z"
}
},
"outputs": [],
"source": [
"df['review']=df['review'].apply(remove_contractions)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:00.978856Z",
"iopub.status.busy": "2024-05-28T14:53:00.978551Z",
"iopub.status.idle": "2024-05-28T14:53:00.991438Z",
"shell.execute_reply": "2024-05-28T14:53:00.990549Z",
"shell.execute_reply.started": "2024-05-28T14:53:00.978830Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" review | \n",
" sentiment | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" one reviewers mentioned watching oz episode ho... | \n",
" positive | \n",
"
\n",
" \n",
" | 1 | \n",
" wonderful little production filming technique ... | \n",
" positive | \n",
"
\n",
" \n",
" | 2 | \n",
" thought wonderful way spend time hot summer we... | \n",
" positive | \n",
"
\n",
" \n",
" | 3 | \n",
" basically there is family little boy jake thin... | \n",
" negative | \n",
"
\n",
" \n",
" | 4 | \n",
" petter matteis love time money visually stunni... | \n",
" positive | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 49995 | \n",
" thought movie right good job creative original... | \n",
" positive | \n",
"
\n",
" \n",
" | 49996 | \n",
" bad plot bad dialogue bad acting idiotic direc... | \n",
" negative | \n",
"
\n",
" \n",
" | 49997 | \n",
" catholic taught parochial elementary schools n... | \n",
" negative | \n",
"
\n",
" \n",
" | 49998 | \n",
" i am going disagree previous comment side malt... | \n",
" negative | \n",
"
\n",
" \n",
" | 49999 | \n",
" one expects star trek movies high art fans exp... | \n",
" negative | \n",
"
\n",
" \n",
"
\n",
"
49582 rows × 2 columns
\n",
"
"
],
"text/plain": [
" review sentiment\n",
"0 one reviewers mentioned watching oz episode ho... positive\n",
"1 wonderful little production filming technique ... positive\n",
"2 thought wonderful way spend time hot summer we... positive\n",
"3 basically there is family little boy jake thin... negative\n",
"4 petter matteis love time money visually stunni... positive\n",
"... ... ...\n",
"49995 thought movie right good job creative original... positive\n",
"49996 bad plot bad dialogue bad acting idiotic direc... negative\n",
"49997 catholic taught parochial elementary schools n... negative\n",
"49998 i am going disagree previous comment side malt... negative\n",
"49999 one expects star trek movies high art fans exp... negative\n",
"\n",
"[49582 rows x 2 columns]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:00.993229Z",
"iopub.status.busy": "2024-05-28T14:53:00.992777Z",
"iopub.status.idle": "2024-05-28T14:53:01.001576Z",
"shell.execute_reply": "2024-05-28T14:53:01.000876Z",
"shell.execute_reply.started": "2024-05-28T14:53:00.993151Z"
}
},
"outputs": [],
"source": [
"x=df.drop(columns='sentiment')\n",
"y=df['sentiment']"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:01.003284Z",
"iopub.status.busy": "2024-05-28T14:53:01.002910Z",
"iopub.status.idle": "2024-05-28T14:53:01.014799Z",
"shell.execute_reply": "2024-05-28T14:53:01.013876Z",
"shell.execute_reply.started": "2024-05-28T14:53:01.003260Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" review | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" one reviewers mentioned watching oz episode ho... | \n",
"
\n",
" \n",
" | 1 | \n",
" wonderful little production filming technique ... | \n",
"
\n",
" \n",
" | 2 | \n",
" thought wonderful way spend time hot summer we... | \n",
"
\n",
" \n",
" | 3 | \n",
" basically there is family little boy jake thin... | \n",
"
\n",
" \n",
" | 4 | \n",
" petter matteis love time money visually stunni... | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
"
\n",
" \n",
" | 49995 | \n",
" thought movie right good job creative original... | \n",
"
\n",
" \n",
" | 49996 | \n",
" bad plot bad dialogue bad acting idiotic direc... | \n",
"
\n",
" \n",
" | 49997 | \n",
" catholic taught parochial elementary schools n... | \n",
"
\n",
" \n",
" | 49998 | \n",
" i am going disagree previous comment side malt... | \n",
"
\n",
" \n",
" | 49999 | \n",
" one expects star trek movies high art fans exp... | \n",
"
\n",
" \n",
"
\n",
"
49582 rows × 1 columns
\n",
"
"
],
"text/plain": [
" review\n",
"0 one reviewers mentioned watching oz episode ho...\n",
"1 wonderful little production filming technique ...\n",
"2 thought wonderful way spend time hot summer we...\n",
"3 basically there is family little boy jake thin...\n",
"4 petter matteis love time money visually stunni...\n",
"... ...\n",
"49995 thought movie right good job creative original...\n",
"49996 bad plot bad dialogue bad acting idiotic direc...\n",
"49997 catholic taught parochial elementary schools n...\n",
"49998 i am going disagree previous comment side malt...\n",
"49999 one expects star trek movies high art fans exp...\n",
"\n",
"[49582 rows x 1 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:01.022597Z",
"iopub.status.busy": "2024-05-28T14:53:01.022330Z",
"iopub.status.idle": "2024-05-28T14:53:01.030503Z",
"shell.execute_reply": "2024-05-28T14:53:01.029518Z",
"shell.execute_reply.started": "2024-05-28T14:53:01.022574Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0 positive\n",
"1 positive\n",
"2 positive\n",
"3 negative\n",
"4 positive\n",
" ... \n",
"49995 positive\n",
"49996 negative\n",
"49997 negative\n",
"49998 negative\n",
"49999 negative\n",
"Name: sentiment, Length: 49582, dtype: object"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:01.032109Z",
"iopub.status.busy": "2024-05-28T14:53:01.031731Z",
"iopub.status.idle": "2024-05-28T14:53:01.037243Z",
"shell.execute_reply": "2024-05-28T14:53:01.036374Z",
"shell.execute_reply.started": "2024-05-28T14:53:01.032084Z"
}
},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelEncoder"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:01.038703Z",
"iopub.status.busy": "2024-05-28T14:53:01.038403Z",
"iopub.status.idle": "2024-05-28T14:53:01.057333Z",
"shell.execute_reply": "2024-05-28T14:53:01.056511Z",
"shell.execute_reply.started": "2024-05-28T14:53:01.038679Z"
}
},
"outputs": [],
"source": [
"# Keep the fitted encoder so integer predictions can be mapped back to the original\n",
"# labels with label_encoder.inverse_transform (classes are listed in label_encoder.classes_).\n",
"label_encoder=LabelEncoder()\n",
"y=label_encoder.fit_transform(y)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:01.058600Z",
"iopub.status.busy": "2024-05-28T14:53:01.058345Z",
"iopub.status.idle": "2024-05-28T14:53:01.067016Z",
"shell.execute_reply": "2024-05-28T14:53:01.066135Z",
"shell.execute_reply.started": "2024-05-28T14:53:01.058579Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 1, 1, ..., 0, 0, 0])"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:01.068374Z",
"iopub.status.busy": "2024-05-28T14:53:01.068086Z",
"iopub.status.idle": "2024-05-28T14:53:01.099626Z",
"shell.execute_reply": "2024-05-28T14:53:01.098771Z",
"shell.execute_reply.started": "2024-05-28T14:53:01.068341Z"
}
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=3,stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:01.101218Z",
"iopub.status.busy": "2024-05-28T14:53:01.100870Z",
"iopub.status.idle": "2024-05-28T14:53:01.106109Z",
"shell.execute_reply": "2024-05-28T14:53:01.105161Z",
"shell.execute_reply.started": "2024-05-28T14:53:01.101184Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(39665, 1) (9917, 1)\n"
]
}
],
"source": [
"print(x_train.shape,x_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Bag of Words"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:01.107866Z",
"iopub.status.busy": "2024-05-28T14:53:01.107441Z",
"iopub.status.idle": "2024-05-28T14:53:01.113900Z",
"shell.execute_reply": "2024-05-28T14:53:01.112939Z",
"shell.execute_reply.started": "2024-05-28T14:53:01.107829Z"
}
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:01.115218Z",
"iopub.status.busy": "2024-05-28T14:53:01.114952Z",
"iopub.status.idle": "2024-05-28T14:53:01.121742Z",
"shell.execute_reply": "2024-05-28T14:53:01.120751Z",
"shell.execute_reply.started": "2024-05-28T14:53:01.115195Z"
}
},
"outputs": [],
"source": [
"cv=CountVectorizer(max_features=10000)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:01.123124Z",
"iopub.status.busy": "2024-05-28T14:53:01.122830Z",
"iopub.status.idle": "2024-05-28T14:53:01.134108Z",
"shell.execute_reply": "2024-05-28T14:53:01.133166Z",
"shell.execute_reply.started": "2024-05-28T14:53:01.123101Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" review | \n",
"
\n",
" \n",
" \n",
" \n",
" | 17185 | \n",
" watching avalon which decent nice digital fx s... | \n",
"
\n",
" \n",
" | 12989 | \n",
" rarely denzil washington make bad movie come t... | \n",
"
\n",
" \n",
" | 31628 | \n",
" think movie reasonbaly good kind of weird olse... | \n",
"
\n",
" \n",
" | 12399 | \n",
" movie is horrible wonderful time first saw yea... | \n",
"
\n",
" \n",
" | 33230 | \n",
" watching the bodyguard last night felt compell... | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
"
\n",
" \n",
" | 31515 | \n",
" good cast with one major exception pushes way ... | \n",
"
\n",
" \n",
" | 19133 | \n",
" seldom see short comments written imdb filmgoe... | \n",
"
\n",
" \n",
" | 47930 | \n",
" say without shadow doubt going overboard singl... | \n",
"
\n",
" \n",
" | 35145 | \n",
" wife watched dvring encore action past week wo... | \n",
"
\n",
" \n",
" | 32654 | \n",
" pokemon little three four episodes tv series s... | \n",
"
\n",
" \n",
"
\n",
"
39665 rows × 1 columns
\n",
"
"
],
"text/plain": [
" review\n",
"17185 watching avalon which decent nice digital fx s...\n",
"12989 rarely denzil washington make bad movie come t...\n",
"31628 think movie reasonbaly good kind of weird olse...\n",
"12399 movie is horrible wonderful time first saw yea...\n",
"33230 watching the bodyguard last night felt compell...\n",
"... ...\n",
"31515 good cast with one major exception pushes way ...\n",
"19133 seldom see short comments written imdb filmgoe...\n",
"47930 say without shadow doubt going overboard singl...\n",
"35145 wife watched dvring encore action past week wo...\n",
"32654 pokemon little three four episodes tv series s...\n",
"\n",
"[39665 rows x 1 columns]"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x_train"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:01.135399Z",
"iopub.status.busy": "2024-05-28T14:53:01.135133Z",
"iopub.status.idle": "2024-05-28T14:53:10.557535Z",
"shell.execute_reply": "2024-05-28T14:53:10.556708Z",
"shell.execute_reply.started": "2024-05-28T14:53:01.135377Z"
}
},
"outputs": [],
"source": [
"# The vocabulary is fitted on the training reviews only and then reused for the test reviews.\n",
"# NOTE: x_train/x_test are rebound from DataFrames to arrays here, so re-running this cell\n",
"# without first re-running the train/test split fails on x_train['review'].\n",
"# .toarray() materialises dense matrices (x_train is 39665 x 10000 in the recorded run).\n",
"x_train=cv.fit_transform(x_train['review']).toarray()\n",
"x_test=cv.transform(x_test['review']).toarray()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:10.559169Z",
"iopub.status.busy": "2024-05-28T14:53:10.558796Z",
"iopub.status.idle": "2024-05-28T14:53:10.565563Z",
"shell.execute_reply": "2024-05-28T14:53:10.564604Z",
"shell.execute_reply.started": "2024-05-28T14:53:10.559135Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(39665, 10000)"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x_train.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Applying Naive Bayes"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:10.567583Z",
"iopub.status.busy": "2024-05-28T14:53:10.566619Z",
"iopub.status.idle": "2024-05-28T14:53:16.857125Z",
"shell.execute_reply": "2024-05-28T14:53:16.856146Z",
"shell.execute_reply.started": "2024-05-28T14:53:10.567557Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"GaussianNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
],
"text/plain": [
"GaussianNB()"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"# NOTE(review): the features are bag-of-words counts from CountVectorizer; GaussianNB fits a\n",
"# per-feature Gaussian, whereas MultinomialNB is the variant intended for count features.\n",
"# Worth comparing the two here — TODO confirm.\n",
"gnb=GaussianNB()\n",
"gnb.fit(x_train,y_train)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:16.860063Z",
"iopub.status.busy": "2024-05-28T14:53:16.858411Z",
"iopub.status.idle": "2024-05-28T14:53:18.286607Z",
"shell.execute_reply": "2024-05-28T14:53:18.285709Z",
"shell.execute_reply.started": "2024-05-28T14:53:16.860034Z"
}
},
"outputs": [],
"source": [
"y_pred=gnb.predict(x_test)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:18.288847Z",
"iopub.status.busy": "2024-05-28T14:53:18.288416Z",
"iopub.status.idle": "2024-05-28T14:53:18.293343Z",
"shell.execute_reply": "2024-05-28T14:53:18.292339Z",
"shell.execute_reply.started": "2024-05-28T14:53:18.288789Z"
}
},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score,confusion_matrix"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:18.295253Z",
"iopub.status.busy": "2024-05-28T14:53:18.294781Z",
"iopub.status.idle": "2024-05-28T14:53:18.305801Z",
"shell.execute_reply": "2024-05-28T14:53:18.304884Z",
"shell.execute_reply.started": "2024-05-28T14:53:18.295215Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.7354038519713623"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracy_score(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:18.307732Z",
"iopub.status.busy": "2024-05-28T14:53:18.307121Z",
"iopub.status.idle": "2024-05-28T14:53:18.316221Z",
"shell.execute_reply": "2024-05-28T14:53:18.315238Z",
"shell.execute_reply.started": "2024-05-28T14:53:18.307697Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[4276, 664],\n",
" [1960, 3017]])"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"confusion_matrix(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:53:18.317650Z",
"iopub.status.busy": "2024-05-28T14:53:18.317380Z",
"iopub.status.idle": "2024-05-28T14:55:29.097680Z",
"shell.execute_reply": "2024-05-28T14:55:29.096708Z",
"shell.execute_reply.started": "2024-05-28T14:53:18.317628Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.8426943632146818"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"rf=RandomForestClassifier()\n",
"rf.fit(x_train,y_train)\n",
"y_pred=rf.predict(x_test)\n",
"accuracy_score(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:55:29.099391Z",
"iopub.status.busy": "2024-05-28T14:55:29.099103Z",
"iopub.status.idle": "2024-05-28T14:55:29.108811Z",
"shell.execute_reply": "2024-05-28T14:55:29.107863Z",
"shell.execute_reply.started": "2024-05-28T14:55:29.099364Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[4152, 788],\n",
" [ 772, 4205]])"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"confusion_matrix(y_test,y_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# N_Grams"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:55:29.110803Z",
"iopub.status.busy": "2024-05-28T14:55:29.110043Z",
"iopub.status.idle": "2024-05-28T14:55:29.372362Z",
"shell.execute_reply": "2024-05-28T14:55:29.371484Z",
"shell.execute_reply.started": "2024-05-28T14:55:29.110765Z"
}
},
"outputs": [],
"source": [
"x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=3,stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:55:29.373855Z",
"iopub.status.busy": "2024-05-28T14:55:29.373553Z",
"iopub.status.idle": "2024-05-28T14:55:29.393260Z",
"shell.execute_reply": "2024-05-28T14:55:29.392224Z",
"shell.execute_reply.started": "2024-05-28T14:55:29.373828Z"
}
},
"outputs": [],
"source": [
"cv=CountVectorizer(ngram_range=(1,2),max_features=10000)\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:55:29.394786Z",
"iopub.status.busy": "2024-05-28T14:55:29.394486Z",
"iopub.status.idle": "2024-05-28T14:56:00.578967Z",
"shell.execute_reply": "2024-05-28T14:56:00.577883Z",
"shell.execute_reply.started": "2024-05-28T14:55:29.394758Z"
}
},
"outputs": [],
"source": [
"x_train=cv.fit_transform(x_train['review']).toarray()\n",
"x_test=cv.transform(x_test['review']).toarray()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:56:00.580808Z",
"iopub.status.busy": "2024-05-28T14:56:00.580266Z",
"iopub.status.idle": "2024-05-28T14:58:18.070996Z",
"shell.execute_reply": "2024-05-28T14:58:18.069821Z",
"shell.execute_reply.started": "2024-05-28T14:56:00.580771Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.846324493294343"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"rf=RandomForestClassifier()\n",
"rf.fit(x_train,y_train)\n",
"y_pred=rf.predict(x_test)\n",
"accuracy_score(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:58:18.072639Z",
"iopub.status.busy": "2024-05-28T14:58:18.072319Z",
"iopub.status.idle": "2024-05-28T14:58:18.081205Z",
"shell.execute_reply": "2024-05-28T14:58:18.080365Z",
"shell.execute_reply.started": "2024-05-28T14:58:18.072613Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[4178, 762],\n",
" [ 762, 4215]])"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"confusion_matrix(y_test,y_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Saving and Loading"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:01:45.937561Z",
"iopub.status.busy": "2024-05-28T15:01:45.937238Z",
"iopub.status.idle": "2024-05-28T15:01:46.088033Z",
"shell.execute_reply": "2024-05-28T15:01:46.087204Z",
"shell.execute_reply.started": "2024-05-28T15:01:45.937533Z"
}
},
"outputs": [],
"source": [
"import pickle\n",
"\n",
"# save the iris classification model as a pickle file\n",
"model_pkl_file = \"Sentimental_Analysis1.pkl\" \n",
"\n",
"with open(model_pkl_file, 'wb') as file: \n",
" pickle.dump(rf, file)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:01:46.090237Z",
"iopub.status.busy": "2024-05-28T15:01:46.089930Z",
"iopub.status.idle": "2024-05-28T15:01:46.801994Z",
"shell.execute_reply": "2024-05-28T15:01:46.800807Z",
"shell.execute_reply.started": "2024-05-28T15:01:46.090212Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.844711102147827"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with open(model_pkl_file, 'rb') as file: \n",
" rf = pickle.load(file)\n",
"y_pred=rf.predict(x_test)\n",
"accuracy_score(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:58:19.511224Z",
"iopub.status.busy": "2024-05-28T14:58:19.510906Z",
"iopub.status.idle": "2024-05-28T14:58:19.785147Z",
"shell.execute_reply": "2024-05-28T14:58:19.784066Z",
"shell.execute_reply.started": "2024-05-28T14:58:19.511197Z"
}
},
"outputs": [],
"source": [
"x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=3,stratify=y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TF_IDF"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:58:19.787439Z",
"iopub.status.busy": "2024-05-28T14:58:19.786737Z",
"iopub.status.idle": "2024-05-28T14:58:19.792198Z",
"shell.execute_reply": "2024-05-28T14:58:19.791132Z",
"shell.execute_reply.started": "2024-05-28T14:58:19.787387Z"
}
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:58:53.029215Z",
"iopub.status.busy": "2024-05-28T14:58:53.028431Z",
"iopub.status.idle": "2024-05-28T14:58:53.033696Z",
"shell.execute_reply": "2024-05-28T14:58:53.032603Z",
"shell.execute_reply.started": "2024-05-28T14:58:53.029178Z"
}
},
"outputs": [],
"source": [
"tfidf=TfidfVectorizer(max_features=10000)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:58:58.538248Z",
"iopub.status.busy": "2024-05-28T14:58:58.537480Z",
"iopub.status.idle": "2024-05-28T14:59:09.408751Z",
"shell.execute_reply": "2024-05-28T14:59:09.407944Z",
"shell.execute_reply.started": "2024-05-28T14:58:58.538211Z"
}
},
"outputs": [],
"source": [
"x_train=tfidf.fit_transform(x_train['review']).toarray()\n",
"x_test=tfidf.transform(x_test['review'])"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T14:59:18.888515Z",
"iopub.status.busy": "2024-05-28T14:59:18.888049Z",
"iopub.status.idle": "2024-05-28T15:01:45.924455Z",
"shell.execute_reply": "2024-05-28T15:01:45.923366Z",
"shell.execute_reply.started": "2024-05-28T14:59:18.888481Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.844711102147827"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rf=RandomForestClassifier()\n",
"rf.fit(x_train,y_train)\n",
"y_pred=rf.predict(x_test)\n",
"accuracy_score(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:01:45.926453Z",
"iopub.status.busy": "2024-05-28T15:01:45.926143Z",
"iopub.status.idle": "2024-05-28T15:01:45.935972Z",
"shell.execute_reply": "2024-05-28T15:01:45.934872Z",
"shell.execute_reply.started": "2024-05-28T15:01:45.926419Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[4182, 758],\n",
" [ 782, 4195]])"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"confusion_matrix(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:01:46.804239Z",
"iopub.status.busy": "2024-05-28T15:01:46.803156Z",
"iopub.status.idle": "2024-05-28T15:01:47.032951Z",
"shell.execute_reply": "2024-05-28T15:01:47.031995Z",
"shell.execute_reply.started": "2024-05-28T15:01:46.804189Z"
}
},
"outputs": [],
"source": [
"x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=3,stratify=y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Word2Vec"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:04:53.247003Z",
"iopub.status.busy": "2024-05-28T15:04:53.246571Z",
"iopub.status.idle": "2024-05-28T15:05:04.199287Z",
"shell.execute_reply": "2024-05-28T15:05:04.198486Z",
"shell.execute_reply.started": "2024-05-28T15:04:53.246970Z"
}
},
"outputs": [],
"source": [
"import gensim"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:05:48.076456Z",
"iopub.status.busy": "2024-05-28T15:05:48.076082Z",
"iopub.status.idle": "2024-05-28T15:05:48.080852Z",
"shell.execute_reply": "2024-05-28T15:05:48.079878Z",
"shell.execute_reply.started": "2024-05-28T15:05:48.076427Z"
}
},
"outputs": [],
"source": [
"from nltk import sent_tokenize\n",
"from gensim.utils import simple_preprocess"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:07:37.532271Z",
"iopub.status.busy": "2024-05-28T15:07:37.531546Z",
"iopub.status.idle": "2024-05-28T15:08:02.872926Z",
"shell.execute_reply": "2024-05-28T15:08:02.871888Z",
"shell.execute_reply.started": "2024-05-28T15:07:37.532232Z"
}
},
"outputs": [],
"source": [
"story=[]\n",
"for doc in df['review']:\n",
" raw_sent=sent_tokenize(doc)\n",
" for sent in raw_sent:\n",
" story.append(simple_preprocess(sent))"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:08:40.812263Z",
"iopub.status.busy": "2024-05-28T15:08:40.811483Z",
"iopub.status.idle": "2024-05-28T15:08:40.817557Z",
"shell.execute_reply": "2024-05-28T15:08:40.816616Z",
"shell.execute_reply.started": "2024-05-28T15:08:40.812227Z"
}
},
"outputs": [],
"source": [
"model=gensim.models.Word2Vec(\n",
"window=10,min_count=2)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:09:01.856845Z",
"iopub.status.busy": "2024-05-28T15:09:01.855976Z",
"iopub.status.idle": "2024-05-28T15:09:05.537674Z",
"shell.execute_reply": "2024-05-28T15:09:05.536873Z",
"shell.execute_reply.started": "2024-05-28T15:09:01.856798Z"
}
},
"outputs": [],
"source": [
"model.build_vocab(story)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:10:20.091520Z",
"iopub.status.busy": "2024-05-28T15:10:20.091143Z",
"iopub.status.idle": "2024-05-28T15:10:51.764165Z",
"shell.execute_reply": "2024-05-28T15:10:51.763105Z",
"shell.execute_reply.started": "2024-05-28T15:10:20.091491Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(28382867, 30062525)"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.train(story,total_examples=model.corpus_count,epochs=model.epochs)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:11:03.211080Z",
"iopub.status.busy": "2024-05-28T15:11:03.210673Z",
"iopub.status.idle": "2024-05-28T15:11:03.218564Z",
"shell.execute_reply": "2024-05-28T15:11:03.217552Z",
"shell.execute_reply.started": "2024-05-28T15:11:03.211047Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"79870"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(model.wv.index_to_key)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:13:11.657877Z",
"iopub.status.busy": "2024-05-28T15:13:11.657479Z",
"iopub.status.idle": "2024-05-28T15:13:11.663513Z",
"shell.execute_reply": "2024-05-28T15:13:11.662556Z",
"shell.execute_reply.started": "2024-05-28T15:13:11.657844Z"
}
},
"outputs": [],
"source": [
"def dec_vector(doc):\n",
" doc=[word for word in doc.split() if word in model.wv.index_to_key]\n",
" return np.mean(model.wv[doc],axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:14:29.737526Z",
"iopub.status.busy": "2024-05-28T15:14:29.736457Z",
"iopub.status.idle": "2024-05-28T15:14:29.742036Z",
"shell.execute_reply": "2024-05-28T15:14:29.740881Z",
"shell.execute_reply.started": "2024-05-28T15:14:29.737484Z"
}
},
"outputs": [],
"source": [
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:16:04.216141Z",
"iopub.status.busy": "2024-05-28T15:16:04.215772Z",
"iopub.status.idle": "2024-05-28T15:35:52.614033Z",
"shell.execute_reply": "2024-05-28T15:35:52.613102Z",
"shell.execute_reply.started": "2024-05-28T15:16:04.216114Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 49582/49582 [19:48<00:00, 41.72it/s]\n"
]
}
],
"source": [
"X=[]\n",
"for doc in tqdm(df['review'].values):\n",
" X.append(dec_vector(doc))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:35:52.757355Z",
"iopub.status.busy": "2024-05-28T15:35:52.756711Z",
"iopub.status.idle": "2024-05-28T15:35:52.801878Z",
"shell.execute_reply": "2024-05-28T15:35:52.800886Z",
"shell.execute_reply.started": "2024-05-28T15:35:52.757317Z"
}
},
"outputs": [],
"source": [
"X=np.array(X)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:35:52.992514Z",
"iopub.status.busy": "2024-05-28T15:35:52.992157Z",
"iopub.status.idle": "2024-05-28T15:35:52.999577Z",
"shell.execute_reply": "2024-05-28T15:35:52.998462Z",
"shell.execute_reply.started": "2024-05-28T15:35:52.992480Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(49582, 100)"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:35:53.001378Z",
"iopub.status.busy": "2024-05-28T15:35:53.000874Z",
"iopub.status.idle": "2024-05-28T15:35:53.008752Z",
"shell.execute_reply": "2024-05-28T15:35:53.007881Z",
"shell.execute_reply.started": "2024-05-28T15:35:53.001350Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 1, 1, ..., 0, 0, 0])"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:35:53.010300Z",
"iopub.status.busy": "2024-05-28T15:35:53.009962Z",
"iopub.status.idle": "2024-05-28T15:35:53.046198Z",
"shell.execute_reply": "2024-05-28T15:35:53.045411Z",
"shell.execute_reply.started": "2024-05-28T15:35:53.010269Z"
}
},
"outputs": [],
"source": [
"x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=3,stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:35:53.047433Z",
"iopub.status.busy": "2024-05-28T15:35:53.047187Z",
"iopub.status.idle": "2024-05-28T15:36:36.307334Z",
"shell.execute_reply": "2024-05-28T15:36:36.306297Z",
"shell.execute_reply.started": "2024-05-28T15:35:53.047411Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.8395684178683069"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rf=RandomForestClassifier()\n",
"rf.fit(x_train,y_train)\n",
"y_pred=rf.predict(x_test)\n",
"accuracy_score(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:36:36.308713Z",
"iopub.status.busy": "2024-05-28T15:36:36.308416Z",
"iopub.status.idle": "2024-05-28T15:36:36.317103Z",
"shell.execute_reply": "2024-05-28T15:36:36.316187Z",
"shell.execute_reply.started": "2024-05-28T15:36:36.308682Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[4011, 929],\n",
" [ 662, 4315]])"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"confusion_matrix(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"execution": {
"iopub.execute_input": "2024-05-28T15:36:36.319780Z",
"iopub.status.busy": "2024-05-28T15:36:36.319506Z",
"iopub.status.idle": "2024-05-28T15:36:37.272618Z",
"shell.execute_reply": "2024-05-28T15:36:37.271741Z",
"shell.execute_reply.started": "2024-05-28T15:36:36.319756Z"
}
},
"outputs": [],
"source": [
"model_pkl_file = \"Sentimental_Analysis_Word2Vec.pkl\" \n",
"\n",
"with open(model_pkl_file, 'wb') as file: \n",
" pickle.dump(rf, file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kaggle": {
"accelerator": "nvidiaTeslaT4",
"dataSources": [
{
"datasetId": 134715,
"sourceId": 320111,
"sourceType": "datasetVersion"
}
],
"dockerImageVersionId": 30699,
"isGpuEnabled": true,
"isInternetEnabled": true,
"language": "python",
"sourceType": "notebook"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}