{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import re\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem.porter import PorterStemmer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\KIIT\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"nltk.download('stopwords')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n"
]
}
],
"source": [
"print(stopwords.words('english'))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"dataset = pd.read_csv(\"training.1600000.processed.noemoticon.csv\" , encoding= 'ISO-8859-1')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1467810369 | \n",
" Mon Apr 06 22:19:45 PDT 2009 | \n",
" NO_QUERY | \n",
" _TheSpecialOne_ | \n",
" @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 1467810672 | \n",
" Mon Apr 06 22:19:49 PDT 2009 | \n",
" NO_QUERY | \n",
" scotthamilton | \n",
" is upset that he can't update his Facebook by ... | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 1467810917 | \n",
" Mon Apr 06 22:19:53 PDT 2009 | \n",
" NO_QUERY | \n",
" mattycus | \n",
" @Kenichan I dived many times for the ball. Man... | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 1467811184 | \n",
" Mon Apr 06 22:19:57 PDT 2009 | \n",
" NO_QUERY | \n",
" ElleCTF | \n",
" my whole body feels itchy and like its on fire | \n",
"
\n",
" \n",
" | 3 | \n",
" 0 | \n",
" 1467811193 | \n",
" Mon Apr 06 22:19:57 PDT 2009 | \n",
" NO_QUERY | \n",
" Karoli | \n",
" @nationwideclass no, it's not behaving at all.... | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" 1467811372 | \n",
" Mon Apr 06 22:20:00 PDT 2009 | \n",
" NO_QUERY | \n",
" joy_wolf | \n",
" @Kwesidei not the whole crew | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ \\\n",
"0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n",
"1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n",
"2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n",
"3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n",
"4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf \n",
"\n",
" @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n",
"0 is upset that he can't update his Facebook by ... \n",
"1 @Kenichan I dived many times for the ball. Man... \n",
"2 my whole body feels itchy and like its on fire \n",
"3 @nationwideclass no, it's not behaving at all.... \n",
"4 @Kwesidei not the whole crew "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"col_names = ['target' , 'id' , 'date' , 'flag' , 'user' , 'text']\n",
"dataset.columns = col_names"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" target | \n",
" id | \n",
" date | \n",
" flag | \n",
" user | \n",
" text | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 1467810672 | \n",
" Mon Apr 06 22:19:49 PDT 2009 | \n",
" NO_QUERY | \n",
" scotthamilton | \n",
" is upset that he can't update his Facebook by ... | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 1467810917 | \n",
" Mon Apr 06 22:19:53 PDT 2009 | \n",
" NO_QUERY | \n",
" mattycus | \n",
" @Kenichan I dived many times for the ball. Man... | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 1467811184 | \n",
" Mon Apr 06 22:19:57 PDT 2009 | \n",
" NO_QUERY | \n",
" ElleCTF | \n",
" my whole body feels itchy and like its on fire | \n",
"
\n",
" \n",
" | 3 | \n",
" 0 | \n",
" 1467811193 | \n",
" Mon Apr 06 22:19:57 PDT 2009 | \n",
" NO_QUERY | \n",
" Karoli | \n",
" @nationwideclass no, it's not behaving at all.... | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" 1467811372 | \n",
" Mon Apr 06 22:20:00 PDT 2009 | \n",
" NO_QUERY | \n",
" joy_wolf | \n",
" @Kwesidei not the whole crew | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" target id date flag user \\\n",
"0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n",
"1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n",
"2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n",
"3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n",
"4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf \n",
"\n",
" text \n",
"0 is upset that he can't update his Facebook by ... \n",
"1 @Kenichan I dived many times for the ball. Man... \n",
"2 my whole body feels itchy and like its on fire \n",
"3 @nationwideclass no, it's not behaving at all.... \n",
"4 @Kwesidei not the whole crew "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1599999, 6)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"target 0\n",
"id 0\n",
"date 0\n",
"flag 0\n",
"user 0\n",
"text 0\n",
"dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#checking for missing values\n",
"dataset.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"target\n",
"4 800000\n",
"0 799999\n",
"Name: count, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distribution of tweets\n",
"dataset['target'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Converting 0 to -ve and 4 to +ve\n",
"dataset['target'] = dataset['target'].map({0:0 , 4:1})"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"target\n",
"1 800000\n",
"0 799999\n",
"Name: count, dtype: int64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset['target'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Stemming\n",
"\n",
"stremmer = PorterStemmer()\n",
"\n",
"def stemming(content):\n",
" stemmed_content = re.sub('[^a-zA-Z]',' ',content) # removing not a-z and A-Z\n",
" stemmed_content = stemmed_content.lower()\n",
" stemmed_content = stemmed_content.split()\n",
" stemmed_content = [stremmer.stem(word) for word in stemmed_content if not word in stopwords.words('english')]\n",
" stemmed_content = ' '.join(stemmed_content)\n",
" return stemmed_content"
]
},
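{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check of the preprocessing on one sample tweet from the dataset;\n",
"# the exact tokens depend on the NLTK stopword list and the Porter stemmer\n",
"print(stemming(\"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.\"))"
]
},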
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"dataset['text'] = dataset['text'].apply(stemming)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" target | \n",
" id | \n",
" date | \n",
" flag | \n",
" user | \n",
" text | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 1467810672 | \n",
" Mon Apr 06 22:19:49 PDT 2009 | \n",
" NO_QUERY | \n",
" scotthamilton | \n",
" upset updat facebook text might cri result sch... | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" 1467810917 | \n",
" Mon Apr 06 22:19:53 PDT 2009 | \n",
" NO_QUERY | \n",
" mattycus | \n",
" kenichan dive mani time ball manag save rest g... | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" 1467811184 | \n",
" Mon Apr 06 22:19:57 PDT 2009 | \n",
" NO_QUERY | \n",
" ElleCTF | \n",
" whole bodi feel itchi like fire | \n",
"
\n",
" \n",
" | 3 | \n",
" 0 | \n",
" 1467811193 | \n",
" Mon Apr 06 22:19:57 PDT 2009 | \n",
" NO_QUERY | \n",
" Karoli | \n",
" nationwideclass behav mad see | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" 1467811372 | \n",
" Mon Apr 06 22:20:00 PDT 2009 | \n",
" NO_QUERY | \n",
" joy_wolf | \n",
" kwesidei whole crew | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" target id date flag user \\\n",
"0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n",
"1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n",
"2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n",
"3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n",
"4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf \n",
"\n",
" text \n",
"0 upset updat facebook text might cri result sch... \n",
"1 kenichan dive mani time ball manag save rest g... \n",
"2 whole bodi feel itchi like fire \n",
"3 nationwideclass behav mad see \n",
"4 kwesidei whole crew "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"x = dataset['text']\n",
"y = dataset['target']"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# splitting the dataset\n",
"x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2 , random_state = 0)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# convert textual data to numerical data\n",
"vectorizer = TfidfVectorizer()\n",
"x_train = vectorizer.fit_transform(x_train)\n",
"x_test = vectorizer.transform(x_test)"
]
},
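{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: the TF-IDF matrices are sparse, with one column per term\n",
"# learned from the training split\n",
"print(x_train.shape, x_test.shape)\n",
"print(len(vectorizer.vocabulary_), 'terms in the vocabulary')"
]
},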
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" (0, 145591)\t0.48328892862950174\n",
" (0, 384310)\t0.38648598535906226\n",
" (0, 160355)\t0.18966194768681632\n",
" (0, 422796)\t0.4213995220282958\n",
" (0, 246262)\t0.516206150117446\n",
" (0, 393595)\t0.18633353695642413\n",
" (0, 149660)\t0.12602103676347354\n",
" (0, 150562)\t0.187752051036393\n",
" (0, 443991)\t0.22625223143666687\n",
" (1, 172128)\t0.6067414559564506\n",
" (1, 418051)\t0.7948992424350689\n",
" (2, 406965)\t0.6931768888241752\n",
" (2, 275790)\t0.3769717187165907\n",
" (2, 290673)\t0.24841016587340456\n",
" (2, 150650)\t0.20986098127991223\n",
" (2, 42279)\t0.5211994648067829\n",
" (3, 175231)\t0.30748407834013664\n",
" (3, 89478)\t0.5137960384023271\n",
" (3, 135304)\t0.18399221471225605\n",
" (3, 292469)\t0.3352332134067401\n",
" (3, 399931)\t0.21912347276618377\n",
" (3, 317428)\t0.5137960384023271\n",
" (3, 175234)\t0.4280552121498152\n",
" (4, 408579)\t0.14704998873675024\n",
" (4, 300289)\t0.2058593651486058\n",
" :\t:\n",
" (1279995, 101591)\t0.8081360486674279\n",
" (1279995, 248952)\t0.5889958631808858\n",
" (1279996, 277402)\t0.6930282733228941\n",
" (1279996, 133848)\t0.34541074396262944\n",
" (1279996, 435543)\t0.2695787059712405\n",
" (1279996, 230940)\t0.28709000004756496\n",
" (1279996, 384176)\t0.22284929416293517\n",
" (1279996, 168384)\t0.22632455016071848\n",
" (1279996, 445127)\t0.19037698208802128\n",
" (1279996, 170080)\t0.2583579928589749\n",
" (1279996, 408579)\t0.2035510397723402\n",
" (1279997, 22582)\t0.40592321055556474\n",
" (1279997, 407667)\t0.4517041173506153\n",
" (1279997, 365896)\t0.34128528334674657\n",
" (1279997, 78807)\t0.20434235294380243\n",
" (1279997, 318283)\t0.48408216042272795\n",
" (1279997, 278738)\t0.20662639845796468\n",
" (1279997, 31095)\t0.1879300266675478\n",
" (1279997, 267587)\t0.18767777014427442\n",
" (1279997, 334582)\t0.19548006690275818\n",
" (1279997, 243236)\t0.23915227399663266\n",
" (1279997, 241760)\t0.17315132700092342\n",
" (1279998, 360147)\t0.7967059461608392\n",
" (1279998, 393318)\t0.47775281405037406\n",
" (1279998, 150849)\t0.37015116374112683\n"
]
}
],
"source": [
"print(x_train)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\KIIT\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n"
]
},
{
"data": {
"text/html": [
"LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
],
"text/plain": [
"LogisticRegression()"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Training the model\n",
"model = LogisticRegression()\n",
"model.fit(x_train , y_train)"
]
},
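{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The fit above raised a ConvergenceWarning: lbfgs stopped at its default iteration\n",
"# limit before fully converging. As the warning suggests, allowing more iterations is\n",
"# one remedy; uncomment to re-fit (this replaces the model evaluated below and takes longer).\n",
"# model = LogisticRegression(max_iter=1000)\n",
"# model.fit(x_train, y_train)"
]
},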
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.775615625\n"
]
}
],
"source": [
"# Testing the model\n",
"y_pred = model.predict(x_test)\n",
"print(accuracy_score(y_test , y_pred))"
]
},
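{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Accuracy on the training split as well, as a rough check for over/underfitting\n",
"print(accuracy_score(y_train, model.predict(x_train)))"
]
},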
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Function to predict the sentiment\n",
"def predict_sentiment(text):\n",
" text = re.sub('[^a-zA-Z]',' ',text) # removing not a-z and A-Z\n",
" text = text.lower()\n",
" text = text.split() \n",
" text = [stremmer.stem(word) for word in text if not word in stopwords.words('english')]\n",
" text = ' '.join(text)\n",
" text = [text]\n",
" text = vectorizer.transform(text) \n",
" sentiment = model.predict(text)\n",
" if sentiment == 0:\n",
" return \"Negative\"\n",
" else:\n",
" return \"Positive\""
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Negative\n",
"Positive\n"
]
}
],
"source": [
"# Testing the model\n",
"print(predict_sentiment(\"I hate you\"))\n",
"print(predict_sentiment(\"I love you\"))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# Save the model\n",
"import pickle\n",
"pickle.dump(model , open('model.pkl' , 'wb'))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"pickle.dump(vectorizer , open('vectorizer.pkl' , 'wb'))"
]
}
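,
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch of loading the saved artifacts back (e.g. in a serving script); the file\n",
"# names match the dumps above, and the reloaded pair should give the same\n",
"# predictions as the in-memory model\n",
"with open('model.pkl', 'rb') as f:\n",
"    loaded_model = pickle.load(f)\n",
"with open('vectorizer.pkl', 'rb') as f:\n",
"    loaded_vectorizer = pickle.load(f)\n",
"print(loaded_model.predict(loaded_vectorizer.transform([stemming('I love you')])))"
]
}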
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}