{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import re\n", "from nltk.corpus import stopwords\n", "from nltk.stem.porter import PorterStemmer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\KIIT\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "nltk.download('stopwords')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n" ] } ], "source": [ "print(stopwords.words('english'))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "dataset = pd.read_csv(\"training.1600000.processed.noemoticon.csv\" , encoding= 'ISO-8859-1')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D
001467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
101467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
201467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
301467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
401467811372Mon Apr 06 22:20:00 PDT 2009NO_QUERYjoy_wolf@Kwesidei not the whole crew
\n", "
" ], "text/plain": [ " 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ \\\n", "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n", "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n", "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n", "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n", "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf \n", "\n", " @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n", "0 is upset that he can't update his Facebook by ... \n", "1 @Kenichan I dived many times for the ball. Man... \n", "2 my whole body feels itchy and like its on fire \n", "3 @nationwideclass no, it's not behaving at all.... \n", "4 @Kwesidei not the whole crew " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "col_names = ['target' , 'id' , 'date' , 'flag' , 'user' , 'text']\n", "dataset.columns = col_names" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
targetiddateflagusertext
001467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
101467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
201467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
301467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
401467811372Mon Apr 06 22:20:00 PDT 2009NO_QUERYjoy_wolf@Kwesidei not the whole crew
\n", "
" ], "text/plain": [ " target id date flag user \\\n", "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n", "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n", "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n", "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n", "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf \n", "\n", " text \n", "0 is upset that he can't update his Facebook by ... \n", "1 @Kenichan I dived many times for the ball. Man... \n", "2 my whole body feels itchy and like its on fire \n", "3 @nationwideclass no, it's not behaving at all.... \n", "4 @Kwesidei not the whole crew " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1599999, 6)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.shape" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "target 0\n", "id 0\n", "date 0\n", "flag 0\n", "user 0\n", "text 0\n", "dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#checking for missing values\n", "dataset.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "target\n", "4 800000\n", "0 799999\n", "Name: count, dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Distribution of tweets\n", "dataset['target'].value_counts()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Converting 0 to -ve and 4 to +ve\n", "dataset['target'] = dataset['target'].map({0:0 , 4:1})" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "target\n", "1 800000\n", "0 799999\n", "Name: count, dtype: int64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset['target'].value_counts()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Stemming\n", "\n", "stremmer = PorterStemmer()\n", "\n", "def stemming(content):\n", " stemmed_content = re.sub('[^a-zA-Z]',' ',content) # removing not a-z and A-Z\n", " stemmed_content = stemmed_content.lower()\n", " stemmed_content = stemmed_content.split()\n", " stemmed_content = [stremmer.stem(word) for word in stemmed_content if not word in stopwords.words('english')]\n", " stemmed_content = ' '.join(stemmed_content)\n", " return stemmed_content" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "dataset['text'] = dataset['text'].apply(stemming)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
targetiddateflagusertext
001467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonupset updat facebook text might cri result sch...
101467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycuskenichan dive mani time ball manag save rest g...
201467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFwhole bodi feel itchi like fire
301467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKarolinationwideclass behav mad see
401467811372Mon Apr 06 22:20:00 PDT 2009NO_QUERYjoy_wolfkwesidei whole crew
\n", "
" ], "text/plain": [ " target id date flag user \\\n", "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n", "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n", "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n", "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n", "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf \n", "\n", " text \n", "0 upset updat facebook text might cri result sch... \n", "1 kenichan dive mani time ball manag save rest g... \n", "2 whole bodi feel itchi like fire \n", "3 nationwideclass behav mad see \n", "4 kwesidei whole crew " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "x = dataset['text']\n", "y = dataset['target']" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# splitting the dataset\n", "x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2 , random_state = 0)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# convert textual data to numerical data\n", "vectorizer = TfidfVectorizer()\n", "x_train = vectorizer.fit_transform(x_train)\n", "x_test = vectorizer.transform(x_test)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " (0, 145591)\t0.48328892862950174\n", " (0, 384310)\t0.38648598535906226\n", " (0, 160355)\t0.18966194768681632\n", " (0, 422796)\t0.4213995220282958\n", " (0, 246262)\t0.516206150117446\n", " (0, 393595)\t0.18633353695642413\n", " (0, 149660)\t0.12602103676347354\n", " (0, 150562)\t0.187752051036393\n", " (0, 443991)\t0.22625223143666687\n", " (1, 172128)\t0.6067414559564506\n", " (1, 418051)\t0.7948992424350689\n", " (2, 406965)\t0.6931768888241752\n", " (2, 275790)\t0.3769717187165907\n", " (2, 290673)\t0.24841016587340456\n", " (2, 150650)\t0.20986098127991223\n", " (2, 42279)\t0.5211994648067829\n", " (3, 175231)\t0.30748407834013664\n", " (3, 89478)\t0.5137960384023271\n", " (3, 135304)\t0.18399221471225605\n", " (3, 292469)\t0.3352332134067401\n", " (3, 399931)\t0.21912347276618377\n", " (3, 317428)\t0.5137960384023271\n", " (3, 175234)\t0.4280552121498152\n", " (4, 408579)\t0.14704998873675024\n", " (4, 300289)\t0.2058593651486058\n", " :\t:\n", " (1279995, 101591)\t0.8081360486674279\n", " (1279995, 248952)\t0.5889958631808858\n", " (1279996, 277402)\t0.6930282733228941\n", " (1279996, 133848)\t0.34541074396262944\n", " (1279996, 435543)\t0.2695787059712405\n", " (1279996, 230940)\t0.28709000004756496\n", " (1279996, 384176)\t0.22284929416293517\n", " (1279996, 168384)\t0.22632455016071848\n", " (1279996, 445127)\t0.19037698208802128\n", " (1279996, 170080)\t0.2583579928589749\n", " (1279996, 408579)\t0.2035510397723402\n", " (1279997, 22582)\t0.40592321055556474\n", " (1279997, 407667)\t0.4517041173506153\n", " (1279997, 365896)\t0.34128528334674657\n", " (1279997, 78807)\t0.20434235294380243\n", " (1279997, 318283)\t0.48408216042272795\n", " (1279997, 278738)\t0.20662639845796468\n", " (1279997, 31095)\t0.1879300266675478\n", " (1279997, 267587)\t0.18767777014427442\n", " (1279997, 334582)\t0.19548006690275818\n", " (1279997, 243236)\t0.23915227399663266\n", " (1279997, 241760)\t0.17315132700092342\n", " (1279998, 360147)\t0.7967059461608392\n", " (1279998, 393318)\t0.47775281405037406\n", " (1279998, 150849)\t0.37015116374112683\n" ] } ], "source": [ "print(x_train)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\KIIT\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n" ] }, { "data": { "text/html": [ "
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LogisticRegression()" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Training the model\n", "model = LogisticRegression()\n", "model.fit(x_train , y_train)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.775615625\n" ] } ], "source": [ "# Testing the model\n", "y_pred = model.predict(x_test)\n", "print(accuracy_score(y_test , y_pred))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# Function to predict the sentiment\n", "def predict_sentiment(text):\n", " text = re.sub('[^a-zA-Z]',' ',text) # removing not a-z and A-Z\n", " text = text.lower()\n", " text = text.split() \n", " text = [stremmer.stem(word) for word in text if not word in stopwords.words('english')]\n", " text = ' '.join(text)\n", " text = [text]\n", " text = vectorizer.transform(text) \n", " sentiment = model.predict(text)\n", " if sentiment == 0:\n", " return \"Negative\"\n", " else:\n", " return \"Positive\"" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Negative\n", "Positive\n" ] } ], "source": [ "# Testing the model\n", "print(predict_sentiment(\"I hate you\"))\n", "print(predict_sentiment(\"I love you\"))" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# Save the model\n", "import pickle\n", "pickle.dump(model , open('model.pkl' , 'wb'))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "pickle.dump(vectorizer , open('vectorizer.pkl' , 'wb'))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }