{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": { "provenance": [] },
    "kernelspec": { "name": "python3", "display_name": "Python 3" },
    "language_info": { "name": "python" }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "6wKWMp33VnT5" },
      "outputs": [],
      "source": [
        "# Print a starting message for the project\n",
        "print(\"Hello World! Sentiment Analysis project\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "kghqpA1oXWSp" },
      "outputs": [],
      "source": [
        "# Import the kagglehub library to download datasets\n",
        "import kagglehub"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "zgGiu4i5XXfj" },
      "outputs": [],
      "source": [
        "# Download the sentiment140 dataset using kagglehub\n",
        "path = kagglehub.dataset_download(\"kazanova/sentiment140\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "ZZOudY7EXc8H" },
      "outputs": [],
      "source": [
        "# Print the local path where the dataset files are downloaded\n",
        "print(\"Path to dataset files:\", path)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "mH0Lm7zzXmVU" },
      "outputs": [],
      "source": [
        "# Import pandas library for data manipulation\n",
        "import pandas as pd\n",
        "\n",
        "# The CSV ships WITHOUT a header row, so the column names must be passed\n",
        "# to read_csv via `names` together with header=None.  Reading with the\n",
        "# default header and assigning df.columns afterwards (the previous\n",
        "# approach) silently consumed the first data row as the header: the\n",
        "# earlier run showed 1,599,999 rows and a negative-class count of\n",
        "# 799,999 instead of 1,600,000 / 800,000.\n",
        "# encoding='latin-1' handles non-UTF-8 characters in the tweets.\n",
        "df = pd.read_csv(\n",
        "    path + \"/training.1600000.processed.noemoticon.csv\",\n",
        "    encoding='latin-1',\n",
        "    header=None,\n",
        "    names=['target', 'ids', 'date', 'flag', 'user', 'text'],\n",
        ")\n",
        "\n",
        "# Select only the 'text' and 'target' columns which are relevant for sentiment analysis\n",
        "df = df[['text', 'target']]\n",
        "\n",
        "# Display the first few rows of the DataFrame to inspect the data\n",
        "df.head()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": { "id": "XY7xY_GsZnBm" },
      "source": [
        "**0 = negative comment, 1 = positive comment** (the raw dataset labels positives as 4; they are remapped to 1 in the next cell)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "RiVnBr4aY4zP" },
      "outputs": [],
      "source": [
        "# Map the target variable from 4 to 1 to represent positive sentiment\n",
        "# Original dataset uses 0 for negative and 4 for positive\n",
        "df['target'] = df['target'].map(lambda x: 1 if x == 4 else x)\n",
        "\n",
        "# Count the occurrences of each value in the 'target' column\n",
        "# This shows the distribution of sentiment labels (0 and 1)\n",
        "df['target'].value_counts()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "epVnr8mJaugo" },
      "outputs": [],
      "source": [
        "# Import the regular expression module for text cleaning\n",
        "import re\n",
        "\n",
        "# Import train_test_split to split data into training and testing sets\n",
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "# Import TfidfVectorizer to convert text data into numerical features\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "kM50kacBcz3f" },
      "outputs": [],
      "source": [
        "# Define a function to clean the text data\n",
        "# Converts text to lowercase and removes punctuation\n",
        "def clean_text(text):\n",
        "    text = text.lower()  # Convert text to lowercase\n",
        "    text = re.sub(r'[^\\w\\s]', '', text)  # Remove punctuation using regex\n",
        "    return text\n",
        "\n",
        "# Apply the clean_text function to the 'text' column of the DataFrame\n",
        "df['text'] = df['text'].apply(clean_text)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "v0_0F72sdl8l" },
      "outputs": [],
      "source": [
        "# Display the DataFrame after applying the text cleaning function\n",
        "# This shows the cleaned text data along with the target variable\n",
        "df"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "l-u03h-Md2Fn" },
      "outputs": [],
      "source": [
        "# Initialize TfidfVectorizer to convert text into TF-IDF features\n",
        "vectorizer = TfidfVectorizer()\n",
        "\n",
        "# Fit the vectorizer to the text data and transform the text into a sparse matrix of TF-IDF features\n",
        "# The result is stored in variable X, which represents the text data numerically\n",
        "X = vectorizer.fit_transform(df['text'])\n",
        "\n",
        "# Extract the 'target' column as the labels for the sentiment analysis\n",
        "# The labels are stored in variable y\n",
        "y = df['target']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "GqiuW6kIe3Pa" },
      "outputs": [],
      "source": [
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.metrics import classification_report\n",
        "\n",
        "# random_state pins the split so the reported metrics are reproducible\n",
        "# across kernel restarts (the split was previously unseeded).\n",
        "X_train, X_test, y_train, y_test = train_test_split(\n",
        "    X, y, test_size=0.2, random_state=42\n",
        ")\n",
        "model = LogisticRegression(max_iter=1000)\n",
        "model.fit(X_train, y_train)\n",
        "\n",
        "y_pred = model.predict(X_test)\n",
        "print(classification_report(y_test, y_pred))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "WOp4thG7g6II" },
      "outputs": [],
      "source": [
        "def predict_sentiment(text):\n",
        "    \"\"\"Clean and vectorize `text`, then return the model's verdict.\"\"\"\n",
        "    text = clean_text(text)\n",
        "    vec = vectorizer.transform([text])\n",
        "    prediction = model.predict(vec)\n",
        "    proba = model.predict_proba(vec)\n",
        "    # model.classes_ is [0, 1], so column 0 is negative and column 1 positive\n",
        "    print(\"Probability of Negative:\", proba[0][0])\n",
        "    print(\"Probability of Positive:\", proba[0][1])\n",
        "    return \"Positive comment\" if prediction[0] == 1 else \"Negative comment\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "BAEDCJnVvIaV" },
      "outputs": [],
      "source": [
        "predict_sentiment(\"I did not like this product\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "_2kYwao9h6vb" },
      "outputs": [],
      "source": [
        "# joblib serializes the fitted model and vectorizer for later reuse\n",
        "import joblib"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": { "id": "SX_fqF5KiIyz" },
      "outputs": [],
      "source": [
        "joblib.dump(model, 'sentiment_model.pkl')\n",
        "joblib.dump(vectorizer, 'vectorizer.pkl')"
      ]
    }
  ]
}