# %% Imports
import os
import re
import string
import warnings

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Blanket suppression kept from the original notebook. Be aware that it also
# hides deprecation warnings raised by the libraries used further down.
warnings.filterwarnings('ignore')

# %% NLTK resources
# Fetch every NLTK resource the notebook downloads in one place so a fresh
# kernel has them before any text processing starts. `punkt_tab` used to be
# downloaded in a separate, later cell.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource, quiet=True)

# %% Load the IMDB reviews
# The default is the Colab upload location used originally; set the IMDB_CSV
# environment variable to point at the file when running somewhere else.
DATA_PATH = os.environ.get("IMDB_CSV", "/content/IMDB Dataset.csv")

# Malformed rows are still dropped (as with on_bad_lines="skip"), but they are
# collected here so the loss is visible instead of silent.
skipped_rows = []


def _record_bad_line(fields):
    """Remember one malformed CSV row and tell pandas to drop it.

    Parameters
    ----------
    fields : list[str]
        The already-split fields of the offending line, as supplied by pandas.

    Returns
    -------
    None
        Returning None instructs the python parser engine to skip the line.
    """
    skipped_rows.append(fields)
    return None


df = pd.read_csv(DATA_PATH, engine="python", on_bad_lines=_record_bad_line)

# Every later cell indexes these two columns; fail here with a clear message
# rather than with a KeyError far from the cause.
missing_columns = {"review", "sentiment"} - set(df.columns)
if missing_columns:
    raise ValueError(f"{DATA_PATH} is missing expected columns: {sorted(missing_columns)}")

print(df.shape)
print(f"Malformed rows skipped while parsing: {len(skipped_rows)}")
df.head()

# %% Tokenizer resource
# Tokenizer tables are fetched here, next to the cell that calls
# nltk.word_tokenize.
nltk.download("punkt_tab")

# %% Text cleaning helpers
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Compiled once: preprocess_text is applied to every row of the dataset.
_HTML_TAG = re.compile(r'<.*?>')
_NON_LETTER = re.compile(r'[^a-z\s]')


def preprocess_text(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order: lowercase, strip HTML tags, drop every character that is
    not a-z or whitespace, tokenize, remove English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example a NaN coming from
        a missing CSV field) yields an empty string instead of raising.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; may be empty.
    """
    if not isinstance(text, str):
        return ""

    # 1. Lowercase
    text = text.lower()

    # 2. Remove HTML tags. Substitute a space, not the empty string, so that
    #    "production.<br /><br />the" does not collapse into "productionthe".
    text = _HTML_TAG.sub(' ', text)

    # 3. Remove punctuation & numbers
    # NOTE(review): apostrophes are deleted here, so contractions such as
    # "wasn't" reach the stop-word filter as "wasnt" and survive it.
    # Left unchanged on purpose; confirm whether that is desired.
    text = _NON_LETTER.sub('', text)

    # 4. Tokenize
    tokens = nltk.word_tokenize(text)

    # 5. Remove stopwords and 6. lemmatize, in a single pass over the tokens
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return " ".join(lemmas)


# %% Clean every review
df["cleaned_review"] = df["review"].apply(preprocess_text)

# Rich display of the last rows instead of print() on a DataFrame.
df[["review", "cleaned_review", "sentiment"]].tail()
negative \n" ] } ] }, { "cell_type": "code", "source": [ "!pip install transformers torch sentence-transformers\n" ], "metadata": { "id": "DUf6-RvRP1uL", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "c4b4362f-8aad-43d8-ac0e-56f7d20c4140" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.12/dist-packages (4.56.2)\n", "Requirement already satisfied: torch in /usr/local/lib/python3.12/dist-packages (2.8.0+cu126)\n", "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.12/dist-packages (5.1.1)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from transformers) (3.19.1)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.34.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.35.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.12/dist-packages (from transformers) (1.26.0)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (25.0)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from transformers) (6.0.2)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.12/dist-packages (from transformers) (2024.11.6)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from transformers) (2.32.4)\n", "Requirement already satisfied: tokenizers<=0.23.0,>=0.22.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.22.0)\n", "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.6.2)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.12/dist-packages (from transformers) (4.67.1)\n", "Requirement already satisfied: typing-extensions>=4.10.0 in 
/usr/local/lib/python3.12/dist-packages (from torch) (4.15.0)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from torch) (75.2.0)\n", "Requirement already satisfied: sympy>=1.13.3 in /usr/local/lib/python3.12/dist-packages (from torch) (1.13.3)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.12/dist-packages (from torch) (3.5)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from torch) (3.1.6)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.12/dist-packages (from torch) (2025.3.0)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.77)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.77)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.6.80 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.80)\n", "Requirement already satisfied: nvidia-cudnn-cu12==9.10.2.21 in /usr/local/lib/python3.12/dist-packages (from torch) (9.10.2.21)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.6.4.1 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.4.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.3.0.4 in /usr/local/lib/python3.12/dist-packages (from torch) (11.3.0.4)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.7.77 in /usr/local/lib/python3.12/dist-packages (from torch) (10.3.7.77)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.7.1.2 in /usr/local/lib/python3.12/dist-packages (from torch) (11.7.1.2)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.5.4.2 in /usr/local/lib/python3.12/dist-packages (from torch) (12.5.4.2)\n", "Requirement already satisfied: nvidia-cusparselt-cu12==0.7.1 in /usr/local/lib/python3.12/dist-packages (from torch) (0.7.1)\n", "Requirement already satisfied: 
nvidia-nccl-cu12==2.27.3 in /usr/local/lib/python3.12/dist-packages (from torch) (2.27.3)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.77)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12==12.6.85 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.85)\n", "Requirement already satisfied: nvidia-cufile-cu12==1.11.1.6 in /usr/local/lib/python3.12/dist-packages (from torch) (1.11.1.6)\n", "Requirement already satisfied: triton==3.4.0 in /usr/local/lib/python3.12/dist-packages (from torch) (3.4.0)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (from sentence-transformers) (1.6.1)\n", "Requirement already satisfied: scipy in /usr/local/lib/python3.12/dist-packages (from sentence-transformers) (1.16.2)\n", "Requirement already satisfied: Pillow in /usr/local/lib/python3.12/dist-packages (from sentence-transformers) (11.3.0)\n", "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (1.1.10)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from sympy>=1.13.3->torch) (1.3.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->torch) (3.0.2)\n", "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (3.4.3)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (2.5.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->transformers) (2025.8.3)\n", "Requirement already satisfied: joblib>=1.2.0 in 
/usr/local/lib/python3.12/dist-packages (from scikit-learn->sentence-transformers) (1.5.2)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn->sentence-transformers) (3.6.0)\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "so6p265dWAfb" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "j7qu7HlyWAkb" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!pip install -U torch torchvision torchaudio\n", "!pip install -U transformers sentence-transformers\n" ], "metadata": { "id": "kQkAlTaKP1w0", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "77568f04-e205-4524-c838-c46042a317a2" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: torch in /usr/local/lib/python3.12/dist-packages (2.8.0+cu126)\n", "Requirement already satisfied: torchvision in /usr/local/lib/python3.12/dist-packages (0.23.0+cu126)\n", "Requirement already satisfied: torchaudio in /usr/local/lib/python3.12/dist-packages (2.8.0+cu126)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from torch) (3.19.1)\n", "Requirement already satisfied: typing-extensions>=4.10.0 in /usr/local/lib/python3.12/dist-packages (from torch) (4.15.0)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from torch) (75.2.0)\n", "Requirement already satisfied: sympy>=1.13.3 in /usr/local/lib/python3.12/dist-packages (from torch) (1.13.3)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.12/dist-packages (from torch) (3.5)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from torch) (3.1.6)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.12/dist-packages (from torch) (2025.3.0)\n", "Requirement already satisfied: 
nvidia-cuda-nvrtc-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.77)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.77)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.6.80 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.80)\n", "Requirement already satisfied: nvidia-cudnn-cu12==9.10.2.21 in /usr/local/lib/python3.12/dist-packages (from torch) (9.10.2.21)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.6.4.1 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.4.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.3.0.4 in /usr/local/lib/python3.12/dist-packages (from torch) (11.3.0.4)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.7.77 in /usr/local/lib/python3.12/dist-packages (from torch) (10.3.7.77)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.7.1.2 in /usr/local/lib/python3.12/dist-packages (from torch) (11.7.1.2)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.5.4.2 in /usr/local/lib/python3.12/dist-packages (from torch) (12.5.4.2)\n", "Requirement already satisfied: nvidia-cusparselt-cu12==0.7.1 in /usr/local/lib/python3.12/dist-packages (from torch) (0.7.1)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.27.3 in /usr/local/lib/python3.12/dist-packages (from torch) (2.27.3)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.77)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12==12.6.85 in /usr/local/lib/python3.12/dist-packages (from torch) (12.6.85)\n", "Requirement already satisfied: nvidia-cufile-cu12==1.11.1.6 in /usr/local/lib/python3.12/dist-packages (from torch) (1.11.1.6)\n", "Requirement already satisfied: triton==3.4.0 in /usr/local/lib/python3.12/dist-packages (from torch) (3.4.0)\n", "Requirement already satisfied: numpy in 
/usr/local/lib/python3.12/dist-packages (from torchvision) (1.26.0)\n", "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.12/dist-packages (from torchvision) (11.3.0)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from sympy>=1.13.3->torch) (1.3.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->torch) (3.0.2)\n", "Requirement already satisfied: transformers in /usr/local/lib/python3.12/dist-packages (4.56.2)\n", "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.12/dist-packages (5.1.1)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from transformers) (3.19.1)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.34.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.35.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.12/dist-packages (from transformers) (1.26.0)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (25.0)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/dist-packages (from transformers) (6.0.2)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.12/dist-packages (from transformers) (2024.11.6)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from transformers) (2.32.4)\n", "Requirement already satisfied: tokenizers<=0.23.0,>=0.22.0 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.22.0)\n", "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.12/dist-packages (from transformers) (0.6.2)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.12/dist-packages (from transformers) (4.67.1)\n", "Requirement already satisfied: torch>=1.11.0 in /usr/local/lib/python3.12/dist-packages 
(from sentence-transformers) (2.8.0+cu126)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (from sentence-transformers) (1.6.1)\n", "Requirement already satisfied: scipy in /usr/local/lib/python3.12/dist-packages (from sentence-transformers) (1.16.2)\n", "Requirement already satisfied: Pillow in /usr/local/lib/python3.12/dist-packages (from sentence-transformers) (11.3.0)\n", "Requirement already satisfied: typing_extensions>=4.5.0 in /usr/local/lib/python3.12/dist-packages (from sentence-transformers) (4.15.0)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (2025.3.0)\n", "Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.12/dist-packages (from huggingface-hub<1.0,>=0.34.0->transformers) (1.1.10)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (75.2.0)\n", "Requirement already satisfied: sympy>=1.13.3 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (1.13.3)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (3.5)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (3.1.6)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (12.6.77)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (12.6.77)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.6.80 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (12.6.80)\n", "Requirement already satisfied: 
nvidia-cudnn-cu12==9.10.2.21 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (9.10.2.21)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.6.4.1 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (12.6.4.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.3.0.4 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (11.3.0.4)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.7.77 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (10.3.7.77)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.7.1.2 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (11.7.1.2)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.5.4.2 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (12.5.4.2)\n", "Requirement already satisfied: nvidia-cusparselt-cu12==0.7.1 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (0.7.1)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.27.3 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (2.27.3)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.6.77 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (12.6.77)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12==12.6.85 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (12.6.85)\n", "Requirement already satisfied: nvidia-cufile-cu12==1.11.1.6 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (1.11.1.6)\n", "Requirement already satisfied: triton==3.4.0 in /usr/local/lib/python3.12/dist-packages (from torch>=1.11.0->sentence-transformers) (3.4.0)\n", "Requirement already satisfied: charset_normalizer<4,>=2 in 
# %% Imports for the TF-IDF + logistic-regression baseline
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# %% Labels and train/test partition
y = df["sentiment"].map({"positive": 1, "negative": 0}).values

# Split row positions first so that the vectorizer below can be fitted on the
# training rows only. The seed and test fraction are the ones the notebook
# already used for its split.
idx_train, idx_test = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# %% TF-IDF features
tfidf = TfidfVectorizer(max_features=5000)

# Learn vocabulary and IDF weights from training reviews only. Fitting on the
# full corpus (as before) lets test-set statistics leak into the features.
tfidf.fit(df["cleaned_review"].iloc[idx_train])

# X still covers every review, so code that expects the full matrix keeps
# working; it is simply encoded with the train-fitted vocabulary.
X = tfidf.transform(df["cleaned_review"])

X_train, X_test = X[idx_train], X[idx_test]
y_train, y_test = y[idx_train], y[idx_test]

# Show the dimensions rather than dumping the sparse matrix contents.
print("TF-IDF matrix shape:", X.shape)

# %% [markdown]
# ## Model 1: TF-IDF + logistic regression

# %% Fit the classifier
clf = LogisticRegression(max_iter=200)

clf.fit(X_train, y_train)
LogisticRegression(max_iter=200)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "y_pred = clf.predict(X_test)\n" ], "metadata": { "id": "zutBgt6SP17z" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "\n", "print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n", "\n", "\n", "print(classification_report(y_test, y_pred))\n", "\n", "\n", "print(\"Confusion Matrix:\\n\", confusion_matrix(y_test, y_pred))\n" ], "metadata": { "id": "lOM-9luZP1-0", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "c971b06d-e98b-494a-fcb2-8e11b31aa072" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Accuracy: 0.8845\n", " precision recall f1-score support\n", "\n", " 0 0.89 0.87 0.88 4961\n", " 1 0.88 0.90 0.89 5039\n", "\n", " accuracy 0.88 10000\n", " macro avg 0.88 0.88 0.88 10000\n", "weighted avg 0.88 0.88 0.88 10000\n", "\n", "Confusion Matrix:\n", " [[4318 643]\n", " [ 512 4527]]\n" ] } ] }, { "cell_type": "code", "source": [ "# Uninstall conflicting packages\n", "!pip uninstall -y shap scipy\n", "\n", "# Install compatible versions\n", "!pip install scipy==1.10.1\n", "!pip install shap==0.42.1" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 714 }, "id": "9FmYRCyOtS2L", "outputId": "5742d3ba-5692-4b0c-9bb8-6b83b49f77be" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Found existing installation: shap 0.42.1\n", "Uninstalling shap-0.42.1:\n", " Successfully uninstalled shap-0.42.1\n", "Found existing installation: scipy 1.16.2\n", "Uninstalling scipy-1.16.2:\n", " Successfully uninstalled scipy-1.16.2\n", "\u001b[31mERROR: Ignored the following yanked versions: 1.11.0, 1.14.0rc1\u001b[0m\u001b[31m\n", "\u001b[0m\u001b[31mERROR: Ignored the following versions that require a different python version: 1.10.0 Requires-Python <3.12,>=3.8; 1.10.0rc1 Requires-Python <3.12,>=3.8; 1.10.0rc2 Requires-Python 
<3.12,>=3.8; 1.10.1 Requires-Python <3.12,>=3.8; 1.6.2 Requires-Python >=3.7,<3.10; 1.6.3 Requires-Python >=3.7,<3.10; 1.7.0 Requires-Python >=3.7,<3.10; 1.7.1 Requires-Python >=3.7,<3.10; 1.7.2 Requires-Python >=3.7,<3.11; 1.7.3 Requires-Python >=3.7,<3.11; 1.8.0 Requires-Python >=3.8,<3.11; 1.8.0rc1 Requires-Python >=3.8,<3.11; 1.8.0rc2 Requires-Python >=3.8,<3.11; 1.8.0rc3 Requires-Python >=3.8,<3.11; 1.8.0rc4 Requires-Python >=3.8,<3.11; 1.8.1 Requires-Python >=3.8,<3.11; 1.9.0 Requires-Python >=3.8,<3.12; 1.9.0rc1 Requires-Python >=3.8,<3.12; 1.9.0rc2 Requires-Python >=3.8,<3.12; 1.9.0rc3 Requires-Python >=3.8,<3.12; 1.9.1 Requires-Python >=3.8,<3.12\u001b[0m\u001b[31m\n", "\u001b[0m\u001b[31mERROR: Could not find a version that satisfies the requirement scipy==1.10.1 (from versions: 0.8.0, 0.9.0, 0.10.0, 0.10.1, 0.11.0, 0.12.0, 0.12.1, 0.13.0, 0.13.1, 0.13.2, 0.13.3, 0.14.0, 0.14.1, 0.15.0, 0.15.1, 0.16.0, 0.16.1, 0.17.0, 0.17.1, 0.18.0, 0.18.1, 0.19.0, 0.19.1, 1.0.0, 1.0.1, 1.1.0, 1.2.0, 1.2.1, 1.2.2, 1.2.3, 1.3.0, 1.3.1, 1.3.2, 1.3.3, 1.4.0, 1.4.1, 1.5.0, 1.5.1, 1.5.2, 1.5.3, 1.5.4, 1.6.0, 1.6.1, 1.9.2, 1.9.3, 1.11.0rc1, 1.11.0rc2, 1.11.1, 1.11.2, 1.11.3, 1.11.4, 1.12.0rc1, 1.12.0rc2, 1.12.0, 1.13.0rc1, 1.13.0, 1.13.1, 1.14.0rc2, 1.14.0, 1.14.1, 1.15.0rc1, 1.15.0rc2, 1.15.0, 1.15.1, 1.15.2, 1.15.3, 1.16.0rc1, 1.16.0rc2, 1.16.0, 1.16.1, 1.16.2)\u001b[0m\u001b[31m\n", "\u001b[0m\u001b[31mERROR: No matching distribution found for scipy==1.10.1\u001b[0m\u001b[31m\n", "\u001b[0mCollecting shap==0.42.1\n", " Using cached shap-0.42.1-cp312-cp312-linux_x86_64.whl\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (1.26.0)\n", "Collecting scipy (from shap==0.42.1)\n", " Using cached scipy-1.16.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (62 kB)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (1.6.1)\n", "Requirement already 
satisfied: pandas in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (2.2.2)\n", "Requirement already satisfied: tqdm>=4.27.0 in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (4.67.1)\n", "Requirement already satisfied: packaging>20.9 in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (25.0)\n", "Requirement already satisfied: slicer==0.0.7 in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (0.0.7)\n", "Requirement already satisfied: numba in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (0.60.0)\n", "Requirement already satisfied: cloudpickle in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (3.1.1)\n", "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.12/dist-packages (from numba->shap==0.42.1) (0.43.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas->shap==0.42.1) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas->shap==0.42.1) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas->shap==0.42.1) (2025.2)\n", "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn->shap==0.42.1) (1.5.2)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn->shap==0.42.1) (3.6.0)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas->shap==0.42.1) (1.17.0)\n", "Using cached scipy-1.16.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.7 MB)\n", "Installing collected packages: scipy, shap\n", "Successfully installed scipy-1.16.2 shap-0.42.1\n" ] }, { "output_type": "display_data", "data": { "application/vnd.colab-display-data+json": { "pip_warning": { "packages": [ "scipy", 
"shap" ] }, "id": "471ee32ea4224e2c8848bbf1936661d7" } }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "!pip install numpy==1.26.0 shap==0.42.1\n", "\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "sgslFrBzuj2j", "outputId": "c013ef49-d02a-46ed-e28f-a669fe4ad0db" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: numpy==1.26.0 in /usr/local/lib/python3.12/dist-packages (1.26.0)\n", "Requirement already satisfied: shap==0.42.1 in /usr/local/lib/python3.12/dist-packages (0.42.1)\n", "Requirement already satisfied: scipy in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (1.16.2)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (1.6.1)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (2.2.2)\n", "Requirement already satisfied: tqdm>=4.27.0 in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (4.67.1)\n", "Requirement already satisfied: packaging>20.9 in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (25.0)\n", "Requirement already satisfied: slicer==0.0.7 in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (0.0.7)\n", "Requirement already satisfied: numba in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (0.60.0)\n", "Requirement already satisfied: cloudpickle in /usr/local/lib/python3.12/dist-packages (from shap==0.42.1) (3.1.1)\n", "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.12/dist-packages (from numba->shap==0.42.1) (0.43.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas->shap==0.42.1) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas->shap==0.42.1) (2025.2)\n", "Requirement already satisfied: 
tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas->shap==0.42.1) (2025.2)\n", "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn->shap==0.42.1) (1.5.2)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn->shap==0.42.1) (3.6.0)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas->shap==0.42.1) (1.17.0)\n" ] } ] },
{ "cell_type": "code", "source": [
"import numpy as np\n",
"\n",
"# NumPy 2.0 removed np.obj2sctype. Install the fallback only when the attribute\n",
"# is really missing, so a NumPy 1.x runtime keeps its own implementation.\n",
"# NOTE(review): presumably needed because this shap release still references\n",
"# np.obj2sctype at import time - confirm before dropping the shim.\n",
"if not hasattr(np, 'obj2sctype'):\n",
"    np.obj2sctype = lambda obj: np.dtype(obj).type\n",
"\n",
"import shap\n"
], "metadata": { "id": "ju0ZjEtXrzej" }, "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "source": [ "## Model 2: Bidirectional LSTM\n" ], "metadata": { "id": "iD0gMTx-jfP4" } },
{ "cell_type": "code", "source": [
"from sklearn.model_selection import train_test_split\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras.models import Sequential\n",
"from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional\n",
"\n",
"# Vocabulary cap handed to the Tokenizer, and the fixed sequence length\n",
"# every review is padded or truncated to.\n",
"max_words = 10000\n",
"max_len = 200\n",
"\n",
"# Turn each cleaned review into a fixed-length row of integer word ids.\n",
"tokenizer = Tokenizer(num_words=max_words)\n",
"tokenizer.fit_on_texts(df[\"cleaned_review\"])\n",
"sequences = tokenizer.texts_to_sequences(df[\"cleaned_review\"])\n",
"X = pad_sequences(sequences, maxlen=max_len)\n",
"\n",
"# Encode the labels: positive -> 1, negative -> 0.\n",
"y = df[\"sentiment\"].map({\"positive\": 1, \"negative\": 0}).values\n",
"\n",
"# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable.\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"model = Sequential()\n",
"# Architecture: embedding -> bidirectional LSTM -> dropout -> one sigmoid unit.\n",
"model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))\n",
"model.add(Bidirectional(LSTM(64)))\n",
"model.add(Dropout(0.5))\n",
"model.add(Dense(1, activation='sigmoid'))\n",
"\n",
"# Binary labels with a sigmoid output, hence binary cross-entropy.\n",
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
"model.summary()\n",
"\n",
"# validation_split=0.1 makes Keras set aside 10% of the training data for validation.\n",
"history = model.fit(\n",
"    X_train, y_train,\n",
"    epochs=5,\n",
"    batch_size=128,\n",
"    validation_split=0.1\n",
")\n",
"\n",
"# Final check on the test split, which was not used for fitting.\n",
"loss, accuracy = model.evaluate(X_test, y_test)\n",
"print(\"LSTM Test Accuracy:\", accuracy)\n"
], "metadata": { "id": "ZX3VcwlOP2EK" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# NOTE(review): swapping scipy in a live kernel normally needs a runtime restart,\n",
"# which would discard the model trained above - consider moving this next to the\n",
"# earlier install cells and running it before any training.\n",
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip uninstall -y shap scipy\n",
"%pip install scipy==1.10.1\n",
"%pip install shap==0.42.1"
], "metadata": { "id": "2_DekM_ijieI" }, "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "source": [ "## Explain one LSTM prediction with SHAP\n" ], "metadata": {} },
{ "cell_type": "code", "source": [
"import shap\n",
"import numpy as np\n",
"\n",
"# Background sample used by DeepExplainer as the reference distribution.\n",
"X_background = X_train[:100]\n",
"explainer = shap.DeepExplainer(model, X_background)\n",
"\n",
"# Explain a single test review; the slice keeps it 2-D, i.e. a batch of one.\n",
"sample_review = X_test[0:1]\n",
"shap_values = explainer.shap_values(sample_review)\n",
"\n",
"# Id 0 is padding added by pad_sequences. Drop it from BOTH the tokens and the\n",
"# attributions with the same mask; filtering only the words (as before) leaves\n",
"# the two with different lengths and the labels shifted against the values.\n",
"token_ids = sample_review[0]\n",
"is_token = token_ids != 0\n",
"review_shap = np.asarray(shap_values[0][0])[is_token]\n",
"\n",
"index_word = {v: k for k, v in tokenizer.word_index.items()}\n",
"words = [index_word.get(i, '') for i in token_ids[is_token]]\n",
"assert len(words) == len(review_shap), 'tokens and SHAP values must align'\n",
"\n",
"shap.initjs()\n",
"# The words are labels, so pass them as feature_names rather than positionally.\n",
"shap.force_plot(explainer.expected_value[0], review_shap, feature_names=words)\n"
], "metadata": { "id": "D1zFJ5gwP2SC" }, "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "source": [ "Logistic Regression is a simple baseline model for sentiment analysis: it uses features such as bag-of-words or TF-IDF but ignores word order. LSTM, on the other hand, is a deep learning model that captures sequence and context, which typically makes it more accurate at understanding movie reviews." ], "metadata": { "id": "SReiT-0giWKR" } } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 0 }