{ "nbformat": 4, "nbformat_minor": 5, "metadata": { "colab": { "provenance": [] }, "language_info": { "name": "python" }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "id": "4b032f5a", "cell_type": "markdown", "source": [ "\n", "# **Text Classification (Traditional ML)** \n", "### Spam Detection using TF-IDF + Naïve Bayes\n", "\n", "This notebook covers:\n", "- Sentiment / Text Classification Basics \n", "- Train/Test Split & Evaluation Metrics \n", "- TF-IDF Feature Extraction \n", "- Naïve Bayes Model for Spam Detection \n" ], "metadata": { "id": "4b032f5a" } }, { "id": "9848cd98", "cell_type": "code", "metadata": { "id": "9848cd98" }, "execution_count": 1, "source": [ "\n", "# Install required libraries (if not present)\n", "# !pip install scikit-learn pandas\n" ], "outputs": [] }, { "cell_type": "code", "source": [ "import pandas as pd\n", "import os\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "import joblib\n", "import kagglehub" ], "metadata": { "id": "XSLfGWSEMW7K" }, "id": "XSLfGWSEMW7K", "execution_count": 2, "outputs": [] }, { "cell_type": "markdown", "source": [ "# 1. 
Download & Load Dataset" ], "metadata": { "id": "2X8GrfqeMX7R" }, "id": "2X8GrfqeMX7R" }, { "cell_type": "code", "source": [ "dataset_path = kagglehub.dataset_download(\"uciml/sms-spam-collection-dataset\")\n", "\n", "# Locate the CSV file automatically. Sorting makes the choice deterministic, and the\n", "# explicit check fails loudly instead of leaving `data_file` undefined (or stale from\n", "# an earlier run) when the download contains no CSV.\n", "csv_files = sorted(name for name in os.listdir(dataset_path) if name.endswith(\".csv\"))\n", "if not csv_files:\n", "    raise FileNotFoundError(f\"No .csv file found in {dataset_path}\")\n", "data_file = os.path.join(dataset_path, csv_files[0])\n", "\n", "print(\"Using dataset file:\", data_file)\n", "\n", "# Keep only the label (v1) and message text (v2) columns, under readable names.\n", "df = pd.read_csv(data_file, encoding=\"latin-1\")[['v1','v2']]\n", "df.columns = ['label', 'text']" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8xu6lXODMXgi", "outputId": "2edde3cc-0191-4d91-9036-73c6ee94bc99" }, "id": "8xu6lXODMXgi", "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Using Colab cache for faster access to the 'sms-spam-collection-dataset' dataset.\n", "Using dataset file: /kaggle/input/sms-spam-collection-dataset/spam.csv\n" ] } ] }, { "cell_type": "markdown", "source": [ "# 2. Prepare Data" ], "metadata": { "id": "ZEDris-IMb3i" }, "id": "ZEDris-IMb3i" }, { "cell_type": "code", "source": [ "X = df['text']\n", "y = df['label']\n", "\n", "# Split the raw text BEFORE vectorizing so the TF-IDF vocabulary and IDF weights are\n", "# learned from the training messages only; fitting on the full corpus would leak\n", "# test-set statistics into the features and inflate the evaluation.\n", "X_train_text, X_test_text, y_train, y_test = train_test_split(\n", "    X, y, test_size=0.2, random_state=42\n", ")\n", "\n", "vectorizer = TfidfVectorizer(stop_words='english')\n", "X_train = vectorizer.fit_transform(X_train_text)\n", "X_test = vectorizer.transform(X_test_text)\n", "\n", "# Full-corpus matrix under its original name, in case a later cell still refers to it.\n", "X_tfidf = vectorizer.transform(X)" ], "metadata": { "id": "z8WqtF9IMdRq" }, "id": "z8WqtF9IMdRq", "execution_count": 4, "outputs": [] }, { "cell_type": "markdown", "source": [ "# 3. 
Train Model" ], "metadata": { "id": "nwzZ9jd9Me8x" }, "id": "nwzZ9jd9Me8x" }, { "cell_type": "code", "source": [ "model = MultinomialNB()\n", "model.fit(X_train, y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "pX9HVNtdMhc6", "outputId": "cd7a59da-c810-414e-ef0b-8b71e5026150" }, "id": "pX9HVNtdMhc6", "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "MultinomialNB()" ], "text/html": [ "
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()