{ "nbformat": 4, "nbformat_minor": 5, "metadata": { "colab": { "provenance": [] }, "language_info": { "name": "python" }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "id": "4b032f5a", "cell_type": "markdown", "source": [ "\n", "# **Text Classification (Traditional ML)** \n", "### Spam Detection using TF-IDF + Naïve Bayes\n", "\n", "This notebook covers:\n", "- Sentiment / Text Classification Basics \n", "- Train/Test Split & Evaluation Metrics \n", "- TF-IDF Feature Extraction \n", "- Naïve Bayes Model for Spam Detection \n" ], "metadata": { "id": "4b032f5a" } }, { "id": "9848cd98", "cell_type": "code", "metadata": { "id": "9848cd98" }, "execution_count": 1, "source": [ "\n", "# Install required libraries (if not present)\n", "# !pip install scikit-learn pandas\n" ], "outputs": [] }, { "cell_type": "code", "source": [ "import pandas as pd\n", "import os\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "import joblib\n", "import kagglehub" ], "metadata": { "id": "XSLfGWSEMW7K" }, "id": "XSLfGWSEMW7K", "execution_count": 2, "outputs": [] }, { "cell_type": "markdown", "source": [ "# 1. 
Download & Load Dataset" ], "metadata": { "id": "2X8GrfqeMX7R" }, "id": "2X8GrfqeMX7R" },
{ "cell_type": "code", "source": [
  "dataset_path = kagglehub.dataset_download(\"uciml/sms-spam-collection-dataset\")\n",
  "\n",
  "# Locate the CSV file automatically. Sorting makes the choice deterministic,\n",
  "# and failing loudly here avoids a confusing NameError on `data_file` below\n",
  "# when the download directory contains no CSV at all.\n",
  "csv_files = sorted(name for name in os.listdir(dataset_path) if name.endswith(\".csv\"))\n",
  "if not csv_files:\n",
  "    raise FileNotFoundError(f\"No .csv file found in {dataset_path}\")\n",
  "data_file = os.path.join(dataset_path, csv_files[0])\n",
  "\n",
  "print(\"Using dataset file:\", data_file)\n",
  "\n",
  "# Keep only the v1/v2 columns and give them readable names (label, text).\n",
  "# `rename` returns a new frame, so re-running this cell is idempotent.\n",
  "df = pd.read_csv(data_file, encoding=\"latin-1\")[['v1', 'v2']].rename(\n",
  "    columns={'v1': 'label', 'v2': 'text'}\n",
  ")"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8xu6lXODMXgi", "outputId": "2edde3cc-0191-4d91-9036-73c6ee94bc99" }, "id": "8xu6lXODMXgi", "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Using Colab cache for faster access to the 'sms-spam-collection-dataset' dataset.\n", "Using dataset file: /kaggle/input/sms-spam-collection-dataset/spam.csv\n" ] } ] },
{ "cell_type": "markdown", "source": [ "# 2. Prepare Data" ], "metadata": { "id": "ZEDris-IMb3i" }, "id": "ZEDris-IMb3i" },
{ "cell_type": "code", "source": [
  "X = df['text']\n",
  "y = df['label']\n",
  "\n",
  "# Split the raw text BEFORE vectorizing so the TF-IDF vocabulary and IDF weights\n",
  "# are learned from the training messages only (no test-set leakage).\n",
  "# `stratify=y` keeps the ham/spam ratio the same in both splits.\n",
  "X_train_text, X_test_text, y_train, y_test = train_test_split(\n",
  "    X, y, test_size=0.2, random_state=42, stratify=y\n",
  ")\n",
  "\n",
  "vectorizer = TfidfVectorizer(stop_words='english')\n",
  "X_train = vectorizer.fit_transform(X_train_text)  # learn vocabulary + IDF on train only\n",
  "X_test = vectorizer.transform(X_test_text)        # reuse the fitted vocabulary for test"
], "metadata": { "id": "z8WqtF9IMdRq" }, "id": "z8WqtF9IMdRq", "execution_count": 4, "outputs": [] },
{ "cell_type": "markdown", "source": [ "# 3. 
Train Model" ], "metadata": { "id": "nwzZ9jd9Me8x" }, "id": "nwzZ9jd9Me8x" },
{ "cell_type": "code", "source": [
  "# Train a multinomial Naive Bayes classifier on the training split.\n",
  "# `fit` hands back the estimator itself, so construction and training are chained;\n",
  "# the bare `model` on the last line keeps the estimator repr as the cell output.\n",
  "model = MultinomialNB().fit(X_train, y_train)\n",
  "model"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "pX9HVNtdMhc6", "outputId": "cd7a59da-c810-414e-ef0b-8b71e5026150" }, "id": "pX9HVNtdMhc6", "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "MultinomialNB()" ], "text/html": [ "
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "markdown", "source": [ "# 4. Evaluate" ], "metadata": { "id": "d_RTgsobMkei" }, "id": "d_RTgsobMkei" }, { "cell_type": "code", "source": [ "y_pred = model.predict(X_test)\n", "print(\"\\nClassification Report:\")\n", "print(classification_report(y_test, y_pred))\n", "print(\"Confusion Matrix:\")\n", "print(confusion_matrix(y_test, y_pred))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rcpZ9f5fMlFx", "outputId": "ea940cb4-0b58-4f6c-f642-d88fdbcfab97" }, "id": "rcpZ9f5fMlFx", "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " ham 0.96 1.00 0.98 965\n", " spam 1.00 0.77 0.87 150\n", "\n", " accuracy 0.97 1115\n", " macro avg 0.98 0.88 0.93 1115\n", "weighted avg 0.97 0.97 0.97 1115\n", "\n", "Confusion Matrix:\n", "[[965 0]\n", " [ 35 115]]\n" ] } ] }, { "cell_type": "markdown", "source": [ "# 5. Save Model & Vectorizer" ], "metadata": { "id": "s7dswNZjMmdp" }, "id": "s7dswNZjMmdp" }, { "cell_type": "code", "source": [ "joblib.dump(model, \"spam_classifier_model.joblib\")\n", "joblib.dump(vectorizer, \"tfidf_vectorizer.joblib\")\n", "print(\"\\nSaved model and vectorizer successfully.\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pkHTLlE-MoMC", "outputId": "2268c9cc-b804-48d8-dc79-2f25405f0836" }, "id": "pkHTLlE-MoMC", "execution_count": 7, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "Saved model and vectorizer successfully.\n" ] } ] }, { "cell_type": "markdown", "source": [ "# 6. Test With Some Messages" ], "metadata": { "id": "vocKYlo3Mpfh" }, "id": "vocKYlo3Mpfh" }, { "cell_type": "code", "source": [ "test_texts = [\n", " \"Congratulations! 
You have been selected to win a $1000 gift card!\",\n", " \"Hey, are we still on for the meeting tomorrow?\",\n", " \"Click this link to claim your exclusive reward!!!\",\n", " \"Can you send me the documents?\"\n", "]\n", "\n", "test_vectors = vectorizer.transform(test_texts)\n", "predictions = model.predict(test_vectors)\n", "\n", "print(\"\\nModel Predictions:\")\n", "for text, pred in zip(test_texts, predictions):\n", " print(f\"{pred.upper()} --> {text}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Mh2EfjHUMUTx", "outputId": "85fa11ca-610d-41e3-f242-cb8579655bf4" }, "id": "Mh2EfjHUMUTx", "execution_count": 8, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "Model Predictions:\n", "SPAM --> Congratulations! You have been selected to win a $1000 gift card!\n", "HAM --> Hey, are we still on for the meeting tomorrow?\n", "SPAM --> Click this link to claim your exclusive reward!!!\n", "HAM --> Can you send me the documents?\n" ] } ] } ] }