{ "cells": [
 { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] },
 { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [], "source": [
  "# Load the dataset from bbc_data.csv, resolved against the working directory\n",
  "df = pd.read_csv(filepath_or_buffer=\"./bbc_data.csv\")\n" ] },
 { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datalabels
0Musicians to tackle US red tape Musicians gro...entertainment
1U2s desire to be number one U2, who have won ...entertainment
2Rocker Doherty in on-stage fight Rock singer ...entertainment
3Snicket tops US box office chart The film ada...entertainment
4Oceans Twelve raids box office Oceans Twelve,...entertainment
.........
2220Warning over Windows Word files Writing a Mic...tech
2221Fast lifts rise into record books Two high-sp...tech
2222Nintendo adds media playing to DS Nintendo is...tech
2223Fast moving phone viruses appear Security fir...tech
2224Hacker threat to Apples iTunes Users of Apple...tech
\n", "

2225 rows × 2 columns

\n", "
" ], "text/plain": [ " data labels\n", "0 Musicians to tackle US red tape Musicians gro... entertainment\n", "1 U2s desire to be number one U2, who have won ... entertainment\n", "2 Rocker Doherty in on-stage fight Rock singer ... entertainment\n", "3 Snicket tops US box office chart The film ada... entertainment\n", "4 Oceans Twelve raids box office Oceans Twelve,... entertainment\n", "... ... ...\n", "2220 Warning over Windows Word files Writing a Mic... tech\n", "2221 Fast lifts rise into record books Two high-sp... tech\n", "2222 Nintendo adds media playing to DS Nintendo is... tech\n", "2223 Fast moving phone viruses appear Security fir... tech\n", "2224 Hacker threat to Apples iTunes Users of Apple... tech\n", "\n", "[2225 rows x 2 columns]" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] },
 { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "labels\n", "sport 511\n", "business 510\n", "politics 417\n", "tech 401\n", "entertainment 386\n", "Name: count, dtype: int64" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Number of rows per label\n",
  "df[\"labels\"].value_counts()" ] },
 { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "data 0\n", "labels 0\n", "dtype: int64" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Missing values per column (isna is the same check as isnull)\n",
  "df.isna().sum()" ] },
 { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [
  "# Feature column (article text) and target column (label)\n",
  "X, y = df['data'], df['labels']\n" ] },
 { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\saipr\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading 
package punkt to\n", "[nltk_data] C:\\Users\\saipr\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] } ], "source": [
  "import re\n",
  "\n",
  "import nltk\n",
  "from nltk.corpus import stopwords\n",
  "from nltk.tokenize import word_tokenize\n",
  "\n",
  "# Ensure required NLTK data is available. Newer NLTK releases load the\n",
  "# tokenizer tables from 'punkt_tab' rather than 'punkt', so request both.\n",
  "# quiet=True keeps per-run progress messages (and local paths) out of the output.\n",
  "for resource in ('stopwords', 'punkt', 'punkt_tab'):\n",
  "    nltk.download(resource, quiet=True)\n",
  "\n",
  "# Load stopwords only once\n",
  "stop_words = set(stopwords.words('english'))\n" ] },
 { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [
  "def preprocess_text(text):\n",
  "    \"\"\"Lower-case `text`, strip punctuation and drop English stopwords.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    text : str\n",
  "        Raw document text. Any non-string value (e.g. NaN from an empty\n",
  "        CSV field) is treated as an empty document.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    str\n",
  "        The remaining tokens joined by single spaces.\n",
  "    \"\"\"\n",
  "    if not isinstance(text, str):\n",
  "        return ''\n",
  "    # Remove punctuation and convert to lower case\n",
  "    text = re.sub(r'[^\\w\\s]', '', text.lower())\n",
  "    # Tokenize, then keep only the tokens that are not stopwords\n",
  "    tokens = word_tokenize(text)\n",
  "    return ' '.join(word for word in tokens if word not in stop_words)\n",
  "\n",
  "# Apply preprocessing to the dataframe\n",
  "df['processed_data'] = df['data'].apply(preprocess_text)\n" ] },
 { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [
  "from sklearn.model_selection import train_test_split # Import the function\n",
  "\n",
  "# Split the data into training and testing sets.\n",
  "# Use the preprocessed column: splitting the raw `X` would leave\n",
  "# df['processed_data'] computed but never used by the model.\n",
  "# NOTE: text passed to the fitted vectorizer later should go through\n",
  "# preprocess_text as well, so training and inference see the same form.\n",
  "X_train, X_test, y_train, y_test = train_test_split(\n",
  "    df['processed_data'], y, test_size=0.2, random_state=42\n",
  ")\n" ] },
 { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [
  "from sklearn.feature_extraction.text import TfidfVectorizer # Import TfidfVectorizer\n",
  "\n",
  "# Convert text data to numerical data using TF-IDF.\n",
  "# Fit on the training split only; the test split is transformed with the same vocabulary.\n",
  "vectorizer = TfidfVectorizer(stop_words='english')\n",
  "X_train_tfidf = vectorizer.fit_transform(X_train)\n",
  "X_test_tfidf = vectorizer.transform(X_test)\n" ] },
 { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LogisticRegression()" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "from sklearn.linear_model import LogisticRegression # Import LogisticRegression\n",
  "\n",
  "# Fit a logistic regression classifier on the TF-IDF training matrix.\n",
  "# fit() returns the estimator itself, so it can be bound in one statement.\n",
  "model = LogisticRegression().fit(X_train_tfidf, y_train)\n",
  "model" ] },
 { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [
  "# Predict a label for every document in the held-out test matrix\n",
  "y_pred = model.predict(X_test_tfidf)" ] },
 { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.9887640449438202\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " business 0.97 1.00 0.99 103\n", "entertainment 1.00 0.98 0.99 84\n", " politics 0.98 0.99 0.98 80\n", " sport 1.00 0.99 0.99 98\n", " tech 1.00 0.99 0.99 80\n", "\n", " accuracy 0.99 445\n", " macro avg 0.99 0.99 0.99 445\n", " weighted avg 0.99 0.99 0.99 445\n", "\n" ] } ], "source": [
  "from sklearn.metrics import accuracy_score, classification_report # Import evaluation metrics\n",
  "\n",
  "# Compute the metrics first, then print them\n",
  "accuracy = accuracy_score(y_test, y_pred)\n",
  "report = classification_report(y_test, y_pred)\n",
  "print(\"Accuracy:\", accuracy)\n",
  "print(\"Classification Report:\\n\", report)\n" ] },
 { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "confusion_matrix:\n", " [[103 0 0 0 0]\n", " [ 0 82 2 0 0]\n", " [ 1 0 79 0 0]\n", " [ 1 0 0 97 0]\n", " [ 1 0 0 0 79]]\n" ] } ], "source": [
  "from sklearn.metrics import confusion_matrix\n",
  "\n",
  "# Called as (true labels, predicted labels)\n",
  "cm = confusion_matrix(y_test, y_pred)\n",
  "print(\"confusion_matrix:\\n\", cm)\n" ] },
 { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['model.pkl']" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "import joblib\n",
  "\n",
  "# Save the trained classifier to model.pkl\n",
  "joblib.dump(model, filename='model.pkl')" ] }, { 
"cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['vectorizer.pkl']" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "import joblib\n",
  "\n",
  "# Save the fitted TF-IDF vectorizer to vectorizer.pkl\n",
  "joblib.dump(vectorizer, filename='vectorizer.pkl')" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 2 }