{ "cells": [
 { "cell_type": "code", "execution_count": null, "id": "initial_id", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T03:30:06.906158Z", "start_time": "2025-09-16T03:30:06.897210Z" } }, "outputs": [], "source": [
  "import numpy as np\n",
  "\n",
  "# Coarse bucket for each raw `categories` label that is handled explicitly.\n",
  "category_mapping = {\n",
  "    'Fiction': 'Fiction',\n",
  "    'Juvenile Fiction': \"Children's Fiction\",\n",
  "    'Biography & Autobiography': 'Nonfiction',\n",
  "    'History': 'Nonfiction',\n",
  "    'Literary Criticism': 'Nonfiction',\n",
  "    'Philosophy': 'Nonfiction',\n",
  "    'Religion': 'Nonfiction',\n",
  "    'Comics & Graphic Novels': 'Fiction',\n",
  "    'Juvenile Nonfiction': \"Children's Nonfiction\",\n",
  "    'Science': 'Nonfiction',\n",
  "    'Poetry': 'Fiction',\n",
  "}"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "abd407fcfb12529f", "metadata": { "ExecuteTime": { "end_time": "2025-09-15T09:11:18.779297Z", "start_time": "2025-09-15T09:11:18.685368Z" } }, "outputs": [], "source": [
  "import pandas as pd\n",
  "\n",
  "books = pd.read_csv(\"books_cleaned.csv\")"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "8730b04764af7caa", "metadata": { "ExecuteTime": { "end_time": "2025-09-15T09:12:06.202207Z", "start_time": "2025-09-15T09:12:06.190052Z" } }, "outputs": [], "source": [
  "# Labels that have no entry in category_mapping come out of .map() as NaN.\n",
  "books[\"simple_categories\"] = books[\"categories\"].map(category_mapping)"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "17b0fe2cfe81778b", "metadata": { "ExecuteTime": { "end_time": "2025-09-15T09:13:56.419141Z", "start_time": "2025-09-15T09:13:56.325655Z" } }, "outputs": [], "source": [
  "books"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "410d16934dfe2383", "metadata": { "ExecuteTime": { "end_time": "2025-09-15T09:39:48.441516Z", "start_time": "2025-09-15T09:39:48.396466Z" } }, "outputs": [], "source": [
  "# Rows that received a simple category from the mapping.\n",
  "books[books[\"simple_categories\"].notna()]"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "a0d8dcd913296e3d", "metadata": { "ExecuteTime": { "end_time": "2025-09-15T10:23:22.076926Z", "start_time": 
"2025-09-15T10:20:53.043882Z" } }, "outputs": [], "source": [
  "%pip install -q hf_xet\n",
  "\n",
  "# Candidate labels for the zero-shot classifier.\n",
  "# The model itself is loaded once, in the loading cell further down, which picks\n",
  "# CPU or GPU automatically; a second load here with device=\"cuda\" crashed on\n",
  "# machines without a GPU and duplicated the download/initialisation work.\n",
  "fiction_categories = ['Fiction', 'Nonfiction']\n"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "cd9edaa3ee8c1243", "metadata": { "ExecuteTime": { "end_time": "2025-09-15T10:23:46.232544Z", "start_time": "2025-09-15T10:23:43.525543Z" } }, "outputs": [], "source": [
  "# %pip (unlike !pip) installs into the environment of the running kernel.\n",
  "%pip install -q --upgrade huggingface_hub\n"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "83b78716648ebbe6", "metadata": { "ExecuteTime": { "end_time": "2025-09-15T10:23:55.154934Z", "start_time": "2025-09-15T10:23:53.226725Z" } }, "outputs": [], "source": [
  "%pip install -q \"huggingface_hub[hf_xet]\"\n"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "8d02bd90c594fbac", "metadata": { "ExecuteTime": { "end_time": "2025-09-15T10:24:14.628937Z", "start_time": "2025-09-15T10:24:12.758899Z" } }, "outputs": [], "source": [
  "%pip show huggingface_hub\n",
  "%pip show hf_xet\n"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "83e5151bdc46709a", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:10:04.964668Z", "start_time": "2025-09-16T04:10:01.587200Z" } }, "outputs": [], "source": [
  "import os\n",
  "\n",
  "# huggingface_hub reads its HF_HUB_* settings from the environment when it is\n",
  "# imported, so the timeout has to be set before transformers pulls it in.\n",
  "os.environ.setdefault(\"HF_HUB_DOWNLOAD_TIMEOUT\", \"120\")\n",
  "\n",
  "import torch\n",
  "from transformers import pipeline\n",
  "\n",
  "print(\"Loading model... 
(this may take a few minutes on first run)\")\n",
  "use_gpu = torch.cuda.is_available()\n",
  "print(f\"CUDA available: {use_gpu}\")\n",
  "if use_gpu:\n",
  "    print(f\"GPU device: {torch.cuda.get_device_name(0)}\")\n",
  "\n",
  "# transformers device convention: 0 = first CUDA device, -1 = CPU.\n",
  "device = 0 if use_gpu else -1\n",
  "\n",
  "try:\n",
  "    # NOTE(review): huggingface_hub may already have read this variable at import\n",
  "    # time; setting it before importing transformers is the safe place.\n",
  "    os.environ[\"HF_HUB_DOWNLOAD_TIMEOUT\"] = \"120\"\n",
  "\n",
  "    pipe = pipeline(\n",
  "        \"zero-shot-classification\",\n",
  "        model=\"facebook/bart-large-mnli\",\n",
  "        device=device,\n",
  "        batch_size=64,  # internal pipeline batch size\n",
  "        # NOTE(review): confirm the zero-shot pipeline honours max_length/truncation\n",
  "        # when they are passed at construction time.\n",
  "        max_length=512,\n",
  "        truncation=True,\n",
  "        token=False,  # replaces the deprecated use_auth_token=False\n",
  "        revision=\"main\",\n",
  "    )\n",
  "\n",
  "    print(\"✅ Model loaded successfully with GPU acceleration!\" if use_gpu else \"✅ Model loaded (CPU mode)\")\n",
  "\n",
  "except Exception as e:\n",
  "    print(f\"Error with facebook/bart-large-mnli: {e}\")\n",
  "    print(\"\\n🔄 Trying alternative model...\")\n",
  "\n",
  "    try:\n",
  "        # Smaller fallback model; its predictions can differ from BART's.\n",
  "        pipe = pipeline(\n",
  "            \"zero-shot-classification\",\n",
  "            model=\"typeform/distilbert-base-uncased-mnli\",\n",
  "            device=device,\n",
  "            batch_size=64,\n",
  "            max_length=512,\n",
  "            truncation=True,\n",
  "        )\n",
  "\n",
  "        print(\"✅ Alternative model loaded successfully!\")\n",
  "\n",
  "    except Exception as e2:\n",
  "        print(f\"❌ Error with alternative model: {e2}\")\n",
  "        print(\"Please check your internet connection and try again.\")\n",
  "        # Without a classifier every later cell would fail with a NameError\n",
  "        # (or silently reuse a stale `pipe`), so stop the run here.\n",
  "        raise\n"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "80bc187fbfff3e10", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:15:12.296956Z", "start_time": "2025-09-16T04:15:12.116659Z" } }, "outputs": [], "source": [
  "# First Fiction description, used as a smoke-test input for the classifier.\n",
  "sequence = books.loc[books[\"simple_categories\"] == 'Fiction', 'description'].reset_index(drop=True)[0]"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "8ba6836b2c958329", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:15:21.478795Z", "start_time": 
"2025-09-16T04:15:14.044833Z" } }, "outputs": [], "source": [
  "pipe(sequence, fiction_categories)"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "23f2c1d7a1c73945", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T03:30:19.621730Z", "start_time": "2025-09-16T03:30:12.489364Z" } }, "outputs": [], "source": [
  "# Run the model once and reuse the result instead of paying for two forward passes.\n",
  "sample_result = pipe(sequence, fiction_categories)\n",
  "max_index = np.argmax(sample_result[\"scores\"])\n",
  "max_label = sample_result[\"labels\"][max_index]\n",
  "max_label"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "eb1273971a44738c", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:15:21.672845Z", "start_time": "2025-09-16T04:15:21.660563Z" } }, "outputs": [], "source": [
  "from tqdm import tqdm\n",
  "import pandas as pd\n",
  "import time\n",
  "from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor\n",
  "import multiprocessing as mp\n",
  "\n",
  "\n",
  "def generate_predictions(sequences, categories, batch_size=32):\n",
  "    \"\"\"Return the top zero-shot label for every text in `sequences`.\n",
  "\n",
  "    The texts are sent to the notebook-level `pipe` in chunks of `batch_size`\n",
  "    and the first label of each result is kept (the zero-shot pipeline lists\n",
  "    labels by descending score).\n",
  "\n",
  "    Args:\n",
  "        sequences: Iterable of strings to classify (list, pandas Series, ...).\n",
  "        categories: Candidate labels handed to the pipeline.\n",
  "        batch_size: Number of texts per pipeline call; must be >= 1.\n",
  "\n",
  "    Returns:\n",
  "        list: One predicted label per input text, in input order.\n",
  "\n",
  "    Raises:\n",
  "        ValueError: If `batch_size` is smaller than 1.\n",
  "    \"\"\"\n",
  "    if batch_size < 1:\n",
  "        # A negative step would make range() empty and silently return [].\n",
  "        raise ValueError(f\"batch_size must be >= 1, got {batch_size}\")\n",
  "\n",
  "    # Materialise once so a Series or generator is sliced positionally and the\n",
  "    # pipeline always receives a plain list of strings.\n",
  "    texts = list(sequences)\n",
  "    predictions = []\n",
  "\n",
  "    for start in tqdm(range(0, len(texts), batch_size), desc=\"Processing batches\"):\n",
  "        batch_results = pipe(texts[start:start + batch_size], categories)\n",
  "\n",
  "        # The pipeline may hand back a bare result instead of a list.\n",
  "        if not isinstance(batch_results, list):\n",
  "            batch_results = [batch_results]\n",
  "        predictions.extend(result['labels'][0] for result in batch_results)\n",
  "\n",
  "    return predictions"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "7d024a18309a521d", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:21:02.847544Z", "start_time": "2025-09-16T04:15:23.714181Z" } }, "outputs": [], "source": [
  "# Get 300 nonfiction descriptions\n",
  "nonfiction_books = books.loc[books[\"simple_categories\"] == 'Nonfiction', 
'description'].reset_index(drop=True)[:300]\n",
  "\n",
  "# Truncate for speed\n",
  "sequences = [desc[:400] for desc in nonfiction_books]\n",
  "\n",
  "# Classify in batches of 20. Both result lists are built from scratch here so the\n",
  "# cell works on a fresh kernel and gives the same result when it is re-run.\n",
  "predicted_cats = generate_predictions(sequences, fiction_categories, batch_size=20)\n",
  "\n",
  "# NOTE: only Nonfiction samples are scored, so the accuracy computed below is the\n",
  "# hit rate on Nonfiction alone, not a balanced Fiction/Nonfiction accuracy.\n",
  "actual_cats = ['Nonfiction'] * len(sequences)"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "fdc40689dfadf1", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:21:08.483550Z", "start_time": "2025-09-16T04:21:08.405904Z" } }, "outputs": [], "source": [
  "predicted_df = pd.DataFrame({\"actual_categories\": actual_cats, \"predicted_categories\": predicted_cats})"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "ed0907a9093b94d0", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:21:16.539324Z", "start_time": "2025-09-16T04:21:16.384515Z" } }, "outputs": [], "source": [
  "predicted_df.head()"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "87d924edea28b476", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:21:19.825460Z", "start_time": "2025-09-16T04:21:19.795117Z" } }, "outputs": [], "source": [
  "# 1 where the predicted label matches the actual label, 0 otherwise.\n",
  "predicted_df['correct_prediction'] = np.where(\n",
  "    predicted_df['actual_categories'] == predicted_df['predicted_categories'], 1, 0\n",
  ")"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "6c25043f2e0d694a", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:21:22.040362Z", "start_time": "2025-09-16T04:21:22.019264Z" } }, "outputs": [], "source": [
  "# Share of correct predictions.\n",
  "predicted_df['correct_prediction'].mean()"
 ] },
 { "cell_type": "code", "execution_count": null, "id": 
"3c3611fc62b1d8df", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:21:24.159383Z", "start_time": "2025-09-16T04:21:24.001792Z" } }, "outputs": [], "source": [
  "# Books that still have no simple category and need a model prediction.\n",
  "missing_cats = books.loc[books['simple_categories'].isna(), ['isbn13', 'description']].reset_index(drop=True)"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "5a6ee7c312cc4605", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:48:29.368260Z", "start_time": "2025-09-16T04:47:55.181816Z" } }, "outputs": [], "source": [
  "# Truncate every description to 200 characters and drop rows without usable text.\n",
  "# The ISBNs are filtered with the same mask so they stay aligned with `sequences`.\n",
  "truncated = [str(desc)[:200] if pd.notna(desc) else \"\" for desc in missing_cats[\"description\"]]\n",
  "has_text = [bool(text.strip()) for text in truncated]\n",
  "\n",
  "sequences = [text for text, keep in zip(truncated, has_text) if keep]\n",
  "isbns = [isbn for isbn, keep in zip(missing_cats[\"isbn13\"].tolist(), has_text) if keep]\n",
  "\n",
  "predicted_cats = generate_predictions(sequences, fiction_categories, batch_size=128)\n"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "4561a0670452fa3b", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:51:30.775050Z", "start_time": "2025-09-16T04:51:30.573483Z" } }, "outputs": [], "source": [
  "# Fail loudly on a length mismatch instead of silently truncating the ISBN list.\n",
  "assert len(isbns) == len(predicted_cats), (\n",
  "    f\"{len(isbns)} ISBNs but {len(predicted_cats)} predictions\"\n",
  ")\n",
  "\n",
  "missing_predicted_df = pd.DataFrame({\n",
  "    \"isbn13\": isbns,\n",
  "    \"predicted_categories\": predicted_cats,\n",
  "})\n",
  "\n",
  "print(f\"✅ DataFrame created successfully with {len(missing_predicted_df)} rows\")\n",
  "print(\"📊 Predictions by category:\")\n", 
"print(missing_predicted_df['predicted_categories'].value_counts())\n",
  "\n",
  "# Save results\n",
  "missing_predicted_df.to_csv('missing_categories_predictions.csv', index=False)\n",
  "print(\"💾 Results saved to missing_categories_predictions.csv\")"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "72fe9a8b4b28a1c6", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T04:52:15.607087Z", "start_time": "2025-09-16T04:52:15.520116Z" } }, "outputs": [], "source": [
  "missing_predicted_df.head()"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "8b1f7af8aebf289e", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T05:00:37.137998Z", "start_time": "2025-09-16T05:00:36.885979Z" } }, "outputs": [], "source": [
  "# Keep one prediction per ISBN: a duplicated key on the right-hand side would\n",
  "# multiply the matching rows of `books` in the left join.\n",
  "predictions_by_isbn = missing_predicted_df.drop_duplicates(subset=\"isbn13\")\n",
  "\n",
  "books = pd.merge(books, predictions_by_isbn, on=\"isbn13\", how=\"left\")\n",
  "# Keep the mapped category where there is one; otherwise use the model's prediction.\n",
  "books[\"simple_categories\"] = books[\"simple_categories\"].fillna(books[\"predicted_categories\"])\n",
  "books = books.drop(columns=\"predicted_categories\")"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "fe5b161193dab1f", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T05:00:50.989276Z", "start_time": "2025-09-16T05:00:50.952202Z" } }, "outputs": [], "source": [
  "books"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "9d2e1a8dbbd5d6bc", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T05:01:27.850818Z", "start_time": "2025-09-16T05:01:27.781563Z" } }, "outputs": [], "source": [
  "# Rows whose raw category (case-insensitive) is one of these genre labels.\n",
  "books[books[\"categories\"].str.lower().isin([\n",
  "    \"romance\",\n",
  "    \"science fiction\",\n",
  "    \"scifi\",\n",
  "    \"fantasy\",\n",
  "    \"horror\",\n",
  "    \"mystery\",\n",
  "    \"thriller\",\n",
  "    \"comedy\",\n",
  "    \"crime\",\n",
  "    \"historical\"\n",
  "])]"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "bd067ee0696cac0b", "metadata": { "ExecuteTime": { "end_time": "2025-09-16T05:04:09.432347Z", "start_time": "2025-09-16T05:04:09.246658Z" } }, "outputs": [], "source": [ 
"books.to_csv(\"books_with_categories.csv\", index=False)" ] }
 ],
 "metadata": {
  "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" },
  "language_info": {
   "codemirror_mode": { "name": "ipython", "version": 3 },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4, "nbformat_minor": 5 }