{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T03:30:06.906158Z",
     "start_time": "2025-09-16T03:30:06.897210Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Map selected raw `categories` values onto four broad labels: Fiction,\n",
    "# Children's Fiction, Nonfiction and Children's Nonfiction.\n",
    "category_mapping = {\n",
    "    'Fiction': 'Fiction',\n",
    "    'Juvenile Fiction': \"Children's Fiction\",\n",
    "    'Biography & Autobiography': 'Nonfiction',\n",
    "    'History': 'Nonfiction',\n",
    "    'Literary Criticism': 'Nonfiction',\n",
    "    'Philosophy': 'Nonfiction',\n",
    "    'Religion': 'Nonfiction',\n",
    "    'Comics & Graphic Novels': 'Fiction',\n",
    "    'Juvenile Nonfiction': \"Children's Nonfiction\",\n",
    "    'Science': 'Nonfiction',\n",
    "    'Poetry': 'Fiction',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abd407fcfb12529f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T09:11:18.779297Z",
     "start_time": "2025-09-15T09:11:18.685368Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# Load books_cleaned.csv (path relative to the working directory) into the `books` DataFrame.\n",
    "books = pd.read_csv(\"books_cleaned.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8730b04764af7caa",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T09:12:06.202207Z",
     "start_time": "2025-09-15T09:12:06.190052Z"
    }
   },
   "outputs": [],
   "source": [
    "# Translate each raw category through category_mapping. Values without an entry in the\n",
    "# mapping become NaN; later cells select those rows with .isna().\n",
    "books['simple_categories'] = books['categories'].map(category_mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17b0fe2cfe81778b",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T09:13:56.419141Z",
     "start_time": "2025-09-15T09:13:56.325655Z"
    }
   },
   "outputs": [],
   "source": [
    "books"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "410d16934dfe2383",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T09:39:48.441516Z",
     "start_time": "2025-09-15T09:39:48.396466Z"
    }
   },
   "outputs": [],
   "source": [
    "# Show only the rows that received a simplified category.\n",
    "books[books['simple_categories'].notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0d8dcd913296e3d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T10:23:22.076926Z",
     "start_time": "2025-09-15T10:20:53.043882Z"
    }
   },
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install hf_xet\n",
    "import torch\n",
    "from transformers import pipeline\n",
    "\n",
    "# Candidate labels handed to the zero-shot classifier.\n",
    "fiction_categories = ['Fiction', 'Nonfiction']\n",
    "\n",
    "# Use the GPU when one is present, otherwise fall back to the CPU instead of\n",
    "# failing on a hard-coded device=\"cuda\".\n",
    "pipe = pipeline(\n",
    "    \"zero-shot-classification\",\n",
    "    model=\"facebook/bart-large-mnli\",\n",
    "    device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd9edaa3ee8c1243",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T10:23:46.232544Z",
     "start_time": "2025-09-15T10:23:43.525543Z"
    }
   },
   "outputs": [],
   "source": [
    "# %pip targets the interpreter of the running kernel; !pip may hit a different environment.\n",
    "%pip install --upgrade huggingface_hub\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83b78716648ebbe6",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T10:23:55.154934Z",
     "start_time": "2025-09-15T10:23:53.226725Z"
    }
   },
   "outputs": [],
   "source": [
    "# %pip targets the interpreter of the running kernel; !pip may hit a different environment.\n",
    "%pip install \"huggingface_hub[hf_xet]\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d02bd90c594fbac",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T10:24:14.628937Z",
     "start_time": "2025-09-15T10:24:12.758899Z"
    }
   },
   "outputs": [],
   "source": [
    "# %pip runs pip for the kernel's own interpreter, so the versions shown are the ones this notebook imports.\n",
    "%pip show huggingface_hub\n",
    "%pip show hf_xet\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83e5151bdc46709a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:10:04.964668Z",
     "start_time": "2025-09-16T04:10:01.587200Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# huggingface_hub reads HF_HUB_DOWNLOAD_TIMEOUT when it is first imported, so the variable\n",
    "# is set before the transformers import below. NOTE: if transformers was already imported\n",
    "# in this kernel session, restart the kernel for the new timeout to apply.\n",
    "os.environ[\"HF_HUB_DOWNLOAD_TIMEOUT\"] = \"120\"\n",
    "\n",
    "import torch\n",
    "from transformers import pipeline\n",
    "\n",
    "print(\"Loading model... (this may take a few minutes on first run)\")\n",
    "cuda_available = torch.cuda.is_available()\n",
    "print(f\"CUDA available: {cuda_available}\")\n",
    "if cuda_available:\n",
    "    print(f\"GPU device: {torch.cuda.get_device_name(0)}\")\n",
    "\n",
    "# Pipeline device index: the first GPU when CUDA is available, otherwise -1 (CPU).\n",
    "device_index = 0 if cuda_available else -1\n",
    "\n",
    "try:\n",
    "    pipe = pipeline(\n",
    "        \"zero-shot-classification\",\n",
    "        model=\"facebook/bart-large-mnli\",\n",
    "        device=device_index,\n",
    "        batch_size=64,  # Internal pipeline batch size\n",
    "        max_length=512,\n",
    "        truncation=True,\n",
    "        token=False,  # replaces the deprecated use_auth_token=False\n",
    "        revision=\"main\"\n",
    "    )\n",
    "\n",
    "    print(\"✅ Model loaded successfully with GPU acceleration!\" if cuda_available else \"✅ Model loaded (CPU mode)\")\n",
    "\n",
    "except Exception as e:\n",
    "    # The primary model could not be loaded; report why and try a smaller MNLI model.\n",
    "    print(f\"Error with facebook/bart-large-mnli: {e}\")\n",
    "    print(\"\\n🔄 Trying alternative model...\")\n",
    "\n",
    "    try:\n",
    "        pipe = pipeline(\n",
    "            \"zero-shot-classification\",\n",
    "            model=\"typeform/distilbert-base-uncased-mnli\",\n",
    "            device=device_index,\n",
    "            batch_size=64,\n",
    "            max_length=512,\n",
    "            truncation=True\n",
    "        )\n",
    "\n",
    "        print(\"✅ Alternative model loaded successfully!\")\n",
    "\n",
    "    except Exception as e2:\n",
    "        print(f\"❌ Error with alternative model: {e2}\")\n",
    "        print(\"Please check your internet connection and try again.\")\n",
    "        # Re-raise so later cells do not run with a missing or stale `pipe`.\n",
    "        raise\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80bc187fbfff3e10",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:15:12.296956Z",
     "start_time": "2025-09-16T04:15:12.116659Z"
    }
   },
   "outputs": [],
   "source": [
    "# Take the first description among the books mapped to 'Fiction'; the following cells\n",
    "# pass it to the classifier as a single example.\n",
    "sequence = books.loc[books[\"simple_categories\"] == 'Fiction', 'description'].reset_index(drop=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ba6836b2c958329",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:15:21.478795Z",
     "start_time": "2025-09-16T04:15:14.044833Z"
    }
   },
   "outputs": [],
   "source": [
    "pipe(sequence, fiction_categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23f2c1d7a1c73945",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T03:30:19.621730Z",
     "start_time": "2025-09-16T03:30:12.489364Z"
    }
   },
   "outputs": [],
   "source": [
    "# Run the classifier once and reuse its output; the original called pipe() twice on the\n",
    "# same input, doubling the inference time for an identical result.\n",
    "prediction = pipe(sequence, fiction_categories)\n",
    "max_index = np.argmax(prediction[\"scores\"])\n",
    "max_label = prediction[\"labels\"][max_index]\n",
    "max_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb1273971a44738c",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:15:21.672845Z",
     "start_time": "2025-09-16T04:15:21.660563Z"
    }
   },
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "import time\n",
    "from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor\n",
    "import multiprocessing as mp\n",
    "\n",
    "# SOLUTION 1: Batch Processing (Most Important!)\n",
    "def generate_predictions(sequences, categories, batch_size=32):\n",
    "    \"\"\"Classify `sequences` chunk by chunk and collect one label per result.\n",
    "\n",
    "    Args:\n",
    "        sequences: sliceable collection of texts; it is cut into chunks of `batch_size`.\n",
    "        categories: candidate labels forwarded to the global `pipe` on every call.\n",
    "        batch_size: number of texts handed to `pipe` per call.\n",
    "\n",
    "    Returns:\n",
    "        A list holding the first entry of each result's 'labels' list, in input order\n",
    "        (presumably the best-scoring label; confirm against the pipeline documentation).\n",
    "    \"\"\"\n",
    "    top_labels = []\n",
    "    chunk_starts = range(0, len(sequences), batch_size)\n",
    "\n",
    "    for start in tqdm(chunk_starts, desc=\"Processing batches\"):\n",
    "        chunk = sequences[start:start + batch_size]\n",
    "        outputs = pipe(chunk, categories)\n",
    "\n",
    "        # The pipeline may hand back a bare dict instead of a list; wrap it so both\n",
    "        # shapes go through the same loop.\n",
    "        if not isinstance(outputs, list):\n",
    "            outputs = [outputs]\n",
    "\n",
    "        for output in outputs:\n",
    "            top_labels.append(output['labels'][0])\n",
    "\n",
    "    return top_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d024a18309a521d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:21:02.847544Z",
     "start_time": "2025-09-16T04:15:23.714181Z"
    }
   },
   "outputs": [],
   "source": [
    "# Accumulators for the evaluation. They must exist before the loop appends to them,\n",
    "# otherwise a fresh kernel raises NameError on the first `+=`.\n",
    "actual_cats = []\n",
    "preddicted_cats = []  # (sic) spelling kept because a later cell reads this exact name\n",
    "\n",
    "# Get 300 nonfiction descriptions\n",
    "nonfiction_books = books.loc[books[\"simple_categories\"] == 'Nonfiction', 'description'].reset_index(drop=True)[:300]\n",
    "\n",
    "# Truncate for speed\n",
    "sequences = [desc[:400] for desc in nonfiction_books]\n",
    "\n",
    "# Process in batches of 20 (instead of 300 individual calls)\n",
    "batch_size = 20\n",
    "for i in tqdm(range(0, len(sequences), batch_size)):\n",
    "    batch = sequences[i:i+batch_size]\n",
    "\n",
    "    # One model call for 20 books instead of 20 separate calls\n",
    "    results = pipe(batch, fiction_categories)\n",
    "\n",
    "    # Extract predictions (the pipeline may return a bare dict instead of a list)\n",
    "    if isinstance(results, list):\n",
    "        preddicted_cats += [r['labels'][0] for r in results]\n",
    "    else:\n",
    "        preddicted_cats += [results['labels'][0]]\n",
    "\n",
    "    # Every description in this sample comes from a book mapped to 'Nonfiction'.\n",
    "    actual_cats += ['Nonfiction'] * len(batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fdc40689dfadf1",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:21:08.483550Z",
     "start_time": "2025-09-16T04:21:08.405904Z"
    }
   },
   "outputs": [],
   "source": [
    "# Pair the known labels with the model's labels, one row per evaluated description.\n",
    "# `preddicted_cats` (sic) is the list filled by the evaluation loop in the previous cell.\n",
    "predicted_df = pd.DataFrame({\"actual_categories\": actual_cats, \"predicted_categories\": preddicted_cats})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed0907a9093b94d0",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:21:16.539324Z",
     "start_time": "2025-09-16T04:21:16.384515Z"
    }
   },
   "outputs": [],
   "source": [
    "predicted_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87d924edea28b476",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:21:19.825460Z",
     "start_time": "2025-09-16T04:21:19.795117Z"
    }
   },
   "outputs": [],
   "source": [
    "# Flag each row with 1 when the predicted label equals the actual label, else 0.\n",
    "labels_match = predicted_df['actual_categories'] == predicted_df['predicted_categories']\n",
    "predicted_df['correct_prediction'] = labels_match.astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c25043f2e0d694a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:21:22.040362Z",
     "start_time": "2025-09-16T04:21:22.019264Z"
    }
   },
   "outputs": [],
   "source": [
    "# Share of rows where the prediction matched the actual label.\n",
    "predicted_df['correct_prediction'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c3611fc62b1d8df",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:21:24.159383Z",
     "start_time": "2025-09-16T04:21:24.001792Z"
    }
   },
   "outputs": [],
   "source": [
    "# Start from empty result lists; both names are reassigned by the prediction cell below.\n",
    "isbns = []\n",
    "predicted_cats = []\n",
    "\n",
    "# Rows that have no simplified category yet, reduced to the ISBN and the description.\n",
    "missing_cats = books.loc[books['simple_categories'].isna(), ['isbn13', 'description']].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a6ee7c312cc4605",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:48:29.368260Z",
     "start_time": "2025-09-16T04:47:55.181816Z"
    }
   },
   "outputs": [],
   "source": [
    "# Filter ISBNs and descriptions together so every prediction stays aligned with its ISBN.\n",
    "# Descriptions are cut to 200 characters; missing or whitespace-only ones are skipped\n",
    "# (the original filtered only `sequences`, leaving `isbns` longer than the predictions).\n",
    "labelled_texts = [\n",
    "    (isbn, str(desc)[:200])\n",
    "    for isbn, desc in zip(missing_cats[\"isbn13\"].tolist(), missing_cats[\"description\"].tolist())\n",
    "    if pd.notna(desc) and str(desc)[:200].strip()\n",
    "]\n",
    "isbns = [isbn for isbn, _ in labelled_texts]\n",
    "sequences = [text for _, text in labelled_texts]\n",
    "\n",
    "# One label per remaining description, in the same order as `isbns`.\n",
    "predicted_cats = generate_predictions(sequences, fiction_categories, batch_size=128)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4561a0670452fa3b",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:51:30.775050Z",
     "start_time": "2025-09-16T04:51:30.573483Z"
    }
   },
   "outputs": [],
   "source": [
    "# Recompute which ISBNs survived the description filter used when building the sequences\n",
    "# (description present and, after cutting to 200 characters, not only whitespace).\n",
    "descriptions = missing_cats[\"description\"].tolist()\n",
    "isbns_full = missing_cats[\"isbn13\"].tolist()\n",
    "\n",
    "matching_isbns = [\n",
    "    isbn\n",
    "    for isbn, desc in zip(isbns_full, descriptions)\n",
    "    if pd.notna(desc) and str(desc)[:200].strip()\n",
    "]\n",
    "\n",
    "# Fail loudly on a length mismatch instead of silently truncating: a mismatch means\n",
    "# predicted_cats was produced from different data and rows would be mislabelled or dropped.\n",
    "if len(matching_isbns) != len(predicted_cats):\n",
    "    raise ValueError(\n",
    "        f\"Found {len(matching_isbns)} ISBNs but {len(predicted_cats)} predictions; \"\n",
    "        \"re-run the prediction cell so both are built from the same missing_cats.\"\n",
    "    )\n",
    "\n",
    "missing_predicted_df = pd.DataFrame({\n",
    "    \"isbn13\": matching_isbns,\n",
    "    \"predicted_categories\": predicted_cats\n",
    "})\n",
    "\n",
    "print(f\"✅ DataFrame created successfully with {len(missing_predicted_df)} rows\")\n",
    "print(\"📊 Predictions by category:\")\n",
    "print(missing_predicted_df['predicted_categories'].value_counts())\n",
    "\n",
    "# Save results\n",
    "missing_predicted_df.to_csv('missing_categories_predictions.csv', index=False)\n",
    "print(\"💾 Results saved to missing_categories_predictions.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72fe9a8b4b28a1c6",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T04:52:15.607087Z",
     "start_time": "2025-09-16T04:52:15.520116Z"
    }
   },
   "outputs": [],
   "source": [
    "missing_predicted_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b1f7af8aebf289e",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T05:00:37.137998Z",
     "start_time": "2025-09-16T05:00:36.885979Z"
    }
   },
   "outputs": [],
   "source": [
    "# Attach the predicted labels by ISBN, keeping every row of `books`.\n",
    "books = books.merge(missing_predicted_df, on=\"isbn13\", how=\"left\")\n",
    "\n",
    "# Use the prediction only where no mapped category exists, then drop the helper column.\n",
    "needs_fill = books[\"simple_categories\"].isna()\n",
    "books[\"simple_categories\"] = np.where(\n",
    "    needs_fill,\n",
    "    books[\"predicted_categories\"],\n",
    "    books[\"simple_categories\"],\n",
    ")\n",
    "books = books.drop(columns=[\"predicted_categories\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe5b161193dab1f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T05:00:50.989276Z",
     "start_time": "2025-09-16T05:00:50.952202Z"
    }
   },
   "outputs": [],
   "source": [
    "books"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d2e1a8dbbd5d6bc",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T05:01:27.850818Z",
     "start_time": "2025-09-16T05:01:27.781563Z"
    }
   },
   "outputs": [],
   "source": [
    "# Show the rows whose raw category, lower-cased, is one of these genre names.\n",
    "books.loc[\n",
    "    books[\"categories\"].str.lower().isin([\n",
    "        \"romance\", \"science fiction\", \"scifi\", \"fantasy\", \"horror\",\n",
    "        \"mystery\", \"thriller\", \"comedy\", \"crime\", \"historical\",\n",
    "    ])\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd067ee0696cac0b",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T05:04:09.432347Z",
     "start_time": "2025-09-16T05:04:09.246658Z"
    }
   },
   "outputs": [],
   "source": [
    "# Write the frame, including the filled-in simple_categories column, to CSV without the index.\n",
    "books.to_csv(\"books_with_categories.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8879607442c3f0f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}