gautamnancy commited on
Commit
17ddb51
·
verified ·
1 Parent(s): a8bb43e

Upload 6 files

Browse files
data-exploration.ipynb ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "initial_id",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2025-09-14T07:12:03.050818Z",
10
+ "start_time": "2025-09-14T07:11:56.152605Z"
11
+ }
12
+ },
13
+ "outputs": [],
14
+ "source": [
15
+ "from statistics import correlation\n",
16
+ "\n",
17
+ "import kagglehub\n",
18
+ "\n",
19
+ "# Download latest version\n",
20
+ "path = kagglehub.dataset_download(\"dylanjcastillo/7k-books-with-metadata\")\n",
21
+ "\n",
22
+ "print(\"Path to dataset files:\", path)"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "id": "ae99194daafd1775",
29
+ "metadata": {
30
+ "ExecuteTime": {
31
+ "end_time": "2025-09-14T07:51:53.432293Z",
32
+ "start_time": "2025-09-14T07:51:52.436694Z"
33
+ }
34
+ },
35
+ "outputs": [],
36
+ "source": [
37
+ "import pandas as pd"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "6df67758ebb1137c",
44
+ "metadata": {
45
+ "ExecuteTime": {
46
+ "end_time": "2025-09-14T08:03:25.179234Z",
47
+ "start_time": "2025-09-14T08:03:24.185253Z"
48
+ }
49
+ },
50
+ "outputs": [],
51
+ "source": [
52
+ "from pathlib import Path\n",
53
+ "\n",
54
+ "# Convert string path → Path object\n",
55
+ "path = Path(kagglehub.dataset_download(\"dylanjcastillo/7k-books-with-metadata\"))\n",
56
+ "\n",
57
+ "books = pd.read_csv(path / \"books.csv\")"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "id": "94828bc9ccbfafa1",
64
+ "metadata": {
65
+ "ExecuteTime": {
66
+ "end_time": "2025-09-14T08:03:37.133785Z",
67
+ "start_time": "2025-09-14T08:03:37.079170Z"
68
+ }
69
+ },
70
+ "outputs": [],
71
+ "source": [
72
+ "books"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "id": "9403c10bb9a0112e",
79
+ "metadata": {
80
+ "ExecuteTime": {
81
+ "end_time": "2025-09-14T08:12:20.943772Z",
82
+ "start_time": "2025-09-14T08:12:16.468843Z"
83
+ }
84
+ },
85
+ "outputs": [],
86
+ "source": [
87
+ "import seaborn as sns\n",
88
+ "import matplotlib.pyplot as plt"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "id": "aaba3c5cc9492dbc",
95
+ "metadata": {
96
+ "ExecuteTime": {
97
+ "end_time": "2025-09-14T08:16:47.484763Z",
98
+ "start_time": "2025-09-14T08:16:47.134190Z"
99
+ }
100
+ },
101
+ "outputs": [],
102
+ "source": [
103
+ "ax = plt.axes()\n",
104
+ "sns.heatmap(books.isna().transpose(), cbar = False , ax=ax)\n",
105
+ "\n",
106
+ "plt.xlabel(\"Columns\")\n",
107
+ "plt.ylabel(\"Missing values\")\n",
108
+ "\n",
109
+ "plt.show()"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": null,
115
+ "id": "5020d8ec7f517390",
116
+ "metadata": {
117
+ "ExecuteTime": {
118
+ "end_time": "2025-09-14T08:34:19.472432Z",
119
+ "start_time": "2025-09-14T08:34:19.396405Z"
120
+ }
121
+ },
122
+ "outputs": [],
123
+ "source": [
124
+ "import numpy as np\n",
125
+ "books[\"missing_description\"] = np.where(books[\"description\"].isna(), 1, 0)\n",
126
+ "books[\"age_of_book\"] = 2024 - books[\"published_year\"]"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": null,
132
+ "id": "8693f57773a2f2ca",
133
+ "metadata": {
134
+ "ExecuteTime": {
135
+ "end_time": "2025-09-14T08:45:22.257526Z",
136
+ "start_time": "2025-09-14T08:45:22.005185Z"
137
+ }
138
+ },
139
+ "outputs": [],
140
+ "source": [
141
+ "columns_of_interest = [\"num_pages\", \"age_of_book\", \"missing_description\", \"average_rating\"]\n",
142
+ "correlation_matrix = books[columns_of_interest].corr(method = \"spearman\")\n",
143
+ "sns.set_theme(style=\"white\")\n",
144
+ "plt.figure(figsize = (8, 6))\n",
145
+ "heatmap = sns.heatmap(correlation_matrix, annot=True, fmt=\".2f\", cmap=\"coolwarm\", cbar_kws={\"label\": \"Spearman Correlation\"})\n",
146
+ "\n",
147
+ "heatmap.set_title(\"Correlation Heatmap\")\n",
148
+ "plt.show()"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "id": "1218eb9769f7ec28",
155
+ "metadata": {
156
+ "ExecuteTime": {
157
+ "end_time": "2025-09-14T09:01:30.451492Z",
158
+ "start_time": "2025-09-14T09:01:30.397573Z"
159
+ }
160
+ },
161
+ "outputs": [],
162
+ "source": [
163
+ "books_missing = books[(books[\"description\"].isna()) |\n",
164
+ " ~(books[\"num_pages\"].isna()) &\n",
165
+ " ~(books[\"average_rating\"].isna()) &\n",
166
+ " ~(books[\"published_year\"].isna())\n",
167
+ "]"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "id": "a16b79d748237fa6",
174
+ "metadata": {
175
+ "ExecuteTime": {
176
+ "end_time": "2025-09-14T09:29:57.037634Z",
177
+ "start_time": "2025-09-14T09:29:56.971479Z"
178
+ }
179
+ },
180
+ "outputs": [],
181
+ "source": [
182
+ "books_missing = books[~(books[\"description\"].isna()) &\n",
183
+ " ~(books[\"num_pages\"].isna()) &\n",
184
+ " ~(books[\"average_rating\"].isna()) &\n",
185
+ " ~(books[\"published_year\"].isna())\n",
186
+ "]"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "id": "997cafb5e60fef34",
193
+ "metadata": {
194
+ "ExecuteTime": {
195
+ "end_time": "2025-09-14T09:30:14.028246Z",
196
+ "start_time": "2025-09-14T09:30:13.969750Z"
197
+ }
198
+ },
199
+ "outputs": [],
200
+ "source": [
201
+ "books_missing"
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": null,
207
+ "id": "6aad6ddc169cf39d",
208
+ "metadata": {
209
+ "ExecuteTime": {
210
+ "end_time": "2025-09-14T09:33:31.453933Z",
211
+ "start_time": "2025-09-14T09:33:31.395084Z"
212
+ }
213
+ },
214
+ "outputs": [],
215
+ "source": [
216
+ "books_missing[\"categories\"].value_counts().reset_index().sort_values(\"count\", ascending=False)"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": null,
222
+ "id": "a7c0628d5619c32b",
223
+ "metadata": {
224
+ "ExecuteTime": {
225
+ "end_time": "2025-09-14T09:52:43.243363Z",
226
+ "start_time": "2025-09-14T09:52:43.211576Z"
227
+ }
228
+ },
229
+ "outputs": [],
230
+ "source": [
231
+ "books_missing"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": null,
237
+ "id": "b971c57a22e2721e",
238
+ "metadata": {
239
+ "ExecuteTime": {
240
+ "end_time": "2025-09-14T10:06:37.305268Z",
241
+ "start_time": "2025-09-14T10:06:37.242773Z"
242
+ }
243
+ },
244
+ "outputs": [],
245
+ "source": [
246
+ "books_missing.loc[:, \"words_in_description\"] = books_missing[\"description\"].str.split().str.len()\n"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": null,
252
+ "id": "5cf80ede1a996820",
253
+ "metadata": {
254
+ "ExecuteTime": {
255
+ "end_time": "2025-09-14T10:07:11.889795Z",
256
+ "start_time": "2025-09-14T10:07:11.815772Z"
257
+ }
258
+ },
259
+ "outputs": [],
260
+ "source": [
261
+ "books_missing"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": null,
267
+ "id": "d4a20c7b8a28d843",
268
+ "metadata": {
269
+ "ExecuteTime": {
270
+ "end_time": "2025-09-14T10:16:26.757853Z",
271
+ "start_time": "2025-09-14T10:16:26.738194Z"
272
+ }
273
+ },
274
+ "outputs": [],
275
+ "source": [
276
+ "print(books_missing.loc[books_missing[\"words_in_description\"].between(25, 34), [\"description\", \"words_in_description\"]])\n"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": null,
282
+ "id": "add578fb79f75576",
283
+ "metadata": {
284
+ "ExecuteTime": {
285
+ "end_time": "2025-09-14T10:18:38.671378Z",
286
+ "start_time": "2025-09-14T10:18:38.655678Z"
287
+ }
288
+ },
289
+ "outputs": [],
290
+ "source": [
291
+ "books_missing_25_words = books_missing[books_missing[\"words_in_description\"] >= 25]"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": null,
297
+ "id": "337cc14a7592597",
298
+ "metadata": {
299
+ "ExecuteTime": {
300
+ "end_time": "2025-09-14T10:18:45.020133Z",
301
+ "start_time": "2025-09-14T10:18:44.995404Z"
302
+ }
303
+ },
304
+ "outputs": [],
305
+ "source": [
306
+ "books_missing_25_words"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": null,
312
+ "id": "15505042aaae206b",
313
+ "metadata": {
314
+ "ExecuteTime": {
315
+ "end_time": "2025-09-14T10:36:25.385493Z",
316
+ "start_time": "2025-09-14T10:36:25.348788Z"
317
+ }
318
+ },
319
+ "outputs": [],
320
+ "source": [
321
+ "books_missing_25_words.loc[:, \"title_and_subtitle\"] = np.where(\n",
322
+ " books_missing_25_words[\"subtitle\"].isna(),\n",
323
+ " books_missing_25_words[\"title\"],\n",
324
+ " books_missing_25_words[[\"title\", \"subtitle\"]].astype(str).agg(\": \".join, axis=1)\n",
325
+ ")\n"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": null,
331
+ "id": "8f48839b393f1be6",
332
+ "metadata": {
333
+ "ExecuteTime": {
334
+ "end_time": "2025-09-14T10:36:36.463971Z",
335
+ "start_time": "2025-09-14T10:36:36.442637Z"
336
+ }
337
+ },
338
+ "outputs": [],
339
+ "source": [
340
+ "books_missing_25_words"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": null,
346
+ "id": "1033bd78abfa34a3",
347
+ "metadata": {
348
+ "ExecuteTime": {
349
+ "end_time": "2025-09-14T10:38:24.509449Z",
350
+ "start_time": "2025-09-14T10:38:24.480830Z"
351
+ }
352
+ },
353
+ "outputs": [],
354
+ "source": [
355
+ "books_missing_25_words[\"title_and_subtitle\"].value_counts().reset_index().sort_values(\"count\", ascending=False)"
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": null,
361
+ "id": "1871d27d7eb01493",
362
+ "metadata": {
363
+ "ExecuteTime": {
364
+ "end_time": "2025-09-14T10:45:15.551772Z",
365
+ "start_time": "2025-09-14T10:45:15.504051Z"
366
+ }
367
+ },
368
+ "outputs": [],
369
+ "source": [
370
+ "books_missing_25_words = books_missing_25_words.copy()  # copy to silence SettingWithCopyWarning\n",
371
+ "\n",
372
+ "books_missing_25_words.loc[:, \"tagged_description\"] = (\n",
373
+ " books_missing_25_words[[\"isbn13\", \"description\"]]\n",
374
+ " .astype(str)\n",
375
+ " .agg(\" \".join, axis=1)\n",
376
+ ")\n"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": null,
382
+ "id": "20a704320865f12b",
383
+ "metadata": {
384
+ "ExecuteTime": {
385
+ "end_time": "2025-09-14T10:45:38.585999Z",
386
+ "start_time": "2025-09-14T10:45:38.566081Z"
387
+ }
388
+ },
389
+ "outputs": [],
390
+ "source": [
391
+ "books_missing_25_words"
392
+ ]
393
+ },
394
+ {
395
+ "cell_type": "code",
396
+ "execution_count": null,
397
+ "id": "36a89080af8a4f1c",
398
+ "metadata": {
399
+ "ExecuteTime": {
400
+ "end_time": "2025-09-14T10:49:30.500326Z",
401
+ "start_time": "2025-09-14T10:49:30.213437Z"
402
+ }
403
+ },
404
+ "outputs": [],
405
+ "source": [
406
+ "(\n",
407
+ " books_missing_25_words\n",
408
+ " .drop([\"subtitle\", \"missing_description\", \"age_of_book\", \"words_in_description\"], axis=1)\n",
409
+ " .to_csv(\"books_cleaned.csv\", index = False)\n",
410
+ ")"
411
+ ]
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": null,
416
+ "id": "a2308b29e727ba70",
417
+ "metadata": {},
418
+ "outputs": [],
419
+ "source": []
420
+ }
421
+ ],
422
+ "metadata": {
423
+ "kernelspec": {
424
+ "display_name": "Python 3",
425
+ "language": "python",
426
+ "name": "python3"
427
+ },
428
+ "language_info": {
429
+ "codemirror_mode": {
430
+ "name": "ipython",
431
+ "version": 2
432
+ },
433
+ "file_extension": ".py",
434
+ "mimetype": "text/x-python",
435
+ "name": "python",
436
+ "nbconvert_exporter": "python",
437
+ "pygments_lexer": "ipython2",
438
+ "version": "2.7.6"
439
+ }
440
+ },
441
+ "nbformat": 4,
442
+ "nbformat_minor": 5
443
+ }
gradio-dashboard.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from dotenv import load_dotenv
4
+
5
+ from langchain.schema import Document
6
+ from langchain_huggingface import HuggingFaceEmbeddings
7
+ from langchain_chroma import Chroma
8
+
9
+ import gradio as gr
10
+
11
# Load API keys / configuration from a local .env file, if present.
load_dotenv()

books = pd.read_csv("books_with_emotions.csv")

# Ask the cover API for a larger rendition; a missing thumbnail (NaN) stays
# NaN after string concatenation, so the isna() check still catches it and
# substitutes the local placeholder image.
enlarged = books["thumbnail"] + "&fife=w800"
books["large_thumbnail"] = np.where(
    enlarged.isna(),
    "cover-not-found.jpg",
    enlarged,
)

# Build the corpus in memory rather than loading from a file: one document
# per book, prefixed with its isbn13 so results can be mapped back to rows.
documents = [
    Document(page_content=f"{row['isbn13']} {row['description']}")
    for _, row in books.iterrows()
]

# Embed every description and index it in an in-memory Chroma store.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db_books = Chroma.from_documents(documents, embeddings)
30
+
31
+
32
def retrieve_semantic_recommendations(
    query: str,
    category: str = None,
    tone: str = None,
    initial_top_k: int = 50,
    final_top_k: int = 16,
) -> pd.DataFrame:
    """Return up to ``final_top_k`` books semantically matching ``query``.

    Args:
        query: Free-text description of the desired book.
        category: Value of ``simple_categories`` to filter on; ``"All"``
            (or the legacy default ``None`` behaving like a non-match)
            applies no category filter only when it equals ``"All"``.
        tone: Emotional tone label ("Happy", "Surprising", "Angry",
            "Suspenseful", "Sad"); any other value leaves order unchanged.
        initial_top_k: Number of candidates pulled from the vector store.
        final_top_k: Number of rows returned after filtering.

    Returns:
        A ``pd.DataFrame`` slice of the module-level ``books`` frame,
        ordered by similarity (then re-sorted by emotion if ``tone`` set).
    """
    recs = db_books.similarity_search(query, k=initial_top_k)

    # Each stored document begins with its isbn13 token. Parse defensively:
    # one malformed document should not abort the whole request.
    ranked_isbns = []
    for rec in recs:
        token = rec.page_content.strip('"').split()[0]
        try:
            ranked_isbns.append(int(float(token)))
        except ValueError:
            continue

    # BUG FIX: a plain isin() filter returns rows in CSV order, discarding
    # the similarity ranking, so head(final_top_k) kept the wrong books.
    # Re-sort the matches by their position in the retrieval results.
    rank = {isbn: pos for pos, isbn in enumerate(ranked_isbns)}
    book_recs = books[books["isbn13"].isin(rank)].copy()
    book_recs = book_recs.sort_values(
        by="isbn13", key=lambda s: s.map(rank)
    ).head(initial_top_k)

    if category != "All":
        book_recs = book_recs[book_recs["simple_categories"] == category].head(final_top_k)
    else:
        book_recs = book_recs.head(final_top_k)

    # Only sort by emotion if the corresponding score column exists.
    tone_to_column = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }
    emotion_column = tone_to_column.get(tone)
    if emotion_column and emotion_column in book_recs.columns:
        book_recs = book_recs.sort_values(by=emotion_column, ascending=False)

    return book_recs
62
+
63
+
64
def recommend_books(
    query: str,
    category: str,
    tone: str
):
    """Gradio callback: build (image, caption) pairs for the gallery.

    Args:
        query: User's free-text book description.
        category: Selected category ("All" disables the filter).
        tone: Selected emotional tone ("All" disables re-sorting).

    Returns:
        A list of ``(thumbnail_url, caption)`` tuples for ``gr.Gallery``.
    """
    recommendations = retrieve_semantic_recommendations(query, category, tone)
    results = []

    for _, row in recommendations.iterrows():
        # BUG FIX: description/authors can be missing, in which case pandas
        # yields a float NaN with no .split() — guard before string ops.
        description = row["description"] if isinstance(row["description"], str) else ""
        words = description.split()
        # Only append an ellipsis when the text was actually truncated.
        truncated_description = " ".join(words[:30]) + ("..." if len(words) > 30 else "")

        authors_field = row["authors"] if isinstance(row["authors"], str) else ""
        authors_split = [a.strip() for a in authors_field.split(";") if a.strip()]
        if len(authors_split) == 2:
            authors_str = f"{authors_split[0]} and {authors_split[1]}"
        elif len(authors_split) > 2:
            authors_str = f"{', '.join(authors_split[:-1])}, and {authors_split[-1]}"
        elif authors_split:
            authors_str = authors_split[0]
        else:
            authors_str = "Unknown author"

        caption = f"{row['title']} by {authors_str}: {truncated_description}"
        results.append((row["large_thumbnail"], caption))
    return results
88
+
89
# Dropdown choices: every known category, with "All" as the unfiltered
# default. NaN categories are dropped before sorting (sorted() cannot
# compare float NaN with str).
categories = ["All"] + sorted(books["simple_categories"].dropna().unique())

# Offer a tone in the UI only when its backing emotion-score column is
# actually present in the loaded data.
emotion_columns = ["joy", "surprise", "anger", "fear", "sadness"]
emotion_labels = ["Happy", "Surprising", "Angry", "Suspenseful", "Sad"]
available_emotions = [
    label
    for col, label in zip(emotion_columns, emotion_labels)
    if col in books.columns
]
tones = ["All"] + available_emotions

with gr.Blocks(theme=gr.themes.Glass()) as dashboard:
    gr.Markdown("# Semantic book recommender")

    with gr.Row():
        user_query = gr.Textbox(
            label="Please enter a description of a book:",
            placeholder="e.g., A story about forgiveness",
        )
        category_dropdown = gr.Dropdown(
            choices=categories, label="Select a category:", value="All"
        )
        tone_dropdown = gr.Dropdown(
            choices=tones, label="Select an emotional tone:", value="All"
        )
        submit_button = gr.Button("Find recommendations")

    gr.Markdown("## Recommendations")
    output = gr.Gallery(label="Recommended books", columns=8, rows=2)

    # Wire the button to the recommender; the gallery consumes the
    # (image, caption) tuples returned by recommend_books.
    submit_button.click(
        fn=recommend_books,
        inputs=[user_query, category_dropdown, tone_dropdown],
        outputs=output,
    )


if __name__ == "__main__":
    dashboard.launch()
sentiment-analysis.ipynb ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "initial_id",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2025-09-16T09:43:18.055617Z",
10
+ "start_time": "2025-09-16T09:43:17.869905Z"
11
+ }
12
+ },
13
+ "outputs": [],
14
+ "source": [
15
+ "import pandas as pd\n",
16
+ "\n",
17
+ "books = pd.read_csv(\"books_with_categories.csv\")"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "id": "3d9a521af5640cd2",
24
+ "metadata": {
25
+ "ExecuteTime": {
26
+ "end_time": "2025-09-16T09:43:20.918046Z",
27
+ "start_time": "2025-09-16T09:43:18.066451Z"
28
+ }
29
+ },
30
+ "outputs": [],
31
+ "source": [
32
+ "!pip install torch transformers\n"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "id": "a222cc24cb3d9e50",
39
+ "metadata": {
40
+ "ExecuteTime": {
41
+ "end_time": "2025-09-16T09:43:20.956314Z",
42
+ "start_time": "2025-09-16T09:43:20.934627Z"
43
+ }
44
+ },
45
+ "outputs": [],
46
+ "source": [
47
+ "import torch\n",
48
+ "import transformers\n",
49
+ "print(f\"PyTorch version: {torch.__version__}\")\n",
50
+ "print(f\"Transformers version: {transformers.__version__}\")"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "id": "418145b8ff28c108",
57
+ "metadata": {
58
+ "ExecuteTime": {
59
+ "end_time": "2025-09-16T09:43:23.555715Z",
60
+ "start_time": "2025-09-16T09:43:20.969958Z"
61
+ }
62
+ },
63
+ "outputs": [],
64
+ "source": [
65
+ "# Fix the bug by making torch available in transformers namespace\n",
66
+ "transformers.torch = torch\n",
67
+ "\n",
68
+ "from transformers import pipeline\n",
69
+ "\n",
70
+ "pipe = pipeline(\n",
71
+ " \"text-classification\",\n",
72
+ " model=\"j-hartmann/emotion-english-distilroberta-base\",\n",
73
+ " return_all_scores=True\n",
74
+ ")\n",
75
+ "\n",
76
+ "# Test it\n",
77
+ "text = \"I am so happy today!\"\n",
78
+ "result = pipe(text)\n",
79
+ "print(result)\n",
80
+ "\n",
81
+ "#top-k None\n",
82
+ "#device -- mps /cuda for warnings"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": null,
88
+ "id": "90acf250d3189ec1",
89
+ "metadata": {
90
+ "ExecuteTime": {
91
+ "end_time": "2025-09-16T09:43:23.912340Z",
92
+ "start_time": "2025-09-16T09:43:23.574192Z"
93
+ }
94
+ },
95
+ "outputs": [],
96
+ "source": [
97
+ "pipe(books[\"description\"][0])"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "id": "c9781bcf4224efd4",
104
+ "metadata": {
105
+ "ExecuteTime": {
106
+ "end_time": "2025-09-16T09:43:24.797286Z",
107
+ "start_time": "2025-09-16T09:43:23.944842Z"
108
+ }
109
+ },
110
+ "outputs": [],
111
+ "source": [
112
+ "pipe(books[\"description\"][0].split(\".\"))"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": null,
118
+ "id": "57fc949d567e3f7",
119
+ "metadata": {
120
+ "ExecuteTime": {
121
+ "end_time": "2025-09-16T09:43:25.167345Z",
122
+ "start_time": "2025-09-16T09:43:24.810715Z"
123
+ }
124
+ },
125
+ "outputs": [],
126
+ "source": [
127
+ "sentences = books[\"description\"][0].split(\".\")\n",
128
+ "predictions = pipe(sentences)"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": null,
134
+ "id": "41b5470987223a69",
135
+ "metadata": {
136
+ "ExecuteTime": {
137
+ "end_time": "2025-09-16T09:43:25.187522Z",
138
+ "start_time": "2025-09-16T09:43:25.179974Z"
139
+ }
140
+ },
141
+ "outputs": [],
142
+ "source": [
143
+ "sentences[0]"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": null,
149
+ "id": "81bb270a79fdd290",
150
+ "metadata": {
151
+ "ExecuteTime": {
152
+ "end_time": "2025-09-16T09:43:25.232413Z",
153
+ "start_time": "2025-09-16T09:43:25.225824Z"
154
+ }
155
+ },
156
+ "outputs": [],
157
+ "source": [
158
+ "predictions[0]"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": null,
164
+ "id": "d85ba7066b85eb7d",
165
+ "metadata": {
166
+ "ExecuteTime": {
167
+ "end_time": "2025-09-16T09:43:25.273001Z",
168
+ "start_time": "2025-09-16T09:43:25.267108Z"
169
+ }
170
+ },
171
+ "outputs": [],
172
+ "source": [
173
+ "sentences[4]"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": null,
179
+ "id": "8dea7d5c2077d566",
180
+ "metadata": {
181
+ "ExecuteTime": {
182
+ "end_time": "2025-09-16T09:43:25.306831Z",
183
+ "start_time": "2025-09-16T09:43:25.300457Z"
184
+ }
185
+ },
186
+ "outputs": [],
187
+ "source": [
188
+ "predictions[4]"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "id": "a540e26e090b9050",
195
+ "metadata": {
196
+ "ExecuteTime": {
197
+ "end_time": "2025-09-16T09:43:25.342124Z",
198
+ "start_time": "2025-09-16T09:43:25.334958Z"
199
+ }
200
+ },
201
+ "outputs": [],
202
+ "source": [
203
+ "sorted(predictions[0], key = lambda x: x['label'])"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "id": "a496645a7d858dcf",
210
+ "metadata": {
211
+ "ExecuteTime": {
212
+ "end_time": "2025-09-16T09:43:25.369056Z",
213
+ "start_time": "2025-09-16T09:43:25.360888Z"
214
+ }
215
+ },
216
+ "outputs": [],
217
+ "source": [
218
+ "import numpy as np\n",
219
+ "\n",
220
+ "emotion_labels = [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
221
+ "isbn = []\n",
222
+ "emotion_scores = {label: [] for label in emotion_labels}\n",
223
+ "\n",
224
+ "def calculate_max_emotion_scores(predictions):\n",
225
+ " per_emotion_scores = {label: [] for label in emotion_labels}\n",
226
+ " for prediction in predictions:\n",
227
+ " sorted_predictions = sorted(prediction, key=lambda x: x['label'], reverse=True)\n",
228
+ " for index, label in enumerate(emotion_labels):\n",
229
+ " per_emotion_scores[label].append(sorted_predictions[index]['score'])\n",
230
+ " return {label: np.max(scores) for label, scores in per_emotion_scores.items()}"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": null,
236
+ "id": "b911145893e482f3",
237
+ "metadata": {
238
+ "ExecuteTime": {
239
+ "end_time": "2025-09-16T09:43:27.622710Z",
240
+ "start_time": "2025-09-16T09:43:25.385198Z"
241
+ }
242
+ },
243
+ "outputs": [],
244
+ "source": [
245
+ "for i, row in books.head(10).iterrows():\n",
246
+ " isbn.append(str(row[\"isbn13\"]))\n",
247
+ "\n",
248
+ " sentences = str(row[\"description\"]).split(\".\")\n",
249
+ " predictions = pipe(sentences)\n",
250
+ " max_scores = calculate_max_emotion_scores(predictions)\n",
251
+ "\n",
252
+ " for label in emotion_labels:\n",
253
+ " # force conversion to Python float\n",
254
+ " emotion_scores[label].append(float(max_scores[label]))\n"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": null,
260
+ "id": "b1e1e2960a0314b3",
261
+ "metadata": {
262
+ "ExecuteTime": {
263
+ "end_time": "2025-09-16T09:43:27.634705Z",
264
+ "start_time": "2025-09-16T09:43:27.630600Z"
265
+ }
266
+ },
267
+ "outputs": [],
268
+ "source": [
269
+ "emotion_scores = {\n",
270
+ " label: [float(x) for x in scores]\n",
271
+ " for label, scores in emotion_scores.items()\n",
272
+ "}\n"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": null,
278
+ "id": "b4d6abd593a32daa",
279
+ "metadata": {
280
+ "ExecuteTime": {
281
+ "end_time": "2025-09-16T09:43:27.652229Z",
282
+ "start_time": "2025-09-16T09:43:27.644745Z"
283
+ }
284
+ },
285
+ "outputs": [],
286
+ "source": [
287
+ "emotion_scores"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": null,
293
+ "id": "632f787a4b7d3eaf",
294
+ "metadata": {
295
+ "ExecuteTime": {
296
+ "end_time": "2025-09-16T11:10:46.250826Z",
297
+ "start_time": "2025-09-16T10:58:12.182208Z"
298
+ }
299
+ },
300
+ "outputs": [],
301
+ "source": [
302
+ "import pandas as pd\n",
303
+ "import numpy as np\n",
304
+ "from tqdm import tqdm\n",
305
+ "from transformers import pipeline\n",
306
+ "\n",
307
+ "# Initialize the emotion analysis pipeline\n",
308
+ "pipe = pipeline(\"text-classification\", model=\"j-hartmann/emotion-english-distilroberta-base\", top_k=None)\n",
309
+ "\n",
310
+ "# Load your books data\n",
311
+ "books = pd.read_csv(\"books_with_categories.csv\") # Replace with your actual file name\n",
312
+ "\n",
313
+ "emotion_labels = [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
314
+ "isbn = []\n",
315
+ "emotion_scores = {label: [] for label in emotion_labels}\n",
316
+ "\n",
317
+ "def calculate_max_emotion_scores(predictions):\n",
318
+ " \"\"\"Calculate maximum emotion scores across all sentences\"\"\"\n",
319
+ " per_emotion_scores = {label: [] for label in emotion_labels}\n",
320
+ "\n",
321
+ " for prediction in predictions:\n",
322
+ " # Create a dictionary for easy lookup by label\n",
323
+ " prediction_dict = {pred['label']: pred['score'] for pred in prediction}\n",
324
+ "\n",
325
+ " # Add scores for each emotion label\n",
326
+ " for label in emotion_labels:\n",
327
+ " score = prediction_dict.get(label, 0.0) # Default to 0 if label not found\n",
328
+ " per_emotion_scores[label].append(score)\n",
329
+ "\n",
330
+ " # Return maximum score for each emotion across all sentences\n",
331
+ " return {label: np.max(scores) if scores else 0.0 for label, scores in per_emotion_scores.items()}\n",
332
+ "\n",
333
+ "print(\"Processing emotions for books...\")\n",
334
+ "for i, row in tqdm(books.iterrows(), total=len(books)):\n",
335
+ " isbn.append(str(row[\"isbn13\"]))\n",
336
+ "\n",
337
+ " # Handle missing descriptions\n",
338
+ " description = str(row[\"description\"]) if pd.notna(row[\"description\"]) else \"\"\n",
339
+ "\n",
340
+ " if description and description != \"nan\":\n",
341
+ " # Split into sentences and filter out empty ones\n",
342
+ " sentences = [s.strip() for s in description.split(\".\") if s.strip()]\n",
343
+ "\n",
344
+ " if sentences:\n",
345
+ " try:\n",
346
+ " predictions = pipe(sentences)\n",
347
+ " max_scores = calculate_max_emotion_scores(predictions)\n",
348
+ " except Exception as e:\n",
349
+ " print(f\"Error processing book {row['isbn13']}: {e}\")\n",
350
+ " # Use default scores if processing fails\n",
351
+ " max_scores = {label: 0.0 for label in emotion_labels}\n",
352
+ " else:\n",
353
+ " # Empty description\n",
354
+ " max_scores = {label: 0.0 for label in emotion_labels}\n",
355
+ " else:\n",
356
+ " # No description available\n",
357
+ " max_scores = {label: 0.0 for label in emotion_labels}\n",
358
+ "\n",
359
+ " # Add scores to our lists\n",
360
+ " for label in emotion_labels:\n",
361
+ " emotion_scores[label].append(float(max_scores[label]))"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": null,
367
+ "id": "31dfb34d4f4aee9a",
368
+ "metadata": {
369
+ "ExecuteTime": {
370
+ "end_time": "2025-09-16T11:11:55.455696Z",
371
+ "start_time": "2025-09-16T11:11:55.422818Z"
372
+ }
373
+ },
374
+ "outputs": [],
375
+ "source": [
376
+ "# Create emotion DataFrame\n",
377
+ "emotion_df = pd.DataFrame(emotion_scores)\n",
378
+ "emotion_df['isbn13'] = isbn\n",
379
+ "\n",
380
+ "print(\"Emotion processing completed!\")\n",
381
+ "print(\"Sample emotion scores:\")\n",
382
+ "print(emotion_df.head(10))"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": null,
388
+ "id": "8db5e8f5cee59321",
389
+ "metadata": {
390
+ "ExecuteTime": {
391
+ "end_time": "2025-09-16T11:12:15.342595Z",
392
+ "start_time": "2025-09-16T11:12:14.895173Z"
393
+ }
394
+ },
395
+ "outputs": [],
396
+ "source": [
397
+ "books['isbn13'] = books['isbn13'].astype(str).str.replace('.0', '', regex=False)\n",
398
+ "emotion_df['isbn13'] = emotion_df['isbn13'].astype(str).str.replace('.0', '', regex=False)\n",
399
+ "\n",
400
+ "print(\"Data types before merge:\")\n",
401
+ "print(f\"Books isbn13 dtype: {books['isbn13'].dtype}\")\n",
402
+ "print(f\"Emotion isbn13 dtype: {emotion_df['isbn13'].dtype}\")\n",
403
+ "\n",
404
+ "# Merge emotion scores back to the original books DataFrame\n",
405
+ "books_with_emotions = books.merge(emotion_df, on='isbn13', how='left')\n",
406
+ "\n",
407
+ "# Save the combined DataFrame\n",
408
+ "books_with_emotions.to_csv(\"books_with_emotions.csv\", index=False)\n",
409
+ "\n",
410
+ "print(f\"Saved books with emotions to 'books_with_emotions.csv'\")\n",
411
+ "print(f\"Total books processed: {len(books_with_emotions)}\")\n",
412
+ "print(\"Available columns:\", books_with_emotions.columns.tolist())"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "id": "e1cc83da7893e926",
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": []
422
+ }
423
+ ],
424
+ "metadata": {
425
+ "kernelspec": {
426
+ "display_name": "Python 3",
427
+ "language": "python",
428
+ "name": "python3"
429
+ },
430
+ "language_info": {
431
+ "codemirror_mode": {
432
+ "name": "ipython",
433
+ "version": 2
434
+ },
435
+ "file_extension": ".py",
436
+ "mimetype": "text/x-python",
437
+ "name": "python",
438
+ "nbconvert_exporter": "python",
439
+ "pygments_lexer": "ipython2",
440
+ "version": "2.7.6"
441
+ }
442
+ },
443
+ "nbformat": 4,
444
+ "nbformat_minor": 5
445
+ }
tagged_description.txt ADDED
The diff for this file is too large to render. See raw diff
 
text-classification.ipynb ADDED
@@ -0,0 +1,591 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "initial_id",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2025-09-16T03:30:06.906158Z",
10
+ "start_time": "2025-09-16T03:30:06.897210Z"
11
+ }
12
+ },
13
+ "outputs": [],
14
+ "source": [
15
+ "import numpy as np\n",
16
+ "\n",
17
+ "category_mapping = {'Fiction' : 'Fiction',\n",
18
+ " 'Juvenile Fiction' : \"Children's Fiction\",\n",
19
+ " 'Biography & Autobiography' : 'Nonfiction',\n",
20
+ " 'History' : 'Nonfiction',\n",
21
+ " 'Literary Criticism' : 'Nonfiction',\n",
22
+ " 'Philosophy' : 'Nonfiction',\n",
23
+ " 'Religion' : 'Nonfiction',\n",
24
+ " 'Comics & Graphic Novels' : 'Fiction',\n",
25
+ " 'Juvenile Nonfiction' : \"Children's Nonfiction\",\n",
26
+ " 'Science' : 'Nonfiction',\n",
27
+ " 'Poetry' : 'Fiction',\n",
28
+ " }"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "id": "abd407fcfb12529f",
35
+ "metadata": {
36
+ "ExecuteTime": {
37
+ "end_time": "2025-09-15T09:11:18.779297Z",
38
+ "start_time": "2025-09-15T09:11:18.685368Z"
39
+ }
40
+ },
41
+ "outputs": [],
42
+ "source": [
43
+ "import pandas as pd\n",
44
+ "books = pd.read_csv(\"books_cleaned.csv\")"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "id": "8730b04764af7caa",
51
+ "metadata": {
52
+ "ExecuteTime": {
53
+ "end_time": "2025-09-15T09:12:06.202207Z",
54
+ "start_time": "2025-09-15T09:12:06.190052Z"
55
+ }
56
+ },
57
+ "outputs": [],
58
+ "source": [
59
+ "books['simple_categories'] = books['categories'].map(category_mapping)"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "id": "17b0fe2cfe81778b",
66
+ "metadata": {
67
+ "ExecuteTime": {
68
+ "end_time": "2025-09-15T09:13:56.419141Z",
69
+ "start_time": "2025-09-15T09:13:56.325655Z"
70
+ }
71
+ },
72
+ "outputs": [],
73
+ "source": [
74
+ "books"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": null,
80
+ "id": "410d16934dfe2383",
81
+ "metadata": {
82
+ "ExecuteTime": {
83
+ "end_time": "2025-09-15T09:39:48.441516Z",
84
+ "start_time": "2025-09-15T09:39:48.396466Z"
85
+ }
86
+ },
87
+ "outputs": [],
88
+ "source": [
89
+ "books[~(books['simple_categories'].isna())]"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "id": "a0d8dcd913296e3d",
96
+ "metadata": {
97
+ "ExecuteTime": {
98
+ "end_time": "2025-09-15T10:23:22.076926Z",
99
+ "start_time": "2025-09-15T10:20:53.043882Z"
100
+ }
101
+ },
102
+ "outputs": [],
103
+ "source": [
104
+ "!pip install hf_xet\n",
105
+ "from transformers import pipeline\n",
106
+ "\n",
107
+ "fiction_categories = ['Fiction', 'Nonfiction']\n",
108
+ "pipe = pipeline(\"zero-shot-classification\",model=\"facebook/bart-large-mnli\", device=\"cuda\")\n"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": null,
114
+ "id": "cd9edaa3ee8c1243",
115
+ "metadata": {
116
+ "ExecuteTime": {
117
+ "end_time": "2025-09-15T10:23:46.232544Z",
118
+ "start_time": "2025-09-15T10:23:43.525543Z"
119
+ }
120
+ },
121
+ "outputs": [],
122
+ "source": [
123
+ "!pip install --upgrade huggingface_hub\n"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "id": "83b78716648ebbe6",
130
+ "metadata": {
131
+ "ExecuteTime": {
132
+ "end_time": "2025-09-15T10:23:55.154934Z",
133
+ "start_time": "2025-09-15T10:23:53.226725Z"
134
+ }
135
+ },
136
+ "outputs": [],
137
+ "source": [
138
+ "!pip install \"huggingface_hub[hf_xet]\"\n"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "id": "8d02bd90c594fbac",
145
+ "metadata": {
146
+ "ExecuteTime": {
147
+ "end_time": "2025-09-15T10:24:14.628937Z",
148
+ "start_time": "2025-09-15T10:24:12.758899Z"
149
+ }
150
+ },
151
+ "outputs": [],
152
+ "source": [
153
+ "!pip show huggingface_hub\n",
154
+ "!pip show hf_xet\n"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": null,
160
+ "id": "83e5151bdc46709a",
161
+ "metadata": {
162
+ "ExecuteTime": {
163
+ "end_time": "2025-09-16T04:10:04.964668Z",
164
+ "start_time": "2025-09-16T04:10:01.587200Z"
165
+ }
166
+ },
167
+ "outputs": [],
168
+ "source": [
169
+ "from transformers import pipeline\n",
170
+ "import torch\n",
171
+ "import os\n",
172
+ "\n",
173
+ "print(\"Loading model... (this may take a few minutes on first run)\")\n",
174
+ "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
175
+ "if torch.cuda.is_available():\n",
176
+ " print(f\"GPU device: {torch.cuda.get_device_name(0)}\")\n",
177
+ "\n",
178
+ "# CRITICAL: Add GPU support and optimization parameters\n",
179
+ "try:\n",
180
+ " os.environ[\"HF_HUB_DOWNLOAD_TIMEOUT\"] = \"120\"\n",
181
+ "\n",
182
+ " pipe = pipeline(\n",
183
+ " \"zero-shot-classification\",\n",
184
+ " model=\"facebook/bart-large-mnli\",\n",
185
+ " device=0 if torch.cuda.is_available() else -1, # Use GPU if available\n",
186
+ " batch_size=64, # Internal pipeline batch size\n",
187
+ " max_length=512, # Truncate long texts\n",
188
+ " truncation=True,\n",
189
+ " use_auth_token=False,\n",
190
+ " revision=\"main\"\n",
191
+ " )\n",
192
+ "\n",
193
+ " print(\"✅ Model loaded successfully with GPU acceleration!\" if torch.cuda.is_available() else \"✅ Model loaded (CPU mode)\")\n",
194
+ "\n",
195
+ "except Exception as e:\n",
196
+ " print(f\"Error with facebook/bart-large-mnli: {e}\")\n",
197
+ " print(\"\\n🔄 Trying alternative model...\")\n",
198
+ "\n",
199
+ " try:\n",
200
+ " pipe = pipeline(\n",
201
+ " \"zero-shot-classification\",\n",
202
+ " model=\"typeform/distilbert-base-uncased-mnli\",\n",
203
+ " device=0 if torch.cuda.is_available() else -1, # GPU support\n",
204
+ " batch_size=64,\n",
205
+ " max_length=512,\n",
206
+ " truncation=True\n",
207
+ " )\n",
208
+ "\n",
209
+ " print(\"✅ Alternative model loaded successfully!\")\n",
210
+ "\n",
211
+ " except Exception as e2:\n",
212
+ " print(f\"❌ Error with alternative model: {e2}\")\n",
213
+ " print(\"Please check your internet connection and try again.\")\n"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "id": "80bc187fbfff3e10",
220
+ "metadata": {
221
+ "ExecuteTime": {
222
+ "end_time": "2025-09-16T04:15:12.296956Z",
223
+ "start_time": "2025-09-16T04:15:12.116659Z"
224
+ }
225
+ },
226
+ "outputs": [],
227
+ "source": [
228
+ "sequence = books.loc[books[\"simple_categories\"] == 'Fiction', 'description'].reset_index(drop=True)[0]"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": null,
234
+ "id": "8ba6836b2c958329",
235
+ "metadata": {
236
+ "ExecuteTime": {
237
+ "end_time": "2025-09-16T04:15:21.478795Z",
238
+ "start_time": "2025-09-16T04:15:14.044833Z"
239
+ }
240
+ },
241
+ "outputs": [],
242
+ "source": [
243
+ "pipe(sequence, fiction_categories)"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": null,
249
+ "id": "23f2c1d7a1c73945",
250
+ "metadata": {
251
+ "ExecuteTime": {
252
+ "end_time": "2025-09-16T03:30:19.621730Z",
253
+ "start_time": "2025-09-16T03:30:12.489364Z"
254
+ }
255
+ },
256
+ "outputs": [],
257
+ "source": [
258
+ "result = pipe(sequence, fiction_categories)\n",
259
+ "max_index = np.argmax(result[\"scores\"])\n",
+ "max_label = result[\"labels\"][max_index]\n",
260
+ "max_label"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": null,
266
+ "id": "eb1273971a44738c",
267
+ "metadata": {
268
+ "ExecuteTime": {
269
+ "end_time": "2025-09-16T04:15:21.672845Z",
270
+ "start_time": "2025-09-16T04:15:21.660563Z"
271
+ }
272
+ },
273
+ "outputs": [],
274
+ "source": [
275
+ "from tqdm import tqdm\n",
276
+ "import pandas as pd\n",
277
+ "import time\n",
278
+ "from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor\n",
279
+ "import multiprocessing as mp\n",
280
+ "\n",
281
+ "# SOLUTION 1: Batch Processing (Most Important!)\n",
282
+ "def generate_predictions(sequences, categories, batch_size=32):\n",
283
+ " \"\"\"Process multiple sequences at once - much faster!\"\"\"\n",
284
+ " predictions = []\n",
285
+ "\n",
286
+ " for i in tqdm(range(0, len(sequences), batch_size), desc=\"Processing batches\"):\n",
287
+ " batch = sequences[i:i+batch_size]\n",
288
+ "\n",
289
+ " # Process entire batch at once\n",
290
+ " batch_results = pipe(batch, categories)\n",
291
+ "\n",
292
+ " # Handle both single result and list of results\n",
293
+ " if isinstance(batch_results, list):\n",
294
+ " predictions.extend([result['labels'][0] for result in batch_results])\n",
295
+ " else:\n",
296
+ " predictions.append(batch_results['labels'][0])\n",
297
+ "\n",
298
+ " return predictions"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": null,
304
+ "id": "7d024a18309a521d",
305
+ "metadata": {
306
+ "ExecuteTime": {
307
+ "end_time": "2025-09-16T04:21:02.847544Z",
308
+ "start_time": "2025-09-16T04:15:23.714181Z"
309
+ }
310
+ },
311
+ "outputs": [],
312
+ "source": [
313
+ "# Get 300 nonfiction descriptions\n",
314
+ "nonfiction_books = books.loc[books[\"simple_categories\"] == 'Nonfiction', 'description'].reset_index(drop=True)[:300]\n",
315
+ "\n",
316
+ "# Truncate for speed\n",
317
+ "sequences = [desc[:400] for desc in nonfiction_books]\n",
318
+ "\n",
319
+ "# Process in batches of 20 (instead of 300 individual calls)\n",
320
+ "preddicted_cats = []\n",
+ "actual_cats = []\n",
+ "batch_size = 20\n",
321
+ "for i in tqdm(range(0, len(sequences), batch_size)):\n",
322
+ " batch = sequences[i:i+batch_size]\n",
323
+ "\n",
324
+ " # One model call for 20 books instead of 20 separate calls\n",
325
+ " results = pipe(batch, fiction_categories)\n",
326
+ "\n",
327
+ " # Extract predictions\n",
328
+ " if isinstance(results, list):\n",
329
+ " preddicted_cats += [r['labels'][0] for r in results]\n",
330
+ " else:\n",
331
+ " preddicted_cats += [results['labels'][0]]\n",
332
+ "\n",
333
+ " actual_cats += ['Nonfiction'] * len(batch)"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "id": "fdc40689dfadf1",
340
+ "metadata": {
341
+ "ExecuteTime": {
342
+ "end_time": "2025-09-16T04:21:08.483550Z",
343
+ "start_time": "2025-09-16T04:21:08.405904Z"
344
+ }
345
+ },
346
+ "outputs": [],
347
+ "source": [
348
+ "predicted_df = pd.DataFrame({\"actual_categories\": actual_cats, \"predicted_categories\": preddicted_cats})"
349
+ ]
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "execution_count": null,
354
+ "id": "ed0907a9093b94d0",
355
+ "metadata": {
356
+ "ExecuteTime": {
357
+ "end_time": "2025-09-16T04:21:16.539324Z",
358
+ "start_time": "2025-09-16T04:21:16.384515Z"
359
+ }
360
+ },
361
+ "outputs": [],
362
+ "source": [
363
+ "predicted_df.head()"
364
+ ]
365
+ },
366
+ {
367
+ "cell_type": "code",
368
+ "execution_count": null,
369
+ "id": "87d924edea28b476",
370
+ "metadata": {
371
+ "ExecuteTime": {
372
+ "end_time": "2025-09-16T04:21:19.825460Z",
373
+ "start_time": "2025-09-16T04:21:19.795117Z"
374
+ }
375
+ },
376
+ "outputs": [],
377
+ "source": [
378
+ "predicted_df['correct_prediction'] = (np.where(predicted_df['actual_categories'] == predicted_df['predicted_categories'], 1, 0)\n",
379
+ " )"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "execution_count": null,
385
+ "id": "6c25043f2e0d694a",
386
+ "metadata": {
387
+ "ExecuteTime": {
388
+ "end_time": "2025-09-16T04:21:22.040362Z",
389
+ "start_time": "2025-09-16T04:21:22.019264Z"
390
+ }
391
+ },
392
+ "outputs": [],
393
+ "source": [
394
+ "predicted_df['correct_prediction'].sum()/len(predicted_df)"
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "execution_count": null,
400
+ "id": "3c3611fc62b1d8df",
401
+ "metadata": {
402
+ "ExecuteTime": {
403
+ "end_time": "2025-09-16T04:21:24.159383Z",
404
+ "start_time": "2025-09-16T04:21:24.001792Z"
405
+ }
406
+ },
407
+ "outputs": [],
408
+ "source": [
409
+ "isbns = []\n",
410
+ "predicted_cats = []\n",
411
+ "\n",
412
+ "missing_cats = books.loc[books['simple_categories'].isna(), ['isbn13', 'description']].reset_index(drop=True)"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "id": "5a6ee7c312cc4605",
419
+ "metadata": {
420
+ "ExecuteTime": {
421
+ "end_time": "2025-09-16T04:48:29.368260Z",
422
+ "start_time": "2025-09-16T04:47:55.181816Z"
423
+ }
424
+ },
425
+ "outputs": [],
426
+ "source": [
427
+ "# Your current code (already run - don't re-run!)\n",
428
+ "sequences = [str(desc)[:200] if pd.notna(desc) else \"\" for desc in missing_cats[\"description\"]]\n",
429
+ "sequences = [seq for seq in sequences if seq.strip()] # This changed the length!\n",
430
+ "isbns = missing_cats[\"isbn13\"].tolist()\n",
431
+ "predicted_cats = generate_predictions(sequences, fiction_categories, batch_size=128)\n",
432
+ "\n",
433
+ "\n"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "execution_count": null,
439
+ "id": "4561a0670452fa3b",
440
+ "metadata": {
441
+ "ExecuteTime": {
442
+ "end_time": "2025-09-16T04:51:30.775050Z",
443
+ "start_time": "2025-09-16T04:51:30.573483Z"
444
+ }
445
+ },
446
+ "outputs": [],
447
+ "source": [
448
+ "# FIX: Get the correct ISBNs that match your filtered sequences\n",
449
+ "descriptions = missing_cats[\"description\"].tolist()\n",
450
+ "isbns_full = missing_cats[\"isbn13\"].tolist()\n",
451
+ "\n",
452
+ "matching_isbns = []\n",
453
+ "for i, desc in enumerate(descriptions):\n",
454
+ " processed_desc = str(desc)[:200] if pd.notna(desc) else \"\"\n",
455
+ " if processed_desc.strip(): # Same condition as your filter\n",
456
+ " matching_isbns.append(isbns_full[i])\n",
457
+ "\n",
458
+ "# Now create DataFrame with matching lengths\n",
459
+ "missing_predicted_df = pd.DataFrame({\n",
460
+ " \"isbn13\": matching_isbns[:len(predicted_cats)], # Safety check\n",
461
+ " \"predicted_categories\": predicted_cats\n",
462
+ "})\n",
463
+ "\n",
464
+ "print(f\"✅ DataFrame created successfully with {len(missing_predicted_df)} rows\")\n",
465
+ "print(f\"📊 Predictions by category:\")\n",
466
+ "print(missing_predicted_df['predicted_categories'].value_counts())\n",
467
+ "\n",
468
+ "# Save results\n",
469
+ "missing_predicted_df.to_csv('missing_categories_predictions.csv', index=False)\n",
470
+ "print(\"💾 Results saved to missing_categories_predictions.csv\")"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": null,
476
+ "id": "72fe9a8b4b28a1c6",
477
+ "metadata": {
478
+ "ExecuteTime": {
479
+ "end_time": "2025-09-16T04:52:15.607087Z",
480
+ "start_time": "2025-09-16T04:52:15.520116Z"
481
+ }
482
+ },
483
+ "outputs": [],
484
+ "source": [
485
+ "missing_predicted_df.head()"
486
+ ]
487
+ },
488
+ {
489
+ "cell_type": "code",
490
+ "execution_count": null,
491
+ "id": "8b1f7af8aebf289e",
492
+ "metadata": {
493
+ "ExecuteTime": {
494
+ "end_time": "2025-09-16T05:00:37.137998Z",
495
+ "start_time": "2025-09-16T05:00:36.885979Z"
496
+ }
497
+ },
498
+ "outputs": [],
499
+ "source": [
500
+ "books = pd.merge(books, missing_predicted_df, on=\"isbn13\", how=\"left\")\n",
501
+ "books[\"simple_categories\"] = np.where(books[\"simple_categories\"].isna(), books[\"predicted_categories\"], books[\"simple_categories\"])\n",
502
+ "books = books.drop(columns=\"predicted_categories\")"
503
+ ]
504
+ },
505
+ {
506
+ "cell_type": "code",
507
+ "execution_count": null,
508
+ "id": "fe5b161193dab1f",
509
+ "metadata": {
510
+ "ExecuteTime": {
511
+ "end_time": "2025-09-16T05:00:50.989276Z",
512
+ "start_time": "2025-09-16T05:00:50.952202Z"
513
+ }
514
+ },
515
+ "outputs": [],
516
+ "source": [
517
+ "books"
518
+ ]
519
+ },
520
+ {
521
+ "cell_type": "code",
522
+ "execution_count": null,
523
+ "id": "9d2e1a8dbbd5d6bc",
524
+ "metadata": {
525
+ "ExecuteTime": {
526
+ "end_time": "2025-09-16T05:01:27.850818Z",
527
+ "start_time": "2025-09-16T05:01:27.781563Z"
528
+ }
529
+ },
530
+ "outputs": [],
531
+ "source": [
532
+ "books[books[\"categories\"].str.lower().isin([\n",
533
+ " \"romance\",\n",
534
+ " \"science fiction\",\n",
535
+ " \"scifi\",\n",
536
+ " \"fantasy\",\n",
537
+ " \"horror\",\n",
538
+ " \"mystery\",\n",
539
+ " \"thriller\",\n",
540
+ " \"comedy\",\n",
541
+ " \"crime\",\n",
542
+ " \"historical\"\n",
543
+ "])]"
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "code",
548
+ "execution_count": null,
549
+ "id": "bd067ee0696cac0b",
550
+ "metadata": {
551
+ "ExecuteTime": {
552
+ "end_time": "2025-09-16T05:04:09.432347Z",
553
+ "start_time": "2025-09-16T05:04:09.246658Z"
554
+ }
555
+ },
556
+ "outputs": [],
557
+ "source": [
558
+ "books.to_csv(\"books_with_categories.csv\", index=False)"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": null,
564
+ "id": "f8879607442c3f0f",
565
+ "metadata": {},
566
+ "outputs": [],
567
+ "source": []
568
+ }
569
+ ],
570
+ "metadata": {
571
+ "kernelspec": {
572
+ "display_name": "Python 3",
573
+ "language": "python",
574
+ "name": "python3"
575
+ },
576
+ "language_info": {
577
+ "codemirror_mode": {
578
+ "name": "ipython",
579
+ "version": 2
580
+ },
581
+ "file_extension": ".py",
582
+ "mimetype": "text/x-python",
583
+ "name": "python",
584
+ "nbconvert_exporter": "python",
585
+ "pygments_lexer": "ipython2",
586
+ "version": "2.7.6"
587
+ }
588
+ },
589
+ "nbformat": 4,
590
+ "nbformat_minor": 5
591
+ }
vector-search.ipynb ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "initial_id",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2025-09-15T07:35:25.488414Z",
10
+ "start_time": "2025-09-15T07:35:25.460656Z"
11
+ }
12
+ },
13
+ "outputs": [],
14
+ "source": [
15
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
16
+ "\n",
17
+ "from langchain_text_splitters import CharacterTextSplitter\n",
18
+ "from langchain_openai import OpenAIEmbeddings\n",
19
+ "from langchain_chroma import Chroma\n",
20
+ "from langchain_core.documents import Document\n"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "id": "9e2d7510161fceb6",
27
+ "metadata": {
28
+ "ExecuteTime": {
29
+ "end_time": "2025-09-15T07:35:27.755330Z",
30
+ "start_time": "2025-09-15T07:35:27.736857Z"
31
+ }
32
+ },
33
+ "outputs": [],
34
+ "source": [
35
+ "from dotenv import load_dotenv\n",
36
+ "\n",
37
+ "import os\n",
38
+ "\n",
39
+ "# Load environment variables\n",
40
+ "load_dotenv()\n"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": null,
46
+ "id": "b1c5ca1012315fd2",
47
+ "metadata": {
48
+ "ExecuteTime": {
49
+ "end_time": "2025-09-15T07:35:30.169857Z",
50
+ "start_time": "2025-09-15T07:35:30.074451Z"
51
+ }
52
+ },
53
+ "outputs": [],
54
+ "source": [
55
+ "import pandas as pd\n",
56
+ "\n",
57
+ "books = pd.read_csv(\"books_cleaned.csv\")"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "id": "694a28505e311eea",
64
+ "metadata": {
65
+ "ExecuteTime": {
66
+ "end_time": "2025-09-15T07:35:32.747269Z",
67
+ "start_time": "2025-09-15T07:35:32.725973Z"
68
+ }
69
+ },
70
+ "outputs": [],
71
+ "source": [
72
+ "books"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "id": "eb17356cf0ecbbef",
79
+ "metadata": {
80
+ "ExecuteTime": {
81
+ "end_time": "2025-09-15T07:35:35.292093Z",
82
+ "start_time": "2025-09-15T07:35:35.243618Z"
83
+ }
84
+ },
85
+ "outputs": [],
86
+ "source": [
87
+ "books[\"tagged_description\"].to_csv(\"tagged_description.txt\",\n",
88
+ " index=False,\n",
89
+ " header=False)\n"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "id": "2db289c35716805c",
96
+ "metadata": {
97
+ "ExecuteTime": {
98
+ "end_time": "2025-09-15T07:35:42.877672Z",
99
+ "start_time": "2025-09-15T07:35:42.683378Z"
100
+ }
101
+ },
102
+ "outputs": [],
103
+ "source": [
104
+ "\n",
105
+ "# Load the file manually (more reliable)\n",
106
+ "with open(\"tagged_description.txt\", 'r', encoding='utf-8') as file:\n",
107
+ " content = file.read()\n",
108
+ "\n",
109
+ "# Create a document object\n",
110
+ "raw_documents = [Document(page_content=content)]\n",
111
+ "\n",
112
+ "# Split into chunks\n",
113
+ "text_splitter = CharacterTextSplitter(\n",
114
+ " chunk_size=1500, # Increased to avoid warnings\n",
115
+ " chunk_overlap=150,\n",
116
+ " separator=\"\\n\"\n",
117
+ ")\n",
118
+ "\n",
119
+ "documents = text_splitter.split_documents(raw_documents)\n",
120
+ "\n",
121
+ "print(f\"Successfully created {len(documents)} chunks\")\n",
122
+ "print(f\"First chunk preview:\\n{documents[0].page_content[:200]}...\")"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": null,
128
+ "id": "12d6dc1c1f518682",
129
+ "metadata": {
130
+ "ExecuteTime": {
131
+ "end_time": "2025-09-15T07:35:45.472985Z",
132
+ "start_time": "2025-09-15T07:35:45.467714Z"
133
+ }
134
+ },
135
+ "outputs": [],
136
+ "source": [
137
+ "documents[0]"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": null,
143
+ "id": "d73b0e5261855919",
144
+ "metadata": {
145
+ "ExecuteTime": {
146
+ "end_time": "2025-09-15T07:47:27.888830Z",
147
+ "start_time": "2025-09-15T07:36:56.075724Z"
148
+ }
149
+ },
150
+ "outputs": [],
151
+ "source": [
152
+ "!pip install sentence_transformers\n",
153
+ "embeddings = HuggingFaceEmbeddings(\n",
154
+ " model_name=\"all-MiniLM-L6-v2\", # Free, fast, and good quality\n",
155
+ " model_kwargs={'device': 'cpu'} # Use 'cuda' if you have a GPU\n",
156
+ ")\n",
157
+ "\n",
158
+ "db_books = Chroma.from_documents(\n",
159
+ " documents,\n",
160
+ " embedding=embeddings\n",
161
+ ")"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "markdown",
166
+ "id": "9473a4b393977d6f",
167
+ "metadata": {},
168
+ "source": []
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "id": "8c28a61479deb520",
174
+ "metadata": {
175
+ "ExecuteTime": {
176
+ "end_time": "2025-09-15T07:47:49.568125Z",
177
+ "start_time": "2025-09-15T07:47:49.337737Z"
178
+ }
179
+ },
180
+ "outputs": [],
181
+ "source": [
182
+ "query = \"A book to teach children about nature\"\n",
183
+ "docs = db_books.similarity_search(query, k = 10)\n",
184
+ "docs"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": null,
190
+ "id": "57cebcff1d436b6a",
191
+ "metadata": {
192
+ "ExecuteTime": {
193
+ "end_time": "2025-09-15T07:54:09.612026Z",
194
+ "start_time": "2025-09-15T07:54:09.538027Z"
195
+ }
196
+ },
197
+ "outputs": [],
198
+ "source": [
199
+ "# Extract and clean the ISBN\n",
200
+ "isbn_str = docs[0].page_content.split()[0].strip()\n",
201
+ "# Remove quotes and convert to float first, then int\n",
202
+ "isbn_clean = isbn_str.replace('\"', '').replace(\"'\", \"\")\n",
203
+ "isbn_int = int(float(isbn_clean)) # float first to handle .0, then int\n",
204
+ "\n",
205
+ "# Now search\n",
206
+ "result = books[books[\"isbn13\"] == isbn_int]"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": null,
212
+ "id": "4155cc001df44e93",
213
+ "metadata": {
214
+ "ExecuteTime": {
215
+ "end_time": "2025-09-15T07:54:49.157935Z",
216
+ "start_time": "2025-09-15T07:54:49.088922Z"
217
+ }
218
+ },
219
+ "outputs": [],
220
+ "source": [
221
+ "result"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "code",
226
+ "execution_count": null,
227
+ "id": "4c644a4b395fda08",
228
+ "metadata": {
229
+ "ExecuteTime": {
230
+ "end_time": "2025-09-15T08:23:46.545582Z",
231
+ "start_time": "2025-09-15T08:23:46.531998Z"
232
+ }
233
+ },
234
+ "outputs": [],
235
+ "source": [
236
+ "def retrieve_semantic_recommendations(\n",
237
+ " query: str,\n",
238
+ " top_k: int = 10,\n",
239
+ ") -> pd.DataFrame:\n",
240
+ " recs = db_books.similarity_search(query, k=50)\n",
241
+ "\n",
242
+ " books_list = []\n",
243
+ "\n",
244
+ " for i in range(0, len(recs)):\n",
245
+ " isbn_str = recs[i].page_content.strip('\"').split()[0]\n",
246
+ " books_list += [int(float(isbn_str))] # float() first, then int()\n",
247
+ "\n",
248
+ " return books[books[\"isbn13\"].isin(books_list)].head(top_k)"
249
+ ]
250
+ },
251
+ {
252
+ "cell_type": "code",
253
+ "execution_count": null,
254
+ "id": "b9eada846c702825",
255
+ "metadata": {
256
+ "ExecuteTime": {
257
+ "end_time": "2025-09-15T08:23:47.659278Z",
258
+ "start_time": "2025-09-15T08:23:47.501425Z"
259
+ }
260
+ },
261
+ "outputs": [],
262
+ "source": [
263
+ "retrieve_semantic_recommendations(\"A book to teach children about nature\")"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": null,
269
+ "id": "36d5bb5ac34f9b2d",
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": []
273
+ }
274
+ ],
275
+ "metadata": {
276
+ "kernelspec": {
277
+ "display_name": "Python 3",
278
+ "language": "python",
279
+ "name": "python3"
280
+ },
281
+ "language_info": {
282
+ "codemirror_mode": {
283
+ "name": "ipython",
284
+ "version": 2
285
+ },
286
+ "file_extension": ".py",
287
+ "mimetype": "text/x-python",
288
+ "name": "python",
289
+ "nbconvert_exporter": "python",
290
+ "pygments_lexer": "ipython2",
291
+ "version": "2.7.6"
292
+ }
293
+ },
294
+ "nbformat": 4,
295
+ "nbformat_minor": 5
296
+ }