Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files- data-exploration.ipynb +443 -0
- gradio-dashboard.py +117 -0
- sentiment-analysis.ipynb +445 -0
- tagged_description.txt +0 -0
- text-classification.ipynb +591 -0
- vector-search.ipynb +296 -0
data-exploration.ipynb
ADDED
|
@@ -0,0 +1,443 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "initial_id",
|
| 7 |
+
"metadata": {
|
| 8 |
+
"ExecuteTime": {
|
| 9 |
+
"end_time": "2025-09-14T07:12:03.050818Z",
|
| 10 |
+
"start_time": "2025-09-14T07:11:56.152605Z"
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"outputs": [],
|
| 14 |
+
"source": [
|
| 15 |
+
"from statistics import correlation\n",
|
| 16 |
+
"\n",
|
| 17 |
+
"import kagglehub\n",
|
| 18 |
+
"\n",
|
| 19 |
+
"# Download latest version\n",
|
| 20 |
+
"path = kagglehub.dataset_download(\"dylanjcastillo/7k-books-with-metadata\")\n",
|
| 21 |
+
"\n",
|
| 22 |
+
"print(\"Path to dataset files:\", path)"
|
| 23 |
+
]
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"cell_type": "code",
|
| 27 |
+
"execution_count": null,
|
| 28 |
+
"id": "ae99194daafd1775",
|
| 29 |
+
"metadata": {
|
| 30 |
+
"ExecuteTime": {
|
| 31 |
+
"end_time": "2025-09-14T07:51:53.432293Z",
|
| 32 |
+
"start_time": "2025-09-14T07:51:52.436694Z"
|
| 33 |
+
}
|
| 34 |
+
},
|
| 35 |
+
"outputs": [],
|
| 36 |
+
"source": [
|
| 37 |
+
"import pandas as pd"
|
| 38 |
+
]
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"cell_type": "code",
|
| 42 |
+
"execution_count": null,
|
| 43 |
+
"id": "6df67758ebb1137c",
|
| 44 |
+
"metadata": {
|
| 45 |
+
"ExecuteTime": {
|
| 46 |
+
"end_time": "2025-09-14T08:03:25.179234Z",
|
| 47 |
+
"start_time": "2025-09-14T08:03:24.185253Z"
|
| 48 |
+
}
|
| 49 |
+
},
|
| 50 |
+
"outputs": [],
|
| 51 |
+
"source": [
|
| 52 |
+
"from pathlib import Path\n",
|
| 53 |
+
"\n",
|
| 54 |
+
"# Convert string path → Path object\n",
|
| 55 |
+
"path = Path(kagglehub.dataset_download(\"dylanjcastillo/7k-books-with-metadata\"))\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"books = pd.read_csv(path / \"books.csv\")"
|
| 58 |
+
]
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"cell_type": "code",
|
| 62 |
+
"execution_count": null,
|
| 63 |
+
"id": "94828bc9ccbfafa1",
|
| 64 |
+
"metadata": {
|
| 65 |
+
"ExecuteTime": {
|
| 66 |
+
"end_time": "2025-09-14T08:03:37.133785Z",
|
| 67 |
+
"start_time": "2025-09-14T08:03:37.079170Z"
|
| 68 |
+
}
|
| 69 |
+
},
|
| 70 |
+
"outputs": [],
|
| 71 |
+
"source": [
|
| 72 |
+
"books"
|
| 73 |
+
]
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"cell_type": "code",
|
| 77 |
+
"execution_count": null,
|
| 78 |
+
"id": "9403c10bb9a0112e",
|
| 79 |
+
"metadata": {
|
| 80 |
+
"ExecuteTime": {
|
| 81 |
+
"end_time": "2025-09-14T08:12:20.943772Z",
|
| 82 |
+
"start_time": "2025-09-14T08:12:16.468843Z"
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"outputs": [],
|
| 86 |
+
"source": [
|
| 87 |
+
"import seaborn as sns\n",
|
| 88 |
+
"import matplotlib.pyplot as plt"
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"cell_type": "code",
|
| 93 |
+
"execution_count": null,
|
| 94 |
+
"id": "aaba3c5cc9492dbc",
|
| 95 |
+
"metadata": {
|
| 96 |
+
"ExecuteTime": {
|
| 97 |
+
"end_time": "2025-09-14T08:16:47.484763Z",
|
| 98 |
+
"start_time": "2025-09-14T08:16:47.134190Z"
|
| 99 |
+
}
|
| 100 |
+
},
|
| 101 |
+
"outputs": [],
|
| 102 |
+
"source": [
|
| 103 |
+
"ax = plt.axes()\n",
|
| 104 |
+
"sns.heatmap(books.isna().transpose(), cbar = False , ax=ax)\n",
|
| 105 |
+
"\n",
|
| 106 |
+
"plt.xlabel(\"Columns\")\n",
|
| 107 |
+
"plt.ylabel(\"Missing values\")\n",
|
| 108 |
+
"\n",
|
| 109 |
+
"plt.show()"
|
| 110 |
+
]
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"cell_type": "code",
|
| 114 |
+
"execution_count": null,
|
| 115 |
+
"id": "5020d8ec7f517390",
|
| 116 |
+
"metadata": {
|
| 117 |
+
"ExecuteTime": {
|
| 118 |
+
"end_time": "2025-09-14T08:34:19.472432Z",
|
| 119 |
+
"start_time": "2025-09-14T08:34:19.396405Z"
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"outputs": [],
|
| 123 |
+
"source": [
|
| 124 |
+
"import numpy as np\n",
|
| 125 |
+
"books[\"missing_description\"] = np.where(books[\"description\"].isna(), 1, 0)\n",
|
| 126 |
+
"books[\"age_of_book\"] = 2024 - books[\"published_year\"]"
|
| 127 |
+
]
|
| 128 |
+
},
|
| 129 |
+
{
|
| 130 |
+
"cell_type": "code",
|
| 131 |
+
"execution_count": null,
|
| 132 |
+
"id": "8693f57773a2f2ca",
|
| 133 |
+
"metadata": {
|
| 134 |
+
"ExecuteTime": {
|
| 135 |
+
"end_time": "2025-09-14T08:45:22.257526Z",
|
| 136 |
+
"start_time": "2025-09-14T08:45:22.005185Z"
|
| 137 |
+
}
|
| 138 |
+
},
|
| 139 |
+
"outputs": [],
|
| 140 |
+
"source": [
|
| 141 |
+
"columns_of_interest = [\"num_pages\", \"age_of_book\", \"missing_description\", \"average_rating\"]\n",
|
| 142 |
+
"correlation_matrix = books[columns_of_interest].corr(method = \"spearman\")\n",
|
| 143 |
+
"sns.set_theme(style=\"white\")\n",
|
| 144 |
+
"plt.figure(figsize = (8, 6))\n",
|
| 145 |
+
"heatmap = sns.heatmap(correlation_matrix, annot=True, fmt=\".2f\", cmap=\"coolwarm\", cbar_kws={\"label\": \"Spearman Correlation\"})\n",
|
| 146 |
+
"\n",
|
| 147 |
+
"heatmap.set_title(\"Correlation Heatmap\")\n",
|
| 148 |
+
"plt.show()"
|
| 149 |
+
]
|
| 150 |
+
},
|
| 151 |
+
{
|
| 152 |
+
"cell_type": "code",
|
| 153 |
+
"execution_count": null,
|
| 154 |
+
"id": "1218eb9769f7ec28",
|
| 155 |
+
"metadata": {
|
| 156 |
+
"ExecuteTime": {
|
| 157 |
+
"end_time": "2025-09-14T09:01:30.451492Z",
|
| 158 |
+
"start_time": "2025-09-14T09:01:30.397573Z"
|
| 159 |
+
}
|
| 160 |
+
},
|
| 161 |
+
"outputs": [],
|
| 162 |
+
"source": [
|
| 163 |
+
"books_missing = books[(books[\"description\"].isna()) |\n",
|
| 164 |
+
" ~(books[\"num_pages\"].isna()) &\n",
|
| 165 |
+
" ~(books[\"average_rating\"].isna()) &\n",
|
| 166 |
+
" ~(books[\"published_year\"].isna())\n",
|
| 167 |
+
"]"
|
| 168 |
+
]
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"cell_type": "code",
|
| 172 |
+
"execution_count": null,
|
| 173 |
+
"id": "a16b79d748237fa6",
|
| 174 |
+
"metadata": {
|
| 175 |
+
"ExecuteTime": {
|
| 176 |
+
"end_time": "2025-09-14T09:29:57.037634Z",
|
| 177 |
+
"start_time": "2025-09-14T09:29:56.971479Z"
|
| 178 |
+
}
|
| 179 |
+
},
|
| 180 |
+
"outputs": [],
|
| 181 |
+
"source": [
|
| 182 |
+
"books_missing = books[~(books[\"description\"].isna()) &\n",
|
| 183 |
+
" ~(books[\"num_pages\"].isna()) &\n",
|
| 184 |
+
" ~(books[\"average_rating\"].isna()) &\n",
|
| 185 |
+
" ~(books[\"published_year\"].isna())\n",
|
| 186 |
+
"]"
|
| 187 |
+
]
|
| 188 |
+
},
|
| 189 |
+
{
|
| 190 |
+
"cell_type": "code",
|
| 191 |
+
"execution_count": null,
|
| 192 |
+
"id": "997cafb5e60fef34",
|
| 193 |
+
"metadata": {
|
| 194 |
+
"ExecuteTime": {
|
| 195 |
+
"end_time": "2025-09-14T09:30:14.028246Z",
|
| 196 |
+
"start_time": "2025-09-14T09:30:13.969750Z"
|
| 197 |
+
}
|
| 198 |
+
},
|
| 199 |
+
"outputs": [],
|
| 200 |
+
"source": [
|
| 201 |
+
"books_missing"
|
| 202 |
+
]
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"cell_type": "code",
|
| 206 |
+
"execution_count": null,
|
| 207 |
+
"id": "6aad6ddc169cf39d",
|
| 208 |
+
"metadata": {
|
| 209 |
+
"ExecuteTime": {
|
| 210 |
+
"end_time": "2025-09-14T09:33:31.453933Z",
|
| 211 |
+
"start_time": "2025-09-14T09:33:31.395084Z"
|
| 212 |
+
}
|
| 213 |
+
},
|
| 214 |
+
"outputs": [],
|
| 215 |
+
"source": [
|
| 216 |
+
"books_missing[\"categories\"].value_counts().reset_index().sort_values(\"count\", ascending=False)"
|
| 217 |
+
]
|
| 218 |
+
},
|
| 219 |
+
{
|
| 220 |
+
"cell_type": "code",
|
| 221 |
+
"execution_count": null,
|
| 222 |
+
"id": "a7c0628d5619c32b",
|
| 223 |
+
"metadata": {
|
| 224 |
+
"ExecuteTime": {
|
| 225 |
+
"end_time": "2025-09-14T09:52:43.243363Z",
|
| 226 |
+
"start_time": "2025-09-14T09:52:43.211576Z"
|
| 227 |
+
}
|
| 228 |
+
},
|
| 229 |
+
"outputs": [],
|
| 230 |
+
"source": [
|
| 231 |
+
"books_missing"
|
| 232 |
+
]
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"cell_type": "code",
|
| 236 |
+
"execution_count": null,
|
| 237 |
+
"id": "b971c57a22e2721e",
|
| 238 |
+
"metadata": {
|
| 239 |
+
"ExecuteTime": {
|
| 240 |
+
"end_time": "2025-09-14T10:06:37.305268Z",
|
| 241 |
+
"start_time": "2025-09-14T10:06:37.242773Z"
|
| 242 |
+
}
|
| 243 |
+
},
|
| 244 |
+
"outputs": [],
|
| 245 |
+
"source": [
|
| 246 |
+
"books_missing.loc[:, \"words_in_description\"] = books_missing[\"description\"].str.split().str.len()\n"
|
| 247 |
+
]
|
| 248 |
+
},
|
| 249 |
+
{
|
| 250 |
+
"cell_type": "code",
|
| 251 |
+
"execution_count": null,
|
| 252 |
+
"id": "5cf80ede1a996820",
|
| 253 |
+
"metadata": {
|
| 254 |
+
"ExecuteTime": {
|
| 255 |
+
"end_time": "2025-09-14T10:07:11.889795Z",
|
| 256 |
+
"start_time": "2025-09-14T10:07:11.815772Z"
|
| 257 |
+
}
|
| 258 |
+
},
|
| 259 |
+
"outputs": [],
|
| 260 |
+
"source": [
|
| 261 |
+
"books_missing"
|
| 262 |
+
]
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"cell_type": "code",
|
| 266 |
+
"execution_count": null,
|
| 267 |
+
"id": "d4a20c7b8a28d843",
|
| 268 |
+
"metadata": {
|
| 269 |
+
"ExecuteTime": {
|
| 270 |
+
"end_time": "2025-09-14T10:16:26.757853Z",
|
| 271 |
+
"start_time": "2025-09-14T10:16:26.738194Z"
|
| 272 |
+
}
|
| 273 |
+
},
|
| 274 |
+
"outputs": [],
|
| 275 |
+
"source": [
|
| 276 |
+
"print(books_missing.loc[books_missing[\"words_in_description\"].between(25, 34), [\"description\", \"words_in_description\"]])\n"
|
| 277 |
+
]
|
| 278 |
+
},
|
| 279 |
+
{
|
| 280 |
+
"cell_type": "code",
|
| 281 |
+
"execution_count": null,
|
| 282 |
+
"id": "add578fb79f75576",
|
| 283 |
+
"metadata": {
|
| 284 |
+
"ExecuteTime": {
|
| 285 |
+
"end_time": "2025-09-14T10:18:38.671378Z",
|
| 286 |
+
"start_time": "2025-09-14T10:18:38.655678Z"
|
| 287 |
+
}
|
| 288 |
+
},
|
| 289 |
+
"outputs": [],
|
| 290 |
+
"source": [
|
| 291 |
+
"books_missing_25_words = books_missing[books_missing[\"words_in_description\"] >= 25]"
|
| 292 |
+
]
|
| 293 |
+
},
|
| 294 |
+
{
|
| 295 |
+
"cell_type": "code",
|
| 296 |
+
"execution_count": null,
|
| 297 |
+
"id": "337cc14a7592597",
|
| 298 |
+
"metadata": {
|
| 299 |
+
"ExecuteTime": {
|
| 300 |
+
"end_time": "2025-09-14T10:18:45.020133Z",
|
| 301 |
+
"start_time": "2025-09-14T10:18:44.995404Z"
|
| 302 |
+
}
|
| 303 |
+
},
|
| 304 |
+
"outputs": [],
|
| 305 |
+
"source": [
|
| 306 |
+
"books_missing_25_words"
|
| 307 |
+
]
|
| 308 |
+
},
|
| 309 |
+
{
|
| 310 |
+
"cell_type": "code",
|
| 311 |
+
"execution_count": null,
|
| 312 |
+
"id": "15505042aaae206b",
|
| 313 |
+
"metadata": {
|
| 314 |
+
"ExecuteTime": {
|
| 315 |
+
"end_time": "2025-09-14T10:36:25.385493Z",
|
| 316 |
+
"start_time": "2025-09-14T10:36:25.348788Z"
|
| 317 |
+
}
|
| 318 |
+
},
|
| 319 |
+
"outputs": [],
|
| 320 |
+
"source": [
|
| 321 |
+
"books_missing_25_words.loc[:, \"title_and_subtitle\"] = np.where(\n",
|
| 322 |
+
" books_missing_25_words[\"subtitle\"].isna(),\n",
|
| 323 |
+
" books_missing_25_words[\"title\"],\n",
|
| 324 |
+
" books_missing_25_words[[\"title\", \"subtitle\"]].astype(str).agg(\": \".join, axis=1)\n",
|
| 325 |
+
")\n"
|
| 326 |
+
]
|
| 327 |
+
},
|
| 328 |
+
{
|
| 329 |
+
"cell_type": "code",
|
| 330 |
+
"execution_count": null,
|
| 331 |
+
"id": "8f48839b393f1be6",
|
| 332 |
+
"metadata": {
|
| 333 |
+
"ExecuteTime": {
|
| 334 |
+
"end_time": "2025-09-14T10:36:36.463971Z",
|
| 335 |
+
"start_time": "2025-09-14T10:36:36.442637Z"
|
| 336 |
+
}
|
| 337 |
+
},
|
| 338 |
+
"outputs": [],
|
| 339 |
+
"source": [
|
| 340 |
+
"books_missing_25_words"
|
| 341 |
+
]
|
| 342 |
+
},
|
| 343 |
+
{
|
| 344 |
+
"cell_type": "code",
|
| 345 |
+
"execution_count": null,
|
| 346 |
+
"id": "1033bd78abfa34a3",
|
| 347 |
+
"metadata": {
|
| 348 |
+
"ExecuteTime": {
|
| 349 |
+
"end_time": "2025-09-14T10:38:24.509449Z",
|
| 350 |
+
"start_time": "2025-09-14T10:38:24.480830Z"
|
| 351 |
+
}
|
| 352 |
+
},
|
| 353 |
+
"outputs": [],
|
| 354 |
+
"source": [
|
| 355 |
+
"books_missing_25_words[\"title_and_subtitle\"].value_counts().reset_index().sort_values(\"count\", ascending=False)"
|
| 356 |
+
]
|
| 357 |
+
},
|
| 358 |
+
{
|
| 359 |
+
"cell_type": "code",
|
| 360 |
+
"execution_count": null,
|
| 361 |
+
"id": "1871d27d7eb01493",
|
| 362 |
+
"metadata": {
|
| 363 |
+
"ExecuteTime": {
|
| 364 |
+
"end_time": "2025-09-14T10:45:15.551772Z",
|
| 365 |
+
"start_time": "2025-09-14T10:45:15.504051Z"
|
| 366 |
+
}
|
| 367 |
+
},
|
| 368 |
+
"outputs": [],
|
| 369 |
+
"source": [
|
| 370 |
+
"books_missing_25_words = books_missing_25_words.copy() # copy to avoid the pandas slice (SettingWithCopy) warning\n",
|
| 371 |
+
"\n",
|
| 372 |
+
"books_missing_25_words.loc[:, \"tagged_description\"] = (\n",
|
| 373 |
+
" books_missing_25_words[[\"isbn13\", \"description\"]]\n",
|
| 374 |
+
" .astype(str)\n",
|
| 375 |
+
" .agg(\" \".join, axis=1)\n",
|
| 376 |
+
")\n"
|
| 377 |
+
]
|
| 378 |
+
},
|
| 379 |
+
{
|
| 380 |
+
"cell_type": "code",
|
| 381 |
+
"execution_count": null,
|
| 382 |
+
"id": "20a704320865f12b",
|
| 383 |
+
"metadata": {
|
| 384 |
+
"ExecuteTime": {
|
| 385 |
+
"end_time": "2025-09-14T10:45:38.585999Z",
|
| 386 |
+
"start_time": "2025-09-14T10:45:38.566081Z"
|
| 387 |
+
}
|
| 388 |
+
},
|
| 389 |
+
"outputs": [],
|
| 390 |
+
"source": [
|
| 391 |
+
"books_missing_25_words"
|
| 392 |
+
]
|
| 393 |
+
},
|
| 394 |
+
{
|
| 395 |
+
"cell_type": "code",
|
| 396 |
+
"execution_count": null,
|
| 397 |
+
"id": "36a89080af8a4f1c",
|
| 398 |
+
"metadata": {
|
| 399 |
+
"ExecuteTime": {
|
| 400 |
+
"end_time": "2025-09-14T10:49:30.500326Z",
|
| 401 |
+
"start_time": "2025-09-14T10:49:30.213437Z"
|
| 402 |
+
}
|
| 403 |
+
},
|
| 404 |
+
"outputs": [],
|
| 405 |
+
"source": [
|
| 406 |
+
"(\n",
|
| 407 |
+
" books_missing_25_words\n",
|
| 408 |
+
" .drop([\"subtitle\", \"missing_description\", \"age_of_book\", \"words_in_description\"], axis=1)\n",
|
| 409 |
+
" .to_csv(\"books_cleaned.csv\", index = False)\n",
|
| 410 |
+
")"
|
| 411 |
+
]
|
| 412 |
+
},
|
| 413 |
+
{
|
| 414 |
+
"cell_type": "code",
|
| 415 |
+
"execution_count": null,
|
| 416 |
+
"id": "a2308b29e727ba70",
|
| 417 |
+
"metadata": {},
|
| 418 |
+
"outputs": [],
|
| 419 |
+
"source": []
|
| 420 |
+
}
|
| 421 |
+
],
|
| 422 |
+
"metadata": {
|
| 423 |
+
"kernelspec": {
|
| 424 |
+
"display_name": "Python 3",
|
| 425 |
+
"language": "python",
|
| 426 |
+
"name": "python3"
|
| 427 |
+
},
|
| 428 |
+
"language_info": {
|
| 429 |
+
"codemirror_mode": {
|
| 430 |
+
"name": "ipython",
|
| 431 |
+
"version": 2
|
| 432 |
+
},
|
| 433 |
+
"file_extension": ".py",
|
| 434 |
+
"mimetype": "text/x-python",
|
| 435 |
+
"name": "python",
|
| 436 |
+
"nbconvert_exporter": "python",
|
| 437 |
+
"pygments_lexer": "ipython2",
|
| 438 |
+
"version": "2.7.6"
|
| 439 |
+
}
|
| 440 |
+
},
|
| 441 |
+
"nbformat": 4,
|
| 442 |
+
"nbformat_minor": 5
|
| 443 |
+
}
|
gradio-dashboard.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
from langchain.schema import Document
|
| 6 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 7 |
+
from langchain_chroma import Chroma
|
| 8 |
+
|
| 9 |
+
import gradio as gr
|
| 10 |
+
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
# Load the enriched book table and derive a cover URL for every row.
books = pd.read_csv("books_with_emotions.csv")

# Append the size hint to each thumbnail URL; rows without a thumbnail stay
# missing after the concatenation and fall back to the bundled placeholder.
sized_thumbnails = books["thumbnail"] + "&fife=w800"
books["large_thumbnail"] = sized_thumbnails.fillna("cover-not-found.jpg")

# Build one document per book straight from the DataFrame. The ISBN-13 is
# the first whitespace-separated token, followed by the description.
documents = [
    Document(page_content=f"{isbn} {description}")
    for isbn, description in zip(books["isbn13"], books["description"])
]

# Embed the documents with a HuggingFace model and index them in Chroma.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db_books = Chroma.from_documents(documents, embeddings)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def retrieve_semantic_recommendations(
    query: str,
    category: str = None,
    tone: str = None,
    initial_top_k: int = 50,
    final_top_k: int = 16,
) -> pd.DataFrame:
    """Return the books that best match ``query``, optionally filtered and re-ranked.

    Args:
        query: Free-text description passed to the vector store search.
        category: Value of ``simple_categories`` to keep. ``None`` and
            ``"All"`` both mean "do not filter by category".
        tone: One of "Happy", "Surprising", "Angry", "Suspenseful", "Sad".
            Any other value (including ``None``) leaves the order untouched.
        initial_top_k: Number of hits requested from the vector store.
        final_top_k: Maximum number of rows returned.

    Returns:
        Rows of the module-level ``books`` frame, at most ``final_top_k``.
    """
    recs = db_books.similarity_search(query, k=initial_top_k)

    # Each stored document begins with the ISBN-13; going through float()
    # also accepts a value that was serialized with a trailing ".0".
    ranked_isbns = [
        int(float(rec.page_content.strip('"').split()[0])) for rec in recs
    ]

    # Remember the best (first) position of every ISBN in the search result.
    rank_of = {}
    for position, isbn in enumerate(ranked_isbns):
        rank_of.setdefault(isbn, position)

    book_recs = books[books["isbn13"].isin(ranked_isbns)]
    # The isin() selection comes back in file order; restore the search
    # ranking so the truncation below keeps the closest matches.
    search_order = book_recs["isbn13"].map(rank_of).to_numpy().argsort(kind="stable")
    book_recs = book_recs.iloc[search_order].head(initial_top_k)

    # The default category is None, which must behave like "All".
    if category is not None and category != "All":
        book_recs = book_recs[book_recs["simple_categories"] == category]
    book_recs = book_recs.head(final_top_k)

    # Re-rank by the requested emotion only when that score column exists.
    tone_to_column = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }
    emotion_column = tone_to_column.get(tone)
    if emotion_column is not None and emotion_column in book_recs.columns:
        book_recs = book_recs.sort_values(by=emotion_column, ascending=False)

    return book_recs
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _format_authors(authors) -> str:
    """Turn a ';'-separated author field into a readable, comma-joined list."""
    # pandas represents a missing CSV cell as a float NaN, which has no
    # .split(); fall back to a placeholder instead of raising.
    if not isinstance(authors, str):
        return "Unknown author"

    names = authors.split(";")
    if len(names) == 2:
        return f"{names[0]} and {names[1]}"
    if len(names) > 2:
        return f"{', '.join(names[:-1])}, and {names[-1]}"
    return authors


def recommend_books(
    query: str,
    category: str,
    tone: str
):
    """Build the gallery entries for the books recommended for ``query``.

    Args:
        query: Free-text description entered by the user.
        category: Category filter forwarded to the retrieval step.
        tone: Emotional tone forwarded to the retrieval step.

    Returns:
        A list of ``(image, caption)`` tuples, one per recommended book. The
        caption is "<title> by <authors>: <first 30 words of description>...".
    """
    recommendations = retrieve_semantic_recommendations(query, category, tone)
    results = []

    for _, row in recommendations.iterrows():
        description = row["description"]
        # A missing description is not a string; show an empty preview
        # rather than failing on .split().
        words = description.split() if isinstance(description, str) else []
        truncated_description = " ".join(words[:30]) + "..."

        authors_str = _format_authors(row["authors"])

        caption = f"{row['title']} by {authors_str}: {truncated_description}"
        results.append((row["large_thumbnail"], caption))
    return results
|
| 88 |
+
|
| 89 |
+
# Category choices: "All" first, then every distinct category in sorted
# order. Missing values are dropped first so sorting never compares NaN
# with strings.
known_categories = books["simple_categories"].dropna().unique()
categories = ["All"] + sorted(known_categories)

# Tone choices: offer an emotional tone only when its score column is
# actually present in the loaded data.
emotion_columns = ["joy", "surprise", "anger", "fear", "sadness"]
emotion_labels = ["Happy", "Surprising", "Angry", "Suspenseful", "Sad"]
label_for_column = dict(zip(emotion_columns, emotion_labels))
available_emotions = [
    label_for_column[column]
    for column in emotion_columns
    if column in books.columns
]
tones = ["All"] + available_emotions
|
| 97 |
+
|
| 98 |
+
# Dashboard layout: a query row (text box, two dropdowns, submit button)
# followed by a gallery that displays the recommended covers.
with gr.Blocks(theme=gr.themes.Glass()) as dashboard:
    gr.Markdown("# Semantic book recommender")

    with gr.Row():
        user_query = gr.Textbox(
            label="Please enter a description of a book:",
            placeholder="e.g., A story about forgiveness",
        )
        category_dropdown = gr.Dropdown(
            choices=categories,
            label="Select a category:",
            value="All",
        )
        tone_dropdown = gr.Dropdown(
            choices=tones,
            label="Select an emotional tone:",
            value="All",
        )
        submit_button = gr.Button("Find recommendations")

    gr.Markdown("## Recommendations")
    output = gr.Gallery(label="Recommended books", columns=8, rows=2)

    # Clicking the button sends the three inputs to recommend_books and
    # renders the returned (image, caption) pairs in the gallery.
    search_inputs = [user_query, category_dropdown, tone_dropdown]
    submit_button.click(fn=recommend_books, inputs=search_inputs, outputs=output)


if __name__ == "__main__":
    dashboard.launch()
|
sentiment-analysis.ipynb
ADDED
|
@@ -0,0 +1,445 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "initial_id",
|
| 7 |
+
"metadata": {
|
| 8 |
+
"ExecuteTime": {
|
| 9 |
+
"end_time": "2025-09-16T09:43:18.055617Z",
|
| 10 |
+
"start_time": "2025-09-16T09:43:17.869905Z"
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"outputs": [],
|
| 14 |
+
"source": [
|
| 15 |
+
"import pandas as pd\n",
|
| 16 |
+
"\n",
|
| 17 |
+
"books = pd.read_csv(\"books_with_categories.csv\")"
|
| 18 |
+
]
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"cell_type": "code",
|
| 22 |
+
"execution_count": null,
|
| 23 |
+
"id": "3d9a521af5640cd2",
|
| 24 |
+
"metadata": {
|
| 25 |
+
"ExecuteTime": {
|
| 26 |
+
"end_time": "2025-09-16T09:43:20.918046Z",
|
| 27 |
+
"start_time": "2025-09-16T09:43:18.066451Z"
|
| 28 |
+
}
|
| 29 |
+
},
|
| 30 |
+
"outputs": [],
|
| 31 |
+
"source": [
|
| 32 |
+
"!pip install torch transformers\n"
|
| 33 |
+
]
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"cell_type": "code",
|
| 37 |
+
"execution_count": null,
|
| 38 |
+
"id": "a222cc24cb3d9e50",
|
| 39 |
+
"metadata": {
|
| 40 |
+
"ExecuteTime": {
|
| 41 |
+
"end_time": "2025-09-16T09:43:20.956314Z",
|
| 42 |
+
"start_time": "2025-09-16T09:43:20.934627Z"
|
| 43 |
+
}
|
| 44 |
+
},
|
| 45 |
+
"outputs": [],
|
| 46 |
+
"source": [
|
| 47 |
+
"import torch\n",
|
| 48 |
+
"import transformers\n",
|
| 49 |
+
"print(f\"PyTorch version: {torch.__version__}\")\n",
|
| 50 |
+
"print(f\"Transformers version: {transformers.__version__}\")"
|
| 51 |
+
]
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"cell_type": "code",
|
| 55 |
+
"execution_count": null,
|
| 56 |
+
"id": "418145b8ff28c108",
|
| 57 |
+
"metadata": {
|
| 58 |
+
"ExecuteTime": {
|
| 59 |
+
"end_time": "2025-09-16T09:43:23.555715Z",
|
| 60 |
+
"start_time": "2025-09-16T09:43:20.969958Z"
|
| 61 |
+
}
|
| 62 |
+
},
|
| 63 |
+
"outputs": [],
|
| 64 |
+
"source": [
|
| 65 |
+
"# Fix the bug by making torch available in transformers namespace\n",
|
| 66 |
+
"transformers.torch = torch\n",
|
| 67 |
+
"\n",
|
| 68 |
+
"from transformers import pipeline\n",
|
| 69 |
+
"\n",
|
| 70 |
+
"pipe = pipeline(\n",
|
| 71 |
+
" \"text-classification\",\n",
|
| 72 |
+
" model=\"j-hartmann/emotion-english-distilroberta-base\",\n",
|
| 73 |
+
" return_all_scores=True\n",
|
| 74 |
+
")\n",
|
| 75 |
+
"\n",
|
| 76 |
+
"# Test it\n",
|
| 77 |
+
"text = \"I am so happy today!\"\n",
|
| 78 |
+
"result = pipe(text)\n",
|
| 79 |
+
"print(result)\n",
|
| 80 |
+
"\n",
|
| 81 |
+
"# TODO: use top_k=None instead of the deprecated return_all_scores=True\n",
|
| 82 |
+
"# TODO: set device to mps or cuda explicitly to avoid the device warnings"
|
| 83 |
+
]
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"cell_type": "code",
|
| 87 |
+
"execution_count": null,
|
| 88 |
+
"id": "90acf250d3189ec1",
|
| 89 |
+
"metadata": {
|
| 90 |
+
"ExecuteTime": {
|
| 91 |
+
"end_time": "2025-09-16T09:43:23.912340Z",
|
| 92 |
+
"start_time": "2025-09-16T09:43:23.574192Z"
|
| 93 |
+
}
|
| 94 |
+
},
|
| 95 |
+
"outputs": [],
|
| 96 |
+
"source": [
|
| 97 |
+
"pipe(books[\"description\"][0])"
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"cell_type": "code",
|
| 102 |
+
"execution_count": null,
|
| 103 |
+
"id": "c9781bcf4224efd4",
|
| 104 |
+
"metadata": {
|
| 105 |
+
"ExecuteTime": {
|
| 106 |
+
"end_time": "2025-09-16T09:43:24.797286Z",
|
| 107 |
+
"start_time": "2025-09-16T09:43:23.944842Z"
|
| 108 |
+
}
|
| 109 |
+
},
|
| 110 |
+
"outputs": [],
|
| 111 |
+
"source": [
|
| 112 |
+
"pipe(books[\"description\"][0].split(\".\"))"
|
| 113 |
+
]
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"cell_type": "code",
|
| 117 |
+
"execution_count": null,
|
| 118 |
+
"id": "57fc949d567e3f7",
|
| 119 |
+
"metadata": {
|
| 120 |
+
"ExecuteTime": {
|
| 121 |
+
"end_time": "2025-09-16T09:43:25.167345Z",
|
| 122 |
+
"start_time": "2025-09-16T09:43:24.810715Z"
|
| 123 |
+
}
|
| 124 |
+
},
|
| 125 |
+
"outputs": [],
|
| 126 |
+
"source": [
|
| 127 |
+
"sentences = books[\"description\"][0].split(\".\")\n",
|
| 128 |
+
"predictions = pipe(sentences)"
|
| 129 |
+
]
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"cell_type": "code",
|
| 133 |
+
"execution_count": null,
|
| 134 |
+
"id": "41b5470987223a69",
|
| 135 |
+
"metadata": {
|
| 136 |
+
"ExecuteTime": {
|
| 137 |
+
"end_time": "2025-09-16T09:43:25.187522Z",
|
| 138 |
+
"start_time": "2025-09-16T09:43:25.179974Z"
|
| 139 |
+
}
|
| 140 |
+
},
|
| 141 |
+
"outputs": [],
|
| 142 |
+
"source": [
|
| 143 |
+
"sentences[0]"
|
| 144 |
+
]
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"cell_type": "code",
|
| 148 |
+
"execution_count": null,
|
| 149 |
+
"id": "81bb270a79fdd290",
|
| 150 |
+
"metadata": {
|
| 151 |
+
"ExecuteTime": {
|
| 152 |
+
"end_time": "2025-09-16T09:43:25.232413Z",
|
| 153 |
+
"start_time": "2025-09-16T09:43:25.225824Z"
|
| 154 |
+
}
|
| 155 |
+
},
|
| 156 |
+
"outputs": [],
|
| 157 |
+
"source": [
|
| 158 |
+
"predictions[0]"
|
| 159 |
+
]
|
| 160 |
+
},
|
| 161 |
+
{
|
| 162 |
+
"cell_type": "code",
|
| 163 |
+
"execution_count": null,
|
| 164 |
+
"id": "d85ba7066b85eb7d",
|
| 165 |
+
"metadata": {
|
| 166 |
+
"ExecuteTime": {
|
| 167 |
+
"end_time": "2025-09-16T09:43:25.273001Z",
|
| 168 |
+
"start_time": "2025-09-16T09:43:25.267108Z"
|
| 169 |
+
}
|
| 170 |
+
},
|
| 171 |
+
"outputs": [],
|
| 172 |
+
"source": [
|
| 173 |
+
"sentences[4]"
|
| 174 |
+
]
|
| 175 |
+
},
|
| 176 |
+
{
|
| 177 |
+
"cell_type": "code",
|
| 178 |
+
"execution_count": null,
|
| 179 |
+
"id": "8dea7d5c2077d566",
|
| 180 |
+
"metadata": {
|
| 181 |
+
"ExecuteTime": {
|
| 182 |
+
"end_time": "2025-09-16T09:43:25.306831Z",
|
| 183 |
+
"start_time": "2025-09-16T09:43:25.300457Z"
|
| 184 |
+
}
|
| 185 |
+
},
|
| 186 |
+
"outputs": [],
|
| 187 |
+
"source": [
|
| 188 |
+
"predictions[4]"
|
| 189 |
+
]
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"cell_type": "code",
|
| 193 |
+
"execution_count": null,
|
| 194 |
+
"id": "a540e26e090b9050",
|
| 195 |
+
"metadata": {
|
| 196 |
+
"ExecuteTime": {
|
| 197 |
+
"end_time": "2025-09-16T09:43:25.342124Z",
|
| 198 |
+
"start_time": "2025-09-16T09:43:25.334958Z"
|
| 199 |
+
}
|
| 200 |
+
},
|
| 201 |
+
"outputs": [],
|
| 202 |
+
"source": [
|
| 203 |
+
"sorted(predictions[0], key = lambda x: x['label'])"
|
| 204 |
+
]
|
| 205 |
+
},
|
| 206 |
+
{
|
| 207 |
+
"cell_type": "code",
|
| 208 |
+
"execution_count": null,
|
| 209 |
+
"id": "a496645a7d858dcf",
|
| 210 |
+
"metadata": {
|
| 211 |
+
"ExecuteTime": {
|
| 212 |
+
"end_time": "2025-09-16T09:43:25.369056Z",
|
| 213 |
+
"start_time": "2025-09-16T09:43:25.360888Z"
|
| 214 |
+
}
|
| 215 |
+
},
|
| 216 |
+
"outputs": [],
|
| 217 |
+
"source": [
|
| 218 |
+
"import numpy as np\n",
|
| 219 |
+
"\n",
|
| 220 |
+
"emotion_labels = [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
|
| 221 |
+
"isbn = []\n",
|
| 222 |
+
"emotion_scores = {label: [] for label in emotion_labels}\n",
|
| 223 |
+
"\n",
|
| 224 |
+
"def calculate_max_emotion_scores(predictions):\n",
|
| 225 |
+
" per_emotion_scores = {label: [] for label in emotion_labels}\n",
|
| 226 |
+
" for prediction in predictions:\n",
|
| 227 |
+
" sorted_predictions = sorted(prediction, key=lambda x: x['label'], reverse=True)\n",
|
| 228 |
+
" for index, label in enumerate(emotion_labels):\n",
|
| 229 |
+
" per_emotion_scores[label].append(sorted_predictions[index]['score'])\n",
|
| 230 |
+
" return {label: np.max(scores) for label, scores in per_emotion_scores.items()}"
|
| 231 |
+
]
|
| 232 |
+
},
|
| 233 |
+
{
|
| 234 |
+
"cell_type": "code",
|
| 235 |
+
"execution_count": null,
|
| 236 |
+
"id": "b911145893e482f3",
|
| 237 |
+
"metadata": {
|
| 238 |
+
"ExecuteTime": {
|
| 239 |
+
"end_time": "2025-09-16T09:43:27.622710Z",
|
| 240 |
+
"start_time": "2025-09-16T09:43:25.385198Z"
|
| 241 |
+
}
|
| 242 |
+
},
|
| 243 |
+
"outputs": [],
|
| 244 |
+
"source": [
|
| 245 |
+
"for i, row in books.head(10).iterrows():\n",
|
| 246 |
+
" isbn.append(str(row[\"isbn13\"]))\n",
|
| 247 |
+
"\n",
|
| 248 |
+
" sentences = str(row[\"description\"]).split(\".\")\n",
|
| 249 |
+
" predictions = pipe(sentences)\n",
|
| 250 |
+
" max_scores = calculate_max_emotion_scores(predictions)\n",
|
| 251 |
+
"\n",
|
| 252 |
+
" for label in emotion_labels:\n",
|
| 253 |
+
" # force conversion to Python float\n",
|
| 254 |
+
" emotion_scores[label].append(float(max_scores[label]))\n"
|
| 255 |
+
]
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"cell_type": "code",
|
| 259 |
+
"execution_count": null,
|
| 260 |
+
"id": "b1e1e2960a0314b3",
|
| 261 |
+
"metadata": {
|
| 262 |
+
"ExecuteTime": {
|
| 263 |
+
"end_time": "2025-09-16T09:43:27.634705Z",
|
| 264 |
+
"start_time": "2025-09-16T09:43:27.630600Z"
|
| 265 |
+
}
|
| 266 |
+
},
|
| 267 |
+
"outputs": [],
|
| 268 |
+
"source": [
|
| 269 |
+
"emotion_scores = {\n",
|
| 270 |
+
" label: [float(x) for x in scores]\n",
|
| 271 |
+
" for label, scores in emotion_scores.items()\n",
|
| 272 |
+
"}\n"
|
| 273 |
+
]
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"cell_type": "code",
|
| 277 |
+
"execution_count": null,
|
| 278 |
+
"id": "b4d6abd593a32daa",
|
| 279 |
+
"metadata": {
|
| 280 |
+
"ExecuteTime": {
|
| 281 |
+
"end_time": "2025-09-16T09:43:27.652229Z",
|
| 282 |
+
"start_time": "2025-09-16T09:43:27.644745Z"
|
| 283 |
+
}
|
| 284 |
+
},
|
| 285 |
+
"outputs": [],
|
| 286 |
+
"source": [
|
| 287 |
+
"emotion_scores"
|
| 288 |
+
]
|
| 289 |
+
},
|
| 290 |
+
{
|
| 291 |
+
"cell_type": "code",
|
| 292 |
+
"execution_count": null,
|
| 293 |
+
"id": "632f787a4b7d3eaf",
|
| 294 |
+
"metadata": {
|
| 295 |
+
"ExecuteTime": {
|
| 296 |
+
"end_time": "2025-09-16T11:10:46.250826Z",
|
| 297 |
+
"start_time": "2025-09-16T10:58:12.182208Z"
|
| 298 |
+
}
|
| 299 |
+
},
|
| 300 |
+
"outputs": [],
|
| 301 |
+
"source": [
|
| 302 |
+
"import pandas as pd\n",
|
| 303 |
+
"import numpy as np\n",
|
| 304 |
+
"from tqdm import tqdm\n",
|
| 305 |
+
"from transformers import pipeline\n",
|
| 306 |
+
"\n",
|
| 307 |
+
"# Initialize the emotion analysis pipeline\n",
|
| 308 |
+
"pipe = pipeline(\"text-classification\", model=\"j-hartmann/emotion-english-distilroberta-base\", top_k=None)\n",
|
| 309 |
+
"\n",
|
| 310 |
+
"# Load your books data\n",
|
| 311 |
+
"books = pd.read_csv(\"books_with_categories.csv\") # Replace with your actual file name\n",
|
| 312 |
+
"\n",
|
| 313 |
+
"emotion_labels = [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
|
| 314 |
+
"isbn = []\n",
|
| 315 |
+
"emotion_scores = {label: [] for label in emotion_labels}\n",
|
| 316 |
+
"\n",
|
| 317 |
+
"def calculate_max_emotion_scores(predictions):\n",
|
| 318 |
+
" \"\"\"Calculate maximum emotion scores across all sentences\"\"\"\n",
|
| 319 |
+
" per_emotion_scores = {label: [] for label in emotion_labels}\n",
|
| 320 |
+
"\n",
|
| 321 |
+
" for prediction in predictions:\n",
|
| 322 |
+
" # Create a dictionary for easy lookup by label\n",
|
| 323 |
+
" prediction_dict = {pred['label']: pred['score'] for pred in prediction}\n",
|
| 324 |
+
"\n",
|
| 325 |
+
" # Add scores for each emotion label\n",
|
| 326 |
+
" for label in emotion_labels:\n",
|
| 327 |
+
" score = prediction_dict.get(label, 0.0) # Default to 0 if label not found\n",
|
| 328 |
+
" per_emotion_scores[label].append(score)\n",
|
| 329 |
+
"\n",
|
| 330 |
+
" # Return maximum score for each emotion across all sentences\n",
|
| 331 |
+
" return {label: np.max(scores) if scores else 0.0 for label, scores in per_emotion_scores.items()}\n",
|
| 332 |
+
"\n",
|
| 333 |
+
"print(\"Processing emotions for books...\")\n",
|
| 334 |
+
"for i, row in tqdm(books.iterrows(), total=len(books)):\n",
|
| 335 |
+
" isbn.append(str(row[\"isbn13\"]))\n",
|
| 336 |
+
"\n",
|
| 337 |
+
" # Handle missing descriptions\n",
|
| 338 |
+
" description = str(row[\"description\"]) if pd.notna(row[\"description\"]) else \"\"\n",
|
| 339 |
+
"\n",
|
| 340 |
+
" if description and description != \"nan\":\n",
|
| 341 |
+
" # Split into sentences and filter out empty ones\n",
|
| 342 |
+
" sentences = [s.strip() for s in description.split(\".\") if s.strip()]\n",
|
| 343 |
+
"\n",
|
| 344 |
+
" if sentences:\n",
|
| 345 |
+
" try:\n",
|
| 346 |
+
" predictions = pipe(sentences)\n",
|
| 347 |
+
" max_scores = calculate_max_emotion_scores(predictions)\n",
|
| 348 |
+
" except Exception as e:\n",
|
| 349 |
+
" print(f\"Error processing book {row['isbn13']}: {e}\")\n",
|
| 350 |
+
" # Use default scores if processing fails\n",
|
| 351 |
+
" max_scores = {label: 0.0 for label in emotion_labels}\n",
|
| 352 |
+
" else:\n",
|
| 353 |
+
" # Empty description\n",
|
| 354 |
+
" max_scores = {label: 0.0 for label in emotion_labels}\n",
|
| 355 |
+
" else:\n",
|
| 356 |
+
" # No description available\n",
|
| 357 |
+
" max_scores = {label: 0.0 for label in emotion_labels}\n",
|
| 358 |
+
"\n",
|
| 359 |
+
" # Add scores to our lists\n",
|
| 360 |
+
" for label in emotion_labels:\n",
|
| 361 |
+
" emotion_scores[label].append(float(max_scores[label]))"
|
| 362 |
+
]
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"cell_type": "code",
|
| 366 |
+
"execution_count": null,
|
| 367 |
+
"id": "31dfb34d4f4aee9a",
|
| 368 |
+
"metadata": {
|
| 369 |
+
"ExecuteTime": {
|
| 370 |
+
"end_time": "2025-09-16T11:11:55.455696Z",
|
| 371 |
+
"start_time": "2025-09-16T11:11:55.422818Z"
|
| 372 |
+
}
|
| 373 |
+
},
|
| 374 |
+
"outputs": [],
|
| 375 |
+
"source": [
|
| 376 |
+
"# Create emotion DataFrame\n",
|
| 377 |
+
"emotion_df = pd.DataFrame(emotion_scores)\n",
|
| 378 |
+
"emotion_df['isbn13'] = isbn\n",
|
| 379 |
+
"\n",
|
| 380 |
+
"print(\"Emotion processing completed!\")\n",
|
| 381 |
+
"print(\"Sample emotion scores:\")\n",
|
| 382 |
+
"print(emotion_df.head(10))"
|
| 383 |
+
]
|
| 384 |
+
},
|
| 385 |
+
{
|
| 386 |
+
"cell_type": "code",
|
| 387 |
+
"execution_count": null,
|
| 388 |
+
"id": "8db5e8f5cee59321",
|
| 389 |
+
"metadata": {
|
| 390 |
+
"ExecuteTime": {
|
| 391 |
+
"end_time": "2025-09-16T11:12:15.342595Z",
|
| 392 |
+
"start_time": "2025-09-16T11:12:14.895173Z"
|
| 393 |
+
}
|
| 394 |
+
},
|
| 395 |
+
"outputs": [],
|
| 396 |
+
"source": [
|
| 397 |
+
"books['isbn13'] = books['isbn13'].astype(str).str.replace('.0', '', regex=False)\n",
|
| 398 |
+
"emotion_df['isbn13'] = emotion_df['isbn13'].astype(str).str.replace('.0', '', regex=False)\n",
|
| 399 |
+
"\n",
|
| 400 |
+
"print(\"Data types before merge:\")\n",
|
| 401 |
+
"print(f\"Books isbn13 dtype: {books['isbn13'].dtype}\")\n",
|
| 402 |
+
"print(f\"Emotion isbn13 dtype: {emotion_df['isbn13'].dtype}\")\n",
|
| 403 |
+
"\n",
|
| 404 |
+
"# Merge emotion scores back to the original books DataFrame\n",
|
| 405 |
+
"books_with_emotions = books.merge(emotion_df, on='isbn13', how='left')\n",
|
| 406 |
+
"\n",
|
| 407 |
+
"# Save the combined DataFrame\n",
|
| 408 |
+
"books_with_emotions.to_csv(\"books_with_emotions.csv\", index=False)\n",
|
| 409 |
+
"\n",
|
| 410 |
+
"print(f\"Saved books with emotions to 'books_with_emotions.csv'\")\n",
|
| 411 |
+
"print(f\"Total books processed: {len(books_with_emotions)}\")\n",
|
| 412 |
+
"print(\"Available columns:\", books_with_emotions.columns.tolist())"
|
| 413 |
+
]
|
| 414 |
+
},
|
| 415 |
+
{
|
| 416 |
+
"cell_type": "code",
|
| 417 |
+
"execution_count": null,
|
| 418 |
+
"id": "e1cc83da7893e926",
|
| 419 |
+
"metadata": {},
|
| 420 |
+
"outputs": [],
|
| 421 |
+
"source": []
|
| 422 |
+
}
|
| 423 |
+
],
|
| 424 |
+
"metadata": {
|
| 425 |
+
"kernelspec": {
|
| 426 |
+
"display_name": "Python 3",
|
| 427 |
+
"language": "python",
|
| 428 |
+
"name": "python3"
|
| 429 |
+
},
|
| 430 |
+
"language_info": {
|
| 431 |
+
"codemirror_mode": {
|
| 432 |
+
"name": "ipython",
|
| 433 |
+
"version": 2
|
| 434 |
+
},
|
| 435 |
+
"file_extension": ".py",
|
| 436 |
+
"mimetype": "text/x-python",
|
| 437 |
+
"name": "python",
|
| 438 |
+
"nbconvert_exporter": "python",
|
| 439 |
+
"pygments_lexer": "ipython2",
|
| 440 |
+
"version": "2.7.6"
|
| 441 |
+
}
|
| 442 |
+
},
|
| 443 |
+
"nbformat": 4,
|
| 444 |
+
"nbformat_minor": 5
|
| 445 |
+
}
|
tagged_description.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
text-classification.ipynb
ADDED
|
@@ -0,0 +1,591 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "initial_id",
|
| 7 |
+
"metadata": {
|
| 8 |
+
"ExecuteTime": {
|
| 9 |
+
"end_time": "2025-09-16T03:30:06.906158Z",
|
| 10 |
+
"start_time": "2025-09-16T03:30:06.897210Z"
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"outputs": [],
|
| 14 |
+
"source": [
|
| 15 |
+
"import numpy as np\n",
|
| 16 |
+
"\n",
|
| 17 |
+
"category_mapping = {'Fiction' : 'Fiction',\n",
|
| 18 |
+
" 'Juvenile Fiction' : \"Children's Fiction\",\n",
|
| 19 |
+
" 'Biography & Autobiography' : 'Nonfiction',\n",
|
| 20 |
+
" 'History' : 'Nonfiction',\n",
|
| 21 |
+
" 'Literary Criticism' : 'Nonfiction',\n",
|
| 22 |
+
" 'Philosophy' : 'Nonfiction',\n",
|
| 23 |
+
" 'Religion' : 'Nonfiction',\n",
|
| 24 |
+
" 'Comics & Graphic Novels' : 'Fiction',\n",
|
| 25 |
+
" 'Juvenile Nonfiction' : \"Children's Nonfiction\",\n",
|
| 26 |
+
" 'Science' : 'Nonfiction',\n",
|
| 27 |
+
" 'Poetry' : 'Fiction',\n",
|
| 28 |
+
" }"
|
| 29 |
+
]
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"cell_type": "code",
|
| 33 |
+
"execution_count": null,
|
| 34 |
+
"id": "abd407fcfb12529f",
|
| 35 |
+
"metadata": {
|
| 36 |
+
"ExecuteTime": {
|
| 37 |
+
"end_time": "2025-09-15T09:11:18.779297Z",
|
| 38 |
+
"start_time": "2025-09-15T09:11:18.685368Z"
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"outputs": [],
|
| 42 |
+
"source": [
|
| 43 |
+
"import pandas as pd\n",
|
| 44 |
+
"books = pd.read_csv(\"books_cleaned.csv\")"
|
| 45 |
+
]
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"cell_type": "code",
|
| 49 |
+
"execution_count": null,
|
| 50 |
+
"id": "8730b04764af7caa",
|
| 51 |
+
"metadata": {
|
| 52 |
+
"ExecuteTime": {
|
| 53 |
+
"end_time": "2025-09-15T09:12:06.202207Z",
|
| 54 |
+
"start_time": "2025-09-15T09:12:06.190052Z"
|
| 55 |
+
}
|
| 56 |
+
},
|
| 57 |
+
"outputs": [],
|
| 58 |
+
"source": [
|
| 59 |
+
"books['simple_categories'] = books['categories'].map(category_mapping)"
|
| 60 |
+
]
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"cell_type": "code",
|
| 64 |
+
"execution_count": null,
|
| 65 |
+
"id": "17b0fe2cfe81778b",
|
| 66 |
+
"metadata": {
|
| 67 |
+
"ExecuteTime": {
|
| 68 |
+
"end_time": "2025-09-15T09:13:56.419141Z",
|
| 69 |
+
"start_time": "2025-09-15T09:13:56.325655Z"
|
| 70 |
+
}
|
| 71 |
+
},
|
| 72 |
+
"outputs": [],
|
| 73 |
+
"source": [
|
| 74 |
+
"books"
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"cell_type": "code",
|
| 79 |
+
"execution_count": null,
|
| 80 |
+
"id": "410d16934dfe2383",
|
| 81 |
+
"metadata": {
|
| 82 |
+
"ExecuteTime": {
|
| 83 |
+
"end_time": "2025-09-15T09:39:48.441516Z",
|
| 84 |
+
"start_time": "2025-09-15T09:39:48.396466Z"
|
| 85 |
+
}
|
| 86 |
+
},
|
| 87 |
+
"outputs": [],
|
| 88 |
+
"source": [
|
| 89 |
+
"books[~(books['simple_categories'].isna())]"
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"cell_type": "code",
|
| 94 |
+
"execution_count": null,
|
| 95 |
+
"id": "a0d8dcd913296e3d",
|
| 96 |
+
"metadata": {
|
| 97 |
+
"ExecuteTime": {
|
| 98 |
+
"end_time": "2025-09-15T10:23:22.076926Z",
|
| 99 |
+
"start_time": "2025-09-15T10:20:53.043882Z"
|
| 100 |
+
}
|
| 101 |
+
},
|
| 102 |
+
"outputs": [],
|
| 103 |
+
"source": [
|
| 104 |
+
"!pip install hf_xet\n",
|
| 105 |
+
"from transformers import pipeline\n",
|
| 106 |
+
"\n",
|
| 107 |
+
"fiction_categories = ['Fiction', 'Nonfiction']\n",
|
| 108 |
+
"pipe = pipeline(\"zero-shot-classification\",model=\"facebook/bart-large-mnli\", device=\"cuda\")\n"
|
| 109 |
+
]
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
"cell_type": "code",
|
| 113 |
+
"execution_count": null,
|
| 114 |
+
"id": "cd9edaa3ee8c1243",
|
| 115 |
+
"metadata": {
|
| 116 |
+
"ExecuteTime": {
|
| 117 |
+
"end_time": "2025-09-15T10:23:46.232544Z",
|
| 118 |
+
"start_time": "2025-09-15T10:23:43.525543Z"
|
| 119 |
+
}
|
| 120 |
+
},
|
| 121 |
+
"outputs": [],
|
| 122 |
+
"source": [
|
| 123 |
+
"!pip install --upgrade huggingface_hub\n"
|
| 124 |
+
]
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"cell_type": "code",
|
| 128 |
+
"execution_count": null,
|
| 129 |
+
"id": "83b78716648ebbe6",
|
| 130 |
+
"metadata": {
|
| 131 |
+
"ExecuteTime": {
|
| 132 |
+
"end_time": "2025-09-15T10:23:55.154934Z",
|
| 133 |
+
"start_time": "2025-09-15T10:23:53.226725Z"
|
| 134 |
+
}
|
| 135 |
+
},
|
| 136 |
+
"outputs": [],
|
| 137 |
+
"source": [
|
| 138 |
+
"!pip install \"huggingface_hub[hf_xet]\"\n"
|
| 139 |
+
]
|
| 140 |
+
},
|
| 141 |
+
{
|
| 142 |
+
"cell_type": "code",
|
| 143 |
+
"execution_count": null,
|
| 144 |
+
"id": "8d02bd90c594fbac",
|
| 145 |
+
"metadata": {
|
| 146 |
+
"ExecuteTime": {
|
| 147 |
+
"end_time": "2025-09-15T10:24:14.628937Z",
|
| 148 |
+
"start_time": "2025-09-15T10:24:12.758899Z"
|
| 149 |
+
}
|
| 150 |
+
},
|
| 151 |
+
"outputs": [],
|
| 152 |
+
"source": [
|
| 153 |
+
"!pip show huggingface_hub\n",
|
| 154 |
+
"!pip show hf_xet\n"
|
| 155 |
+
]
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"cell_type": "code",
|
| 159 |
+
"execution_count": null,
|
| 160 |
+
"id": "83e5151bdc46709a",
|
| 161 |
+
"metadata": {
|
| 162 |
+
"ExecuteTime": {
|
| 163 |
+
"end_time": "2025-09-16T04:10:04.964668Z",
|
| 164 |
+
"start_time": "2025-09-16T04:10:01.587200Z"
|
| 165 |
+
}
|
| 166 |
+
},
|
| 167 |
+
"outputs": [],
|
| 168 |
+
"source": [
|
| 169 |
+
"from transformers import pipeline\n",
|
| 170 |
+
"import torch\n",
|
| 171 |
+
"import os\n",
|
| 172 |
+
"\n",
|
| 173 |
+
"print(\"Loading model... (this may take a few minutes on first run)\")\n",
|
| 174 |
+
"print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
|
| 175 |
+
"if torch.cuda.is_available():\n",
|
| 176 |
+
" print(f\"GPU device: {torch.cuda.get_device_name(0)}\")\n",
|
| 177 |
+
"\n",
|
| 178 |
+
"# CRITICAL: Add GPU support and optimization parameters\n",
|
| 179 |
+
"try:\n",
|
| 180 |
+
" os.environ[\"HF_HUB_DOWNLOAD_TIMEOUT\"] = \"120\"\n",
|
| 181 |
+
"\n",
|
| 182 |
+
" pipe = pipeline(\n",
|
| 183 |
+
" \"zero-shot-classification\",\n",
|
| 184 |
+
" model=\"facebook/bart-large-mnli\",\n",
|
| 185 |
+
" device=0 if torch.cuda.is_available() else -1, # Use GPU if available\n",
|
| 186 |
+
" batch_size=64, # Internal pipeline batch size\n",
|
| 187 |
+
" max_length=512, # Truncate long texts\n",
|
| 188 |
+
" truncation=True,\n",
|
| 189 |
+
" use_auth_token=False,\n",
|
| 190 |
+
" revision=\"main\"\n",
|
| 191 |
+
" )\n",
|
| 192 |
+
"\n",
|
| 193 |
+
" print(\"✅ Model loaded successfully with GPU acceleration!\" if torch.cuda.is_available() else \"✅ Model loaded (CPU mode)\")\n",
|
| 194 |
+
"\n",
|
| 195 |
+
"except Exception as e:\n",
|
| 196 |
+
" print(f\"Error with facebook/bart-large-mnli: {e}\")\n",
|
| 197 |
+
" print(\"\\n🔄 Trying alternative model...\")\n",
|
| 198 |
+
"\n",
|
| 199 |
+
" try:\n",
|
| 200 |
+
" pipe = pipeline(\n",
|
| 201 |
+
" \"zero-shot-classification\",\n",
|
| 202 |
+
" model=\"typeform/distilbert-base-uncased-mnli\",\n",
|
| 203 |
+
" device=0 if torch.cuda.is_available() else -1, # GPU support\n",
|
| 204 |
+
" batch_size=64,\n",
|
| 205 |
+
" max_length=512,\n",
|
| 206 |
+
" truncation=True\n",
|
| 207 |
+
" )\n",
|
| 208 |
+
"\n",
|
| 209 |
+
" print(\"✅ Alternative model loaded successfully!\")\n",
|
| 210 |
+
"\n",
|
| 211 |
+
" except Exception as e2:\n",
|
| 212 |
+
" print(f\"❌ Error with alternative model: {e2}\")\n",
|
| 213 |
+
" print(\"Please check your internet connection and try again.\")\n"
|
| 214 |
+
]
|
| 215 |
+
},
|
| 216 |
+
{
|
| 217 |
+
"cell_type": "code",
|
| 218 |
+
"execution_count": null,
|
| 219 |
+
"id": "80bc187fbfff3e10",
|
| 220 |
+
"metadata": {
|
| 221 |
+
"ExecuteTime": {
|
| 222 |
+
"end_time": "2025-09-16T04:15:12.296956Z",
|
| 223 |
+
"start_time": "2025-09-16T04:15:12.116659Z"
|
| 224 |
+
}
|
| 225 |
+
},
|
| 226 |
+
"outputs": [],
|
| 227 |
+
"source": [
|
| 228 |
+
"sequence = books.loc[books[\"simple_categories\"] == 'Fiction', 'description'].reset_index(drop=True)[0]"
|
| 229 |
+
]
|
| 230 |
+
},
|
| 231 |
+
{
|
| 232 |
+
"cell_type": "code",
|
| 233 |
+
"execution_count": null,
|
| 234 |
+
"id": "8ba6836b2c958329",
|
| 235 |
+
"metadata": {
|
| 236 |
+
"ExecuteTime": {
|
| 237 |
+
"end_time": "2025-09-16T04:15:21.478795Z",
|
| 238 |
+
"start_time": "2025-09-16T04:15:14.044833Z"
|
| 239 |
+
}
|
| 240 |
+
},
|
| 241 |
+
"outputs": [],
|
| 242 |
+
"source": [
|
| 243 |
+
"pipe(sequence, fiction_categories)"
|
| 244 |
+
]
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"cell_type": "code",
|
| 248 |
+
"execution_count": null,
|
| 249 |
+
"id": "23f2c1d7a1c73945",
|
| 250 |
+
"metadata": {
|
| 251 |
+
"ExecuteTime": {
|
| 252 |
+
"end_time": "2025-09-16T03:30:19.621730Z",
|
| 253 |
+
"start_time": "2025-09-16T03:30:12.489364Z"
|
| 254 |
+
}
|
| 255 |
+
},
|
| 256 |
+
"outputs": [],
|
| 257 |
+
"source": [
|
| 258 |
+
"max_index = np.argmax(pipe(sequence, fiction_categories)[\"scores\"])\n",
|
| 259 |
+
"max_label = pipe(sequence, fiction_categories)[\"labels\"][max_index]\n",
|
| 260 |
+
"max_label"
|
| 261 |
+
]
|
| 262 |
+
},
|
| 263 |
+
{
|
| 264 |
+
"cell_type": "code",
|
| 265 |
+
"execution_count": null,
|
| 266 |
+
"id": "eb1273971a44738c",
|
| 267 |
+
"metadata": {
|
| 268 |
+
"ExecuteTime": {
|
| 269 |
+
"end_time": "2025-09-16T04:15:21.672845Z",
|
| 270 |
+
"start_time": "2025-09-16T04:15:21.660563Z"
|
| 271 |
+
}
|
| 272 |
+
},
|
| 273 |
+
"outputs": [],
|
| 274 |
+
"source": [
|
| 275 |
+
"from tqdm import tqdm\n",
|
| 276 |
+
"import pandas as pd\n",
|
| 277 |
+
"import time\n",
|
| 278 |
+
"from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor\n",
|
| 279 |
+
"import multiprocessing as mp\n",
|
| 280 |
+
"\n",
|
| 281 |
+
"# SOLUTION 1: Batch Processing (Most Important!)\n",
|
| 282 |
+
"def generate_predictions(sequences, categories, batch_size=32):\n",
|
| 283 |
+
" \"\"\"Process multiple sequences at once - much faster!\"\"\"\n",
|
| 284 |
+
" predictions = []\n",
|
| 285 |
+
"\n",
|
| 286 |
+
" for i in tqdm(range(0, len(sequences), batch_size), desc=\"Processing batches\"):\n",
|
| 287 |
+
" batch = sequences[i:i+batch_size]\n",
|
| 288 |
+
"\n",
|
| 289 |
+
" # Process entire batch at once\n",
|
| 290 |
+
" batch_results = pipe(batch, categories)\n",
|
| 291 |
+
"\n",
|
| 292 |
+
" # Handle both single result and list of results\n",
|
| 293 |
+
" if isinstance(batch_results, list):\n",
|
| 294 |
+
" predictions.extend([result['labels'][0] for result in batch_results])\n",
|
| 295 |
+
" else:\n",
|
| 296 |
+
" predictions.append(batch_results['labels'][0])\n",
|
| 297 |
+
"\n",
|
| 298 |
+
" return predictions"
|
| 299 |
+
]
|
| 300 |
+
},
|
| 301 |
+
{
|
| 302 |
+
"cell_type": "code",
|
| 303 |
+
"execution_count": null,
|
| 304 |
+
"id": "7d024a18309a521d",
|
| 305 |
+
"metadata": {
|
| 306 |
+
"ExecuteTime": {
|
| 307 |
+
"end_time": "2025-09-16T04:21:02.847544Z",
|
| 308 |
+
"start_time": "2025-09-16T04:15:23.714181Z"
|
| 309 |
+
}
|
| 310 |
+
},
|
| 311 |
+
"outputs": [],
|
| 312 |
+
"source": [
|
| 313 |
+
"# Get 300 nonfiction descriptions\n",
|
| 314 |
+
"nonfiction_books = books.loc[books[\"simple_categories\"] == 'Nonfiction', 'description'].reset_index(drop=True)[:300]\n",
|
| 315 |
+
"\n",
|
| 316 |
+
"# Truncate for speed\n",
|
| 317 |
+
"sequences = [desc[:400] for desc in nonfiction_books]\n",
|
| 318 |
+
"\n",
|
| 319 |
+
"# Process in batches of 20 (instead of 300 individual calls)\n",
|
| 320 |
+
"batch_size = 20\n",
|
| 321 |
+
"for i in tqdm(range(0, len(sequences), batch_size)):\n",
|
| 322 |
+
" batch = sequences[i:i+batch_size]\n",
|
| 323 |
+
"\n",
|
| 324 |
+
" # One model call for 20 books instead of 20 separate calls\n",
|
| 325 |
+
" results = pipe(batch, fiction_categories)\n",
|
| 326 |
+
"\n",
|
| 327 |
+
" # Extract predictions\n",
|
| 328 |
+
" if isinstance(results, list):\n",
|
| 329 |
+
" preddicted_cats += [r['labels'][0] for r in results]\n",
|
| 330 |
+
" else:\n",
|
| 331 |
+
" preddicted_cats += [results['labels'][0]]\n",
|
| 332 |
+
"\n",
|
| 333 |
+
" actual_cats += ['Nonfiction'] * len(batch)"
|
| 334 |
+
]
|
| 335 |
+
},
|
| 336 |
+
{
|
| 337 |
+
"cell_type": "code",
|
| 338 |
+
"execution_count": null,
|
| 339 |
+
"id": "fdc40689dfadf1",
|
| 340 |
+
"metadata": {
|
| 341 |
+
"ExecuteTime": {
|
| 342 |
+
"end_time": "2025-09-16T04:21:08.483550Z",
|
| 343 |
+
"start_time": "2025-09-16T04:21:08.405904Z"
|
| 344 |
+
}
|
| 345 |
+
},
|
| 346 |
+
"outputs": [],
|
| 347 |
+
"source": [
|
| 348 |
+
"predicted_df = pd.DataFrame({\"actual_categories\": actual_cats, \"predicted_categories\": preddicted_cats})"
|
| 349 |
+
]
|
| 350 |
+
},
|
| 351 |
+
{
|
| 352 |
+
"cell_type": "code",
|
| 353 |
+
"execution_count": null,
|
| 354 |
+
"id": "ed0907a9093b94d0",
|
| 355 |
+
"metadata": {
|
| 356 |
+
"ExecuteTime": {
|
| 357 |
+
"end_time": "2025-09-16T04:21:16.539324Z",
|
| 358 |
+
"start_time": "2025-09-16T04:21:16.384515Z"
|
| 359 |
+
}
|
| 360 |
+
},
|
| 361 |
+
"outputs": [],
|
| 362 |
+
"source": [
|
| 363 |
+
"predicted_df.head()"
|
| 364 |
+
]
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"cell_type": "code",
|
| 368 |
+
"execution_count": null,
|
| 369 |
+
"id": "87d924edea28b476",
|
| 370 |
+
"metadata": {
|
| 371 |
+
"ExecuteTime": {
|
| 372 |
+
"end_time": "2025-09-16T04:21:19.825460Z",
|
| 373 |
+
"start_time": "2025-09-16T04:21:19.795117Z"
|
| 374 |
+
}
|
| 375 |
+
},
|
| 376 |
+
"outputs": [],
|
| 377 |
+
"source": [
|
| 378 |
+
"predicted_df['correct_prediction'] = (np.where(predicted_df['actual_categories'] == predicted_df['predicted_categories'], 1, 0)\n",
|
| 379 |
+
" )"
|
| 380 |
+
]
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"cell_type": "code",
|
| 384 |
+
"execution_count": null,
|
| 385 |
+
"id": "6c25043f2e0d694a",
|
| 386 |
+
"metadata": {
|
| 387 |
+
"ExecuteTime": {
|
| 388 |
+
"end_time": "2025-09-16T04:21:22.040362Z",
|
| 389 |
+
"start_time": "2025-09-16T04:21:22.019264Z"
|
| 390 |
+
}
|
| 391 |
+
},
|
| 392 |
+
"outputs": [],
|
| 393 |
+
"source": [
|
| 394 |
+
"predicted_df['correct_prediction'].sum()/len(predicted_df)"
|
| 395 |
+
]
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"cell_type": "code",
|
| 399 |
+
"execution_count": null,
|
| 400 |
+
"id": "3c3611fc62b1d8df",
|
| 401 |
+
"metadata": {
|
| 402 |
+
"ExecuteTime": {
|
| 403 |
+
"end_time": "2025-09-16T04:21:24.159383Z",
|
| 404 |
+
"start_time": "2025-09-16T04:21:24.001792Z"
|
| 405 |
+
}
|
| 406 |
+
},
|
| 407 |
+
"outputs": [],
|
| 408 |
+
"source": [
|
| 409 |
+
"isbns = []\n",
|
| 410 |
+
"predicted_cats = []\n",
|
| 411 |
+
"\n",
|
| 412 |
+
"missing_cats = books.loc[books['simple_categories'].isna(), ['isbn13', 'description']].reset_index(drop=True)"
|
| 413 |
+
]
|
| 414 |
+
},
|
| 415 |
+
{
|
| 416 |
+
"cell_type": "code",
|
| 417 |
+
"execution_count": null,
|
| 418 |
+
"id": "5a6ee7c312cc4605",
|
| 419 |
+
"metadata": {
|
| 420 |
+
"ExecuteTime": {
|
| 421 |
+
"end_time": "2025-09-16T04:48:29.368260Z",
|
| 422 |
+
"start_time": "2025-09-16T04:47:55.181816Z"
|
| 423 |
+
}
|
| 424 |
+
},
|
| 425 |
+
"outputs": [],
|
| 426 |
+
"source": [
|
| 427 |
+
"# Your current code (already run - don't re-run!)\n",
|
| 428 |
+
"sequences = [str(desc)[:200] if pd.notna(desc) else \"\" for desc in missing_cats[\"description\"]]\n",
|
| 429 |
+
"sequences = [seq for seq in sequences if seq.strip()] # This changed the length!\n",
|
| 430 |
+
"isbns = missing_cats[\"isbn13\"].tolist()\n",
|
| 431 |
+
"predicted_cats = generate_predictions(sequences, fiction_categories, batch_size=128)\n",
|
| 432 |
+
"\n",
|
| 433 |
+
"\n"
|
| 434 |
+
]
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"cell_type": "code",
|
| 438 |
+
"execution_count": null,
|
| 439 |
+
"id": "4561a0670452fa3b",
|
| 440 |
+
"metadata": {
|
| 441 |
+
"ExecuteTime": {
|
| 442 |
+
"end_time": "2025-09-16T04:51:30.775050Z",
|
| 443 |
+
"start_time": "2025-09-16T04:51:30.573483Z"
|
| 444 |
+
}
|
| 445 |
+
},
|
| 446 |
+
"outputs": [],
|
| 447 |
+
"source": [
|
| 448 |
+
"# FIX: Get the correct ISBNs that match your filtered sequences\n",
|
| 449 |
+
"descriptions = missing_cats[\"description\"].tolist()\n",
|
| 450 |
+
"isbns_full = missing_cats[\"isbn13\"].tolist()\n",
|
| 451 |
+
"\n",
|
| 452 |
+
"matching_isbns = []\n",
|
| 453 |
+
"for i, desc in enumerate(descriptions):\n",
|
| 454 |
+
" processed_desc = str(desc)[:200] if pd.notna(desc) else \"\"\n",
|
| 455 |
+
" if processed_desc.strip(): # Same condition as your filter\n",
|
| 456 |
+
" matching_isbns.append(isbns_full[i])\n",
|
| 457 |
+
"\n",
|
| 458 |
+
"# Now create DataFrame with matching lengths\n",
|
| 459 |
+
"missing_predicted_df = pd.DataFrame({\n",
|
| 460 |
+
" \"isbn13\": matching_isbns[:len(predicted_cats)], # Safety check\n",
|
| 461 |
+
" \"predicted_categories\": predicted_cats\n",
|
| 462 |
+
"})\n",
|
| 463 |
+
"\n",
|
| 464 |
+
"print(f\"✅ DataFrame created successfully with {len(missing_predicted_df)} rows\")\n",
|
| 465 |
+
"print(f\"📊 Predictions by category:\")\n",
|
| 466 |
+
"print(missing_predicted_df['predicted_categories'].value_counts())\n",
|
| 467 |
+
"\n",
|
| 468 |
+
"# Save results\n",
|
| 469 |
+
"missing_predicted_df.to_csv('missing_categories_predictions.csv', index=False)\n",
|
| 470 |
+
"print(\"💾 Results saved to missing_categories_predictions.csv\")"
|
| 471 |
+
]
|
| 472 |
+
},
|
| 473 |
+
{
|
| 474 |
+
"cell_type": "code",
|
| 475 |
+
"execution_count": null,
|
| 476 |
+
"id": "72fe9a8b4b28a1c6",
|
| 477 |
+
"metadata": {
|
| 478 |
+
"ExecuteTime": {
|
| 479 |
+
"end_time": "2025-09-16T04:52:15.607087Z",
|
| 480 |
+
"start_time": "2025-09-16T04:52:15.520116Z"
|
| 481 |
+
}
|
| 482 |
+
},
|
| 483 |
+
"outputs": [],
|
| 484 |
+
"source": [
|
| 485 |
+
"missing_predicted_df.head()"
|
| 486 |
+
]
|
| 487 |
+
},
|
| 488 |
+
{
|
| 489 |
+
"cell_type": "code",
|
| 490 |
+
"execution_count": null,
|
| 491 |
+
"id": "8b1f7af8aebf289e",
|
| 492 |
+
"metadata": {
|
| 493 |
+
"ExecuteTime": {
|
| 494 |
+
"end_time": "2025-09-16T05:00:37.137998Z",
|
| 495 |
+
"start_time": "2025-09-16T05:00:36.885979Z"
|
| 496 |
+
}
|
| 497 |
+
},
|
| 498 |
+
"outputs": [],
|
| 499 |
+
"source": [
|
| 500 |
+
"books = pd.merge(books, missing_predicted_df, on=\"isbn13\", how=\"left\")\n",
|
| 501 |
+
"books[\"simple_categories\"] = np.where(books[\"simple_categories\"].isna(), books[\"predicted_categories\"], books[\"simple_categories\"])\n",
|
| 502 |
+
"books = books.drop(columns=\"predicted_categories\")"
|
| 503 |
+
]
|
| 504 |
+
},
|
| 505 |
+
{
|
| 506 |
+
"cell_type": "code",
|
| 507 |
+
"execution_count": null,
|
| 508 |
+
"id": "fe5b161193dab1f",
|
| 509 |
+
"metadata": {
|
| 510 |
+
"ExecuteTime": {
|
| 511 |
+
"end_time": "2025-09-16T05:00:50.989276Z",
|
| 512 |
+
"start_time": "2025-09-16T05:00:50.952202Z"
|
| 513 |
+
}
|
| 514 |
+
},
|
| 515 |
+
"outputs": [],
|
| 516 |
+
"source": [
|
| 517 |
+
"books"
|
| 518 |
+
]
|
| 519 |
+
},
|
| 520 |
+
{
|
| 521 |
+
"cell_type": "code",
|
| 522 |
+
"execution_count": null,
|
| 523 |
+
"id": "9d2e1a8dbbd5d6bc",
|
| 524 |
+
"metadata": {
|
| 525 |
+
"ExecuteTime": {
|
| 526 |
+
"end_time": "2025-09-16T05:01:27.850818Z",
|
| 527 |
+
"start_time": "2025-09-16T05:01:27.781563Z"
|
| 528 |
+
}
|
| 529 |
+
},
|
| 530 |
+
"outputs": [],
|
| 531 |
+
"source": [
|
| 532 |
+
"books[books[\"categories\"].str.lower().isin([\n",
|
| 533 |
+
" \"romance\",\n",
|
| 534 |
+
" \"science fiction\",\n",
|
| 535 |
+
" \"scifi\",\n",
|
| 536 |
+
" \"fantasy\",\n",
|
| 537 |
+
" \"horror\",\n",
|
| 538 |
+
" \"mystery\",\n",
|
| 539 |
+
" \"thriller\",\n",
|
| 540 |
+
" \"comedy\",\n",
|
| 541 |
+
" \"crime\",\n",
|
| 542 |
+
" \"historical\"\n",
|
| 543 |
+
"])]"
|
| 544 |
+
]
|
| 545 |
+
},
|
| 546 |
+
{
|
| 547 |
+
"cell_type": "code",
|
| 548 |
+
"execution_count": null,
|
| 549 |
+
"id": "bd067ee0696cac0b",
|
| 550 |
+
"metadata": {
|
| 551 |
+
"ExecuteTime": {
|
| 552 |
+
"end_time": "2025-09-16T05:04:09.432347Z",
|
| 553 |
+
"start_time": "2025-09-16T05:04:09.246658Z"
|
| 554 |
+
}
|
| 555 |
+
},
|
| 556 |
+
"outputs": [],
|
| 557 |
+
"source": [
|
| 558 |
+
"books.to_csv(\"books_with_categories.csv\", index=False)"
|
| 559 |
+
]
|
| 560 |
+
},
|
| 561 |
+
{
|
| 562 |
+
"cell_type": "code",
|
| 563 |
+
"execution_count": null,
|
| 564 |
+
"id": "f8879607442c3f0f",
|
| 565 |
+
"metadata": {},
|
| 566 |
+
"outputs": [],
|
| 567 |
+
"source": []
|
| 568 |
+
}
|
| 569 |
+
],
|
| 570 |
+
"metadata": {
|
| 571 |
+
"kernelspec": {
|
| 572 |
+
"display_name": "Python 3",
|
| 573 |
+
"language": "python",
|
| 574 |
+
"name": "python3"
|
| 575 |
+
},
|
| 576 |
+
"language_info": {
|
| 577 |
+
"codemirror_mode": {
|
| 578 |
+
"name": "ipython",
|
| 579 |
+
"version": 2
|
| 580 |
+
},
|
| 581 |
+
"file_extension": ".py",
|
| 582 |
+
"mimetype": "text/x-python",
|
| 583 |
+
"name": "python",
|
| 584 |
+
"nbconvert_exporter": "python",
|
| 585 |
+
"pygments_lexer": "ipython2",
|
| 586 |
+
"version": "2.7.6"
|
| 587 |
+
}
|
| 588 |
+
},
|
| 589 |
+
"nbformat": 4,
|
| 590 |
+
"nbformat_minor": 5
|
| 591 |
+
}
|
vector-search.ipynb
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"id": "initial_id",
|
| 7 |
+
"metadata": {
|
| 8 |
+
"ExecuteTime": {
|
| 9 |
+
"end_time": "2025-09-15T07:35:25.488414Z",
|
| 10 |
+
"start_time": "2025-09-15T07:35:25.460656Z"
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"outputs": [],
|
| 14 |
+
"source": [
|
| 15 |
+
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
| 16 |
+
"\n",
|
| 17 |
+
"from langchain_text_splitters import CharacterTextSplitter\n",
|
| 18 |
+
"from langchain_openai import OpenAIEmbeddings\n",
|
| 19 |
+
"from langchain_chroma import Chroma\n",
|
| 20 |
+
"\n"
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"cell_type": "code",
|
| 25 |
+
"execution_count": null,
|
| 26 |
+
"id": "9e2d7510161fceb6",
|
| 27 |
+
"metadata": {
|
| 28 |
+
"ExecuteTime": {
|
| 29 |
+
"end_time": "2025-09-15T07:35:27.755330Z",
|
| 30 |
+
"start_time": "2025-09-15T07:35:27.736857Z"
|
| 31 |
+
}
|
| 32 |
+
},
|
| 33 |
+
"outputs": [],
|
| 34 |
+
"source": [
|
| 35 |
+
"from dotenv import load_dotenv\n",
|
| 36 |
+
"from dotenv import load_dotenv\n",
|
| 37 |
+
"import os\n",
|
| 38 |
+
"\n",
|
| 39 |
+
"# Load environment variables\n",
|
| 40 |
+
"load_dotenv()\n"
|
| 41 |
+
]
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"cell_type": "code",
|
| 45 |
+
"execution_count": null,
|
| 46 |
+
"id": "b1c5ca1012315fd2",
|
| 47 |
+
"metadata": {
|
| 48 |
+
"ExecuteTime": {
|
| 49 |
+
"end_time": "2025-09-15T07:35:30.169857Z",
|
| 50 |
+
"start_time": "2025-09-15T07:35:30.074451Z"
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"outputs": [],
|
| 54 |
+
"source": [
|
| 55 |
+
"import pandas as pd\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"books = pd.read_csv(\"books_cleaned.csv\")"
|
| 58 |
+
]
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"cell_type": "code",
|
| 62 |
+
"execution_count": null,
|
| 63 |
+
"id": "694a28505e311eea",
|
| 64 |
+
"metadata": {
|
| 65 |
+
"ExecuteTime": {
|
| 66 |
+
"end_time": "2025-09-15T07:35:32.747269Z",
|
| 67 |
+
"start_time": "2025-09-15T07:35:32.725973Z"
|
| 68 |
+
}
|
| 69 |
+
},
|
| 70 |
+
"outputs": [],
|
| 71 |
+
"source": [
|
| 72 |
+
"books"
|
| 73 |
+
]
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"cell_type": "code",
|
| 77 |
+
"execution_count": null,
|
| 78 |
+
"id": "eb17356cf0ecbbef",
|
| 79 |
+
"metadata": {
|
| 80 |
+
"ExecuteTime": {
|
| 81 |
+
"end_time": "2025-09-15T07:35:35.292093Z",
|
| 82 |
+
"start_time": "2025-09-15T07:35:35.243618Z"
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"outputs": [],
|
| 86 |
+
"source": [
|
| 87 |
+
"books[\"tagged_description\"].to_csv(\"tagged_description.txt\",\n",
|
| 88 |
+
" index=False,\n",
|
| 89 |
+
" header=False)\n"
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"cell_type": "code",
|
| 94 |
+
"execution_count": null,
|
| 95 |
+
"id": "2db289c35716805c",
|
| 96 |
+
"metadata": {
|
| 97 |
+
"ExecuteTime": {
|
| 98 |
+
"end_time": "2025-09-15T07:35:42.877672Z",
|
| 99 |
+
"start_time": "2025-09-15T07:35:42.683378Z"
|
| 100 |
+
}
|
| 101 |
+
},
|
| 102 |
+
"outputs": [],
|
| 103 |
+
"source": [
|
| 104 |
+
"\n",
|
| 105 |
+
"# Load the file manually (more reliable)\n",
|
| 106 |
+
"with open(\"tagged_description.txt\", 'r', encoding='utf-8') as file:\n",
|
| 107 |
+
" content = file.read()\n",
|
| 108 |
+
"\n",
|
| 109 |
+
"# Create a document object\n",
|
| 110 |
+
"raw_documents = [Document(page_content=content)]\n",
|
| 111 |
+
"\n",
|
| 112 |
+
"# Split into chunks\n",
|
| 113 |
+
"text_splitter = CharacterTextSplitter(\n",
|
| 114 |
+
" chunk_size=1500, # Increased to avoid warnings\n",
|
| 115 |
+
" chunk_overlap=150,\n",
|
| 116 |
+
" separator=\"\\n\"\n",
|
| 117 |
+
")\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"documents = text_splitter.split_documents(raw_documents)\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"print(f\"Successfully created {len(documents)} chunks\")\n",
|
| 122 |
+
"print(f\"First chunk preview:\\n{documents[0].page_content[:200]}...\")"
|
| 123 |
+
]
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"cell_type": "code",
|
| 127 |
+
"execution_count": null,
|
| 128 |
+
"id": "12d6dc1c1f518682",
|
| 129 |
+
"metadata": {
|
| 130 |
+
"ExecuteTime": {
|
| 131 |
+
"end_time": "2025-09-15T07:35:45.472985Z",
|
| 132 |
+
"start_time": "2025-09-15T07:35:45.467714Z"
|
| 133 |
+
}
|
| 134 |
+
},
|
| 135 |
+
"outputs": [],
|
| 136 |
+
"source": [
|
| 137 |
+
"documents[0]"
|
| 138 |
+
]
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"cell_type": "code",
|
| 142 |
+
"execution_count": null,
|
| 143 |
+
"id": "d73b0e5261855919",
|
| 144 |
+
"metadata": {
|
| 145 |
+
"ExecuteTime": {
|
| 146 |
+
"end_time": "2025-09-15T07:47:27.888830Z",
|
| 147 |
+
"start_time": "2025-09-15T07:36:56.075724Z"
|
| 148 |
+
}
|
| 149 |
+
},
|
| 150 |
+
"outputs": [],
|
| 151 |
+
"source": [
|
| 152 |
+
"!pip install sentence_transformers\n",
|
| 153 |
+
"embeddings = HuggingFaceEmbeddings(\n",
|
| 154 |
+
" model_name=\"all-MiniLM-L6-v2\", # Free, fast, and good quality\n",
|
| 155 |
+
" model_kwargs={'device': 'cpu'} # Use 'cuda' if you have a GPU\n",
|
| 156 |
+
")\n",
|
| 157 |
+
"\n",
|
| 158 |
+
"db_books = Chroma.from_documents(\n",
|
| 159 |
+
" documents,\n",
|
| 160 |
+
" embedding=embeddings\n",
|
| 161 |
+
")"
|
| 162 |
+
]
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"cell_type": "markdown",
|
| 166 |
+
"id": "9473a4b393977d6f",
|
| 167 |
+
"metadata": {},
|
| 168 |
+
"source": []
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"cell_type": "code",
|
| 172 |
+
"execution_count": null,
|
| 173 |
+
"id": "8c28a61479deb520",
|
| 174 |
+
"metadata": {
|
| 175 |
+
"ExecuteTime": {
|
| 176 |
+
"end_time": "2025-09-15T07:47:49.568125Z",
|
| 177 |
+
"start_time": "2025-09-15T07:47:49.337737Z"
|
| 178 |
+
}
|
| 179 |
+
},
|
| 180 |
+
"outputs": [],
|
| 181 |
+
"source": [
|
| 182 |
+
"query = \"A book to teach children about nature\"\n",
|
| 183 |
+
"docs = db_books.similarity_search(query, k = 10)\n",
|
| 184 |
+
"docs"
|
| 185 |
+
]
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"cell_type": "code",
|
| 189 |
+
"execution_count": null,
|
| 190 |
+
"id": "57cebcff1d436b6a",
|
| 191 |
+
"metadata": {
|
| 192 |
+
"ExecuteTime": {
|
| 193 |
+
"end_time": "2025-09-15T07:54:09.612026Z",
|
| 194 |
+
"start_time": "2025-09-15T07:54:09.538027Z"
|
| 195 |
+
}
|
| 196 |
+
},
|
| 197 |
+
"outputs": [],
|
| 198 |
+
"source": [
|
| 199 |
+
"# Extract and clean the ISBN\n",
|
| 200 |
+
"isbn_str = docs[0].page_content.split()[0].strip()\n",
|
| 201 |
+
"# Remove quotes and convert to float first, then int\n",
|
| 202 |
+
"isbn_clean = isbn_str.replace('\"', '').replace(\"'\", \"\")\n",
|
| 203 |
+
"isbn_int = int(float(isbn_clean)) # float first to handle .0, then int\n",
|
| 204 |
+
"\n",
|
| 205 |
+
"# Now search\n",
|
| 206 |
+
"result = books[books[\"isbn13\"] == isbn_int]"
|
| 207 |
+
]
|
| 208 |
+
},
|
| 209 |
+
{
|
| 210 |
+
"cell_type": "code",
|
| 211 |
+
"execution_count": null,
|
| 212 |
+
"id": "4155cc001df44e93",
|
| 213 |
+
"metadata": {
|
| 214 |
+
"ExecuteTime": {
|
| 215 |
+
"end_time": "2025-09-15T07:54:49.157935Z",
|
| 216 |
+
"start_time": "2025-09-15T07:54:49.088922Z"
|
| 217 |
+
}
|
| 218 |
+
},
|
| 219 |
+
"outputs": [],
|
| 220 |
+
"source": [
|
| 221 |
+
"result"
|
| 222 |
+
]
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"cell_type": "code",
|
| 226 |
+
"execution_count": null,
|
| 227 |
+
"id": "4c644a4b395fda08",
|
| 228 |
+
"metadata": {
|
| 229 |
+
"ExecuteTime": {
|
| 230 |
+
"end_time": "2025-09-15T08:23:46.545582Z",
|
| 231 |
+
"start_time": "2025-09-15T08:23:46.531998Z"
|
| 232 |
+
}
|
| 233 |
+
},
|
| 234 |
+
"outputs": [],
|
| 235 |
+
"source": [
|
| 236 |
+
"def retrieve_semantic_recommendations(\n",
|
| 237 |
+
" query: str,\n",
|
| 238 |
+
" top_k: int = 10,\n",
|
| 239 |
+
") -> pd.DataFrame:\n",
|
| 240 |
+
" recs = db_books.similarity_search(query, k=50)\n",
|
| 241 |
+
"\n",
|
| 242 |
+
" books_list = []\n",
|
| 243 |
+
"\n",
|
| 244 |
+
" for i in range(0, len(recs)):\n",
|
| 245 |
+
" isbn_str = recs[i].page_content.strip('\"').split()[0]\n",
|
| 246 |
+
" books_list += [int(float(isbn_str))] # float() first, then int()\n",
|
| 247 |
+
"\n",
|
| 248 |
+
" return books[books[\"isbn13\"].isin(books_list)].head(top_k)"
|
| 249 |
+
]
|
| 250 |
+
},
|
| 251 |
+
{
|
| 252 |
+
"cell_type": "code",
|
| 253 |
+
"execution_count": null,
|
| 254 |
+
"id": "b9eada846c702825",
|
| 255 |
+
"metadata": {
|
| 256 |
+
"ExecuteTime": {
|
| 257 |
+
"end_time": "2025-09-15T08:23:47.659278Z",
|
| 258 |
+
"start_time": "2025-09-15T08:23:47.501425Z"
|
| 259 |
+
}
|
| 260 |
+
},
|
| 261 |
+
"outputs": [],
|
| 262 |
+
"source": [
|
| 263 |
+
"retrieve_semantic_recommendations(\"A book to teach children about nature\")"
|
| 264 |
+
]
|
| 265 |
+
},
|
| 266 |
+
{
|
| 267 |
+
"cell_type": "code",
|
| 268 |
+
"execution_count": null,
|
| 269 |
+
"id": "36d5bb5ac34f9b2d",
|
| 270 |
+
"metadata": {},
|
| 271 |
+
"outputs": [],
|
| 272 |
+
"source": []
|
| 273 |
+
}
|
| 274 |
+
],
|
| 275 |
+
"metadata": {
|
| 276 |
+
"kernelspec": {
|
| 277 |
+
"display_name": "Python 3",
|
| 278 |
+
"language": "python",
|
| 279 |
+
"name": "python3"
|
| 280 |
+
},
|
| 281 |
+
"language_info": {
|
| 282 |
+
"codemirror_mode": {
|
| 283 |
+
"name": "ipython",
|
| 284 |
+
"version": 2
|
| 285 |
+
},
|
| 286 |
+
"file_extension": ".py",
|
| 287 |
+
"mimetype": "text/x-python",
|
| 288 |
+
"name": "python",
|
| 289 |
+
"nbconvert_exporter": "python",
|
| 290 |
+
"pygments_lexer": "ipython2",
|
| 291 |
+
"version": "2.7.6"
|
| 292 |
+
}
|
| 293 |
+
},
|
| 294 |
+
"nbformat": 4,
|
| 295 |
+
"nbformat_minor": 5
|
| 296 |
+
}
|