gautamnancy commited on
Commit
17ddb51
·
verified ·
1 Parent(s): a8bb43e

Upload 6 files

Browse files
data-exploration.ipynb ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "initial_id",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2025-09-14T07:12:03.050818Z",
10
+ "start_time": "2025-09-14T07:11:56.152605Z"
11
+ }
12
+ },
13
+ "outputs": [],
14
+ "source": [
15
+ "from statistics import correlation\n",
16
+ "\n",
17
+ "import kagglehub\n",
18
+ "\n",
19
+ "# Download latest version\n",
20
+ "path = kagglehub.dataset_download(\"dylanjcastillo/7k-books-with-metadata\")\n",
21
+ "\n",
22
+ "print(\"Path to dataset files:\", path)"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "id": "ae99194daafd1775",
29
+ "metadata": {
30
+ "ExecuteTime": {
31
+ "end_time": "2025-09-14T07:51:53.432293Z",
32
+ "start_time": "2025-09-14T07:51:52.436694Z"
33
+ }
34
+ },
35
+ "outputs": [],
36
+ "source": [
37
+ "import pandas as pd"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "6df67758ebb1137c",
44
+ "metadata": {
45
+ "ExecuteTime": {
46
+ "end_time": "2025-09-14T08:03:25.179234Z",
47
+ "start_time": "2025-09-14T08:03:24.185253Z"
48
+ }
49
+ },
50
+ "outputs": [],
51
+ "source": [
52
+ "from pathlib import Path\n",
53
+ "\n",
54
+ "# Convert string path → Path object\n",
55
+ "path = Path(kagglehub.dataset_download(\"dylanjcastillo/7k-books-with-metadata\"))\n",
56
+ "\n",
57
+ "books = pd.read_csv(path / \"books.csv\")"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "id": "94828bc9ccbfafa1",
64
+ "metadata": {
65
+ "ExecuteTime": {
66
+ "end_time": "2025-09-14T08:03:37.133785Z",
67
+ "start_time": "2025-09-14T08:03:37.079170Z"
68
+ }
69
+ },
70
+ "outputs": [],
71
+ "source": [
72
+ "books"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "id": "9403c10bb9a0112e",
79
+ "metadata": {
80
+ "ExecuteTime": {
81
+ "end_time": "2025-09-14T08:12:20.943772Z",
82
+ "start_time": "2025-09-14T08:12:16.468843Z"
83
+ }
84
+ },
85
+ "outputs": [],
86
+ "source": [
87
+ "import seaborn as sns\n",
88
+ "import matplotlib.pyplot as plt"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "id": "aaba3c5cc9492dbc",
95
+ "metadata": {
96
+ "ExecuteTime": {
97
+ "end_time": "2025-09-14T08:16:47.484763Z",
98
+ "start_time": "2025-09-14T08:16:47.134190Z"
99
+ }
100
+ },
101
+ "outputs": [],
102
+ "source": [
103
+ "ax = plt.axes()\n",
104
+ "sns.heatmap(books.isna().transpose(), cbar = False , ax=ax)\n",
105
+ "\n",
106
+ "plt.xlabel(\"Columns\")\n",
107
+ "plt.ylabel(\"Missing values\")\n",
108
+ "\n",
109
+ "plt.show()"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": null,
115
+ "id": "5020d8ec7f517390",
116
+ "metadata": {
117
+ "ExecuteTime": {
118
+ "end_time": "2025-09-14T08:34:19.472432Z",
119
+ "start_time": "2025-09-14T08:34:19.396405Z"
120
+ }
121
+ },
122
+ "outputs": [],
123
+ "source": [
124
+ "import numpy as np\n",
125
+ "books[\"missing_description\"] = np.where(books[\"description\"].isna(), 1, 0)\n",
126
+ "books[\"age_of_book\"] = 2024 - books[\"published_year\"]"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": null,
132
+ "id": "8693f57773a2f2ca",
133
+ "metadata": {
134
+ "ExecuteTime": {
135
+ "end_time": "2025-09-14T08:45:22.257526Z",
136
+ "start_time": "2025-09-14T08:45:22.005185Z"
137
+ }
138
+ },
139
+ "outputs": [],
140
+ "source": [
141
+ "columns_of_interest = [\"num_pages\", \"age_of_book\", \"missing_description\", \"average_rating\"]\n",
142
+ "correlation_matrix = books[columns_of_interest].corr(method = \"spearman\")\n",
143
+ "sns.set_theme(style=\"white\")\n",
144
+ "plt.figure(figsize = (8, 6))\n",
145
+ "heatmap = sns.heatmap(correlation_matrix, annot=True, fmt=\".2f\", cmap=\"coolwarm\", cbar_kws={\"label\": \"Spearman Correlation\"})\n",
146
+ "\n",
147
+ "heatmap.set_title(\"Correlation Heatmap\")\n",
148
+ "plt.show()"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "id": "1218eb9769f7ec28",
155
+ "metadata": {
156
+ "ExecuteTime": {
157
+ "end_time": "2025-09-14T09:01:30.451492Z",
158
+ "start_time": "2025-09-14T09:01:30.397573Z"
159
+ }
160
+ },
161
+ "outputs": [],
162
+ "source": [
163
+ "books_missing = books[(books[\"description\"].isna()) |\n",
164
+ " ~(books[\"num_pages\"].isna()) &\n",
165
+ " ~(books[\"average_rating\"].isna()) &\n",
166
+ " ~(books[\"published_year\"].isna())\n",
167
+ "]"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "id": "a16b79d748237fa6",
174
+ "metadata": {
175
+ "ExecuteTime": {
176
+ "end_time": "2025-09-14T09:29:57.037634Z",
177
+ "start_time": "2025-09-14T09:29:56.971479Z"
178
+ }
179
+ },
180
+ "outputs": [],
181
+ "source": [
182
+ "books_missing = books[~(books[\"description\"].isna()) &\n",
183
+ " ~(books[\"num_pages\"].isna()) &\n",
184
+ " ~(books[\"average_rating\"].isna()) &\n",
185
+ " ~(books[\"published_year\"].isna())\n",
186
+ "]"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "id": "997cafb5e60fef34",
193
+ "metadata": {
194
+ "ExecuteTime": {
195
+ "end_time": "2025-09-14T09:30:14.028246Z",
196
+ "start_time": "2025-09-14T09:30:13.969750Z"
197
+ }
198
+ },
199
+ "outputs": [],
200
+ "source": [
201
+ "books_missing"
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": null,
207
+ "id": "6aad6ddc169cf39d",
208
+ "metadata": {
209
+ "ExecuteTime": {
210
+ "end_time": "2025-09-14T09:33:31.453933Z",
211
+ "start_time": "2025-09-14T09:33:31.395084Z"
212
+ }
213
+ },
214
+ "outputs": [],
215
+ "source": [
216
+ "books_missing[\"categories\"].value_counts().reset_index().sort_values(\"count\", ascending=False)"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": null,
222
+ "id": "a7c0628d5619c32b",
223
+ "metadata": {
224
+ "ExecuteTime": {
225
+ "end_time": "2025-09-14T09:52:43.243363Z",
226
+ "start_time": "2025-09-14T09:52:43.211576Z"
227
+ }
228
+ },
229
+ "outputs": [],
230
+ "source": [
231
+ "books_missing"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": null,
237
+ "id": "b971c57a22e2721e",
238
+ "metadata": {
239
+ "ExecuteTime": {
240
+ "end_time": "2025-09-14T10:06:37.305268Z",
241
+ "start_time": "2025-09-14T10:06:37.242773Z"
242
+ }
243
+ },
244
+ "outputs": [],
245
+ "source": [
246
+ "books_missing.loc[:, \"words_in_description\"] = books_missing[\"description\"].str.split().str.len()\n"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": null,
252
+ "id": "5cf80ede1a996820",
253
+ "metadata": {
254
+ "ExecuteTime": {
255
+ "end_time": "2025-09-14T10:07:11.889795Z",
256
+ "start_time": "2025-09-14T10:07:11.815772Z"
257
+ }
258
+ },
259
+ "outputs": [],
260
+ "source": [
261
+ "books_missing"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": null,
267
+ "id": "d4a20c7b8a28d843",
268
+ "metadata": {
269
+ "ExecuteTime": {
270
+ "end_time": "2025-09-14T10:16:26.757853Z",
271
+ "start_time": "2025-09-14T10:16:26.738194Z"
272
+ }
273
+ },
274
+ "outputs": [],
275
+ "source": [
276
+ "print(books_missing.loc[books_missing[\"words_in_description\"].between(25, 34), [\"description\", \"words_in_description\"]])\n"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": null,
282
+ "id": "add578fb79f75576",
283
+ "metadata": {
284
+ "ExecuteTime": {
285
+ "end_time": "2025-09-14T10:18:38.671378Z",
286
+ "start_time": "2025-09-14T10:18:38.655678Z"
287
+ }
288
+ },
289
+ "outputs": [],
290
+ "source": [
291
+ "books_missing_25_words = books_missing[books_missing[\"words_in_description\"] >= 25]"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": null,
297
+ "id": "337cc14a7592597",
298
+ "metadata": {
299
+ "ExecuteTime": {
300
+ "end_time": "2025-09-14T10:18:45.020133Z",
301
+ "start_time": "2025-09-14T10:18:44.995404Z"
302
+ }
303
+ },
304
+ "outputs": [],
305
+ "source": [
306
+ "books_missing_25_words"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": null,
312
+ "id": "15505042aaae206b",
313
+ "metadata": {
314
+ "ExecuteTime": {
315
+ "end_time": "2025-09-14T10:36:25.385493Z",
316
+ "start_time": "2025-09-14T10:36:25.348788Z"
317
+ }
318
+ },
319
+ "outputs": [],
320
+ "source": [
321
+ "books_missing_25_words.loc[:, \"title_and_subtitle\"] = np.where(\n",
322
+ " books_missing_25_words[\"subtitle\"].isna(),\n",
323
+ " books_missing_25_words[\"title\"],\n",
324
+ " books_missing_25_words[[\"title\", \"subtitle\"]].astype(str).agg(\": \".join, axis=1)\n",
325
+ ")\n"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": null,
331
+ "id": "8f48839b393f1be6",
332
+ "metadata": {
333
+ "ExecuteTime": {
334
+ "end_time": "2025-09-14T10:36:36.463971Z",
335
+ "start_time": "2025-09-14T10:36:36.442637Z"
336
+ }
337
+ },
338
+ "outputs": [],
339
+ "source": [
340
+ "books_missing_25_words"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": null,
346
+ "id": "1033bd78abfa34a3",
347
+ "metadata": {
348
+ "ExecuteTime": {
349
+ "end_time": "2025-09-14T10:38:24.509449Z",
350
+ "start_time": "2025-09-14T10:38:24.480830Z"
351
+ }
352
+ },
353
+ "outputs": [],
354
+ "source": [
355
+ "books_missing_25_words[\"title_and_subtitle\"].value_counts().reset_index().sort_values(\"count\", ascending=False)"
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": null,
361
+ "id": "1871d27d7eb01493",
362
+ "metadata": {
363
+ "ExecuteTime": {
364
+ "end_time": "2025-09-14T10:45:15.551772Z",
365
+ "start_time": "2025-09-14T10:45:15.504051Z"
366
+ }
367
+ },
368
+ "outputs": [],
369
+ "source": [
370
+ "books_missing_25_words = books_missing_25_words.copy()  # copy to silence SettingWithCopyWarning\n",
371
+ "\n",
372
+ "books_missing_25_words.loc[:, \"tagged_description\"] = (\n",
373
+ " books_missing_25_words[[\"isbn13\", \"description\"]]\n",
374
+ " .astype(str)\n",
375
+ " .agg(\" \".join, axis=1)\n",
376
+ ")\n"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": null,
382
+ "id": "20a704320865f12b",
383
+ "metadata": {
384
+ "ExecuteTime": {
385
+ "end_time": "2025-09-14T10:45:38.585999Z",
386
+ "start_time": "2025-09-14T10:45:38.566081Z"
387
+ }
388
+ },
389
+ "outputs": [],
390
+ "source": [
391
+ "books_missing_25_words"
392
+ ]
393
+ },
394
+ {
395
+ "cell_type": "code",
396
+ "execution_count": null,
397
+ "id": "36a89080af8a4f1c",
398
+ "metadata": {
399
+ "ExecuteTime": {
400
+ "end_time": "2025-09-14T10:49:30.500326Z",
401
+ "start_time": "2025-09-14T10:49:30.213437Z"
402
+ }
403
+ },
404
+ "outputs": [],
405
+ "source": [
406
+ "(\n",
407
+ " books_missing_25_words\n",
408
+ " .drop([\"subtitle\", \"missing_description\", \"age_of_book\", \"words_in_description\"], axis=1)\n",
409
+ " .to_csv(\"books_cleaned.csv\", index = False)\n",
410
+ ")"
411
+ ]
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": null,
416
+ "id": "a2308b29e727ba70",
417
+ "metadata": {},
418
+ "outputs": [],
419
+ "source": []
420
+ }
421
+ ],
422
+ "metadata": {
423
+ "kernelspec": {
424
+ "display_name": "Python 3",
425
+ "language": "python",
426
+ "name": "python3"
427
+ },
428
+ "language_info": {
429
+ "codemirror_mode": {
430
+ "name": "ipython",
431
+ "version": 2
432
+ },
433
+ "file_extension": ".py",
434
+ "mimetype": "text/x-python",
435
+ "name": "python",
436
+ "nbconvert_exporter": "python",
437
+ "pygments_lexer": "ipython2",
438
+ "version": "2.7.6"
439
+ }
440
+ },
441
+ "nbformat": 4,
442
+ "nbformat_minor": 5
443
+ }
gradio-dashboard.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from dotenv import load_dotenv
4
+
5
+ from langchain.schema import Document
6
+ from langchain_huggingface import HuggingFaceEmbeddings
7
+ from langchain_chroma import Chroma
8
+
9
+ import gradio as gr
10
+
11
# Load API keys / configuration from a local .env file, if present.
load_dotenv()

books = pd.read_csv("books_with_emotions.csv")

# Ask the cover API for a larger rendition; a missing thumbnail (NaN) stays
# NaN after string concatenation, so the isna() check still catches it and
# substitutes the local placeholder image.
enlarged = books["thumbnail"] + "&fife=w800"
books["large_thumbnail"] = np.where(
    enlarged.isna(),
    "cover-not-found.jpg",
    enlarged,
)

# Build the corpus in memory rather than loading from a file: one document
# per book, prefixed with its isbn13 so results can be mapped back to rows.
documents = [
    Document(page_content=f"{row['isbn13']} {row['description']}")
    for _, row in books.iterrows()
]

# Embed every description and index it in an in-memory Chroma store.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db_books = Chroma.from_documents(documents, embeddings)
30
+
31
+
32
def retrieve_semantic_recommendations(
    query: str,
    category: str = None,
    tone: str = None,
    initial_top_k: int = 50,
    final_top_k: int = 16,
) -> pd.DataFrame:
    """Return up to ``final_top_k`` books semantically matching ``query``.

    Args:
        query: Free-text description of the desired book.
        category: Value of ``simple_categories`` to filter on; ``"All"``
            (or the legacy default ``None`` behaving like a non-match)
            applies no category filter only when it equals ``"All"``.
        tone: Emotional tone label ("Happy", "Surprising", "Angry",
            "Suspenseful", "Sad"); any other value leaves order unchanged.
        initial_top_k: Number of candidates pulled from the vector store.
        final_top_k: Number of rows returned after filtering.

    Returns:
        A ``pd.DataFrame`` slice of the module-level ``books`` frame,
        ordered by similarity (then re-sorted by emotion if ``tone`` set).
    """
    recs = db_books.similarity_search(query, k=initial_top_k)

    # Each stored document begins with its isbn13 token. Parse defensively:
    # one malformed document should not abort the whole request.
    ranked_isbns = []
    for rec in recs:
        token = rec.page_content.strip('"').split()[0]
        try:
            ranked_isbns.append(int(float(token)))
        except ValueError:
            continue

    # BUG FIX: a plain isin() filter returns rows in CSV order, discarding
    # the similarity ranking, so head(final_top_k) kept the wrong books.
    # Re-sort the matches by their position in the retrieval results.
    rank = {isbn: pos for pos, isbn in enumerate(ranked_isbns)}
    book_recs = books[books["isbn13"].isin(rank)].copy()
    book_recs = book_recs.sort_values(
        by="isbn13", key=lambda s: s.map(rank)
    ).head(initial_top_k)

    if category != "All":
        book_recs = book_recs[book_recs["simple_categories"] == category].head(final_top_k)
    else:
        book_recs = book_recs.head(final_top_k)

    # Only sort by emotion if the corresponding score column exists.
    tone_to_column = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }
    emotion_column = tone_to_column.get(tone)
    if emotion_column and emotion_column in book_recs.columns:
        book_recs = book_recs.sort_values(by=emotion_column, ascending=False)

    return book_recs
62
+
63
+
64
def recommend_books(
    query: str,
    category: str,
    tone: str
):
    """Gradio callback: build (image, caption) pairs for the gallery.

    Args:
        query: User's free-text book description.
        category: Selected category ("All" disables the filter).
        tone: Selected emotional tone ("All" disables re-sorting).

    Returns:
        A list of ``(thumbnail_url, caption)`` tuples for ``gr.Gallery``.
    """
    recommendations = retrieve_semantic_recommendations(query, category, tone)
    results = []

    for _, row in recommendations.iterrows():
        # BUG FIX: description/authors can be missing, in which case pandas
        # yields a float NaN with no .split() — guard before string ops.
        description = row["description"] if isinstance(row["description"], str) else ""
        words = description.split()
        # Only append an ellipsis when the text was actually truncated.
        truncated_description = " ".join(words[:30]) + ("..." if len(words) > 30 else "")

        authors_field = row["authors"] if isinstance(row["authors"], str) else ""
        authors_split = [a.strip() for a in authors_field.split(";") if a.strip()]
        if len(authors_split) == 2:
            authors_str = f"{authors_split[0]} and {authors_split[1]}"
        elif len(authors_split) > 2:
            authors_str = f"{', '.join(authors_split[:-1])}, and {authors_split[-1]}"
        elif authors_split:
            authors_str = authors_split[0]
        else:
            authors_str = "Unknown author"

        caption = f"{row['title']} by {authors_str}: {truncated_description}"
        results.append((row["large_thumbnail"], caption))
    return results
88
+
89
# Dropdown choices: every known category, with "All" as the unfiltered
# default. NaN categories are dropped before sorting (sorted() cannot
# compare float NaN with str).
categories = ["All"] + sorted(books["simple_categories"].dropna().unique())

# Offer a tone in the UI only when its backing emotion-score column is
# actually present in the loaded data.
emotion_columns = ["joy", "surprise", "anger", "fear", "sadness"]
emotion_labels = ["Happy", "Surprising", "Angry", "Suspenseful", "Sad"]
available_emotions = [
    label
    for col, label in zip(emotion_columns, emotion_labels)
    if col in books.columns
]
tones = ["All"] + available_emotions

with gr.Blocks(theme=gr.themes.Glass()) as dashboard:
    gr.Markdown("# Semantic book recommender")

    with gr.Row():
        user_query = gr.Textbox(
            label="Please enter a description of a book:",
            placeholder="e.g., A story about forgiveness",
        )
        category_dropdown = gr.Dropdown(
            choices=categories, label="Select a category:", value="All"
        )
        tone_dropdown = gr.Dropdown(
            choices=tones, label="Select an emotional tone:", value="All"
        )
        submit_button = gr.Button("Find recommendations")

    gr.Markdown("## Recommendations")
    output = gr.Gallery(label="Recommended books", columns=8, rows=2)

    # Wire the button to the recommender; the gallery consumes the
    # (image, caption) tuples returned by recommend_books.
    submit_button.click(
        fn=recommend_books,
        inputs=[user_query, category_dropdown, tone_dropdown],
        outputs=output,
    )


if __name__ == "__main__":
    dashboard.launch()
sentiment-analysis.ipynb ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "initial_id",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2025-09-16T09:43:18.055617Z",
10
+ "start_time": "2025-09-16T09:43:17.869905Z"
11
+ }
12
+ },
13
+ "outputs": [],
14
+ "source": [
15
+ "import pandas as pd\n",
16
+ "\n",
17
+ "books = pd.read_csv(\"books_with_categories.csv\")"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "id": "3d9a521af5640cd2",
24
+ "metadata": {
25
+ "ExecuteTime": {
26
+ "end_time": "2025-09-16T09:43:20.918046Z",
27
+ "start_time": "2025-09-16T09:43:18.066451Z"
28
+ }
29
+ },
30
+ "outputs": [],
31
+ "source": [
32
+ "!pip install torch transformers\n"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "id": "a222cc24cb3d9e50",
39
+ "metadata": {
40
+ "ExecuteTime": {
41
+ "end_time": "2025-09-16T09:43:20.956314Z",
42
+ "start_time": "2025-09-16T09:43:20.934627Z"
43
+ }
44
+ },
45
+ "outputs": [],
46
+ "source": [
47
+ "import torch\n",
48
+ "import transformers\n",
49
+ "print(f\"PyTorch version: {torch.__version__}\")\n",
50
+ "print(f\"Transformers version: {transformers.__version__}\")"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "id": "418145b8ff28c108",
57
+ "metadata": {
58
+ "ExecuteTime": {
59
+ "end_time": "2025-09-16T09:43:23.555715Z",
60
+ "start_time": "2025-09-16T09:43:20.969958Z"
61
+ }
62
+ },
63
+ "outputs": [],
64
+ "source": [
65
+ "# Fix the bug by making torch available in transformers namespace\n",
66
+ "transformers.torch = torch\n",
67
+ "\n",
68
+ "from transformers import pipeline\n",
69
+ "\n",
70
+ "pipe = pipeline(\n",
71
+ " \"text-classification\",\n",
72
+ " model=\"j-hartmann/emotion-english-distilroberta-base\",\n",
73
+ " return_all_scores=True\n",
74
+ ")\n",
75
+ "\n",
76
+ "# Test it\n",
77
+ "text = \"I am so happy today!\"\n",
78
+ "result = pipe(text)\n",
79
+ "print(result)\n",
80
+ "\n",
81
+ "#top-k None\n",
82
+ "#device -- mps /cuda for warnings"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": null,
88
+ "id": "90acf250d3189ec1",
89
+ "metadata": {
90
+ "ExecuteTime": {
91
+ "end_time": "2025-09-16T09:43:23.912340Z",
92
+ "start_time": "2025-09-16T09:43:23.574192Z"
93
+ }
94
+ },
95
+ "outputs": [],
96
+ "source": [
97
+ "pipe(books[\"description\"][0])"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "id": "c9781bcf4224efd4",
104
+ "metadata": {
105
+ "ExecuteTime": {
106
+ "end_time": "2025-09-16T09:43:24.797286Z",
107
+ "start_time": "2025-09-16T09:43:23.944842Z"
108
+ }
109
+ },
110
+ "outputs": [],
111
+ "source": [
112
+ "pipe(books[\"description\"][0].split(\".\"))"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": null,
118
+ "id": "57fc949d567e3f7",
119
+ "metadata": {
120
+ "ExecuteTime": {
121
+ "end_time": "2025-09-16T09:43:25.167345Z",
122
+ "start_time": "2025-09-16T09:43:24.810715Z"
123
+ }
124
+ },
125
+ "outputs": [],
126
+ "source": [
127
+ "sentences = books[\"description\"][0].split(\".\")\n",
128
+ "predictions = pipe(sentences)"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": null,
134
+ "id": "41b5470987223a69",
135
+ "metadata": {
136
+ "ExecuteTime": {
137
+ "end_time": "2025-09-16T09:43:25.187522Z",
138
+ "start_time": "2025-09-16T09:43:25.179974Z"
139
+ }
140
+ },
141
+ "outputs": [],
142
+ "source": [
143
+ "sentences[0]"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": null,
149
+ "id": "81bb270a79fdd290",
150
+ "metadata": {
151
+ "ExecuteTime": {
152
+ "end_time": "2025-09-16T09:43:25.232413Z",
153
+ "start_time": "2025-09-16T09:43:25.225824Z"
154
+ }
155
+ },
156
+ "outputs": [],
157
+ "source": [
158
+ "predictions[0]"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": null,
164
+ "id": "d85ba7066b85eb7d",
165
+ "metadata": {
166
+ "ExecuteTime": {
167
+ "end_time": "2025-09-16T09:43:25.273001Z",
168
+ "start_time": "2025-09-16T09:43:25.267108Z"
169
+ }
170
+ },
171
+ "outputs": [],
172
+ "source": [
173
+ "sentences[4]"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": null,
179
+ "id": "8dea7d5c2077d566",
180
+ "metadata": {
181
+ "ExecuteTime": {
182
+ "end_time": "2025-09-16T09:43:25.306831Z",
183
+ "start_time": "2025-09-16T09:43:25.300457Z"
184
+ }
185
+ },
186
+ "outputs": [],
187
+ "source": [
188
+ "predictions[4]"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "id": "a540e26e090b9050",
195
+ "metadata": {
196
+ "ExecuteTime": {
197
+ "end_time": "2025-09-16T09:43:25.342124Z",
198
+ "start_time": "2025-09-16T09:43:25.334958Z"
199
+ }
200
+ },
201
+ "outputs": [],
202
+ "source": [
203
+ "sorted(predictions[0], key = lambda x: x['label'])"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "id": "a496645a7d858dcf",
210
+ "metadata": {
211
+ "ExecuteTime": {
212
+ "end_time": "2025-09-16T09:43:25.369056Z",
213
+ "start_time": "2025-09-16T09:43:25.360888Z"
214
+ }
215
+ },
216
+ "outputs": [],
217
+ "source": [
218
+ "import numpy as np\n",
219
+ "\n",
220
+ "emotion_labels = [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
221
+ "isbn = []\n",
222
+ "emotion_scores = {label: [] for label in emotion_labels}\n",
223
+ "\n",
224
+ "def calculate_max_emotion_scores(predictions):\n",
225
+ " per_emotion_scores = {label: [] for label in emotion_labels}\n",
226
+ " for prediction in predictions:\n",
227
+ " sorted_predictions = sorted(prediction, key=lambda x: x['label'], reverse=True)\n",
228
+ " for index, label in enumerate(emotion_labels):\n",
229
+ " per_emotion_scores[label].append(sorted_predictions[index]['score'])\n",
230
+ " return {label: np.max(scores) for label, scores in per_emotion_scores.items()}"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": null,
236
+ "id": "b911145893e482f3",
237
+ "metadata": {
238
+ "ExecuteTime": {
239
+ "end_time": "2025-09-16T09:43:27.622710Z",
240
+ "start_time": "2025-09-16T09:43:25.385198Z"
241
+ }
242
+ },
243
+ "outputs": [],
244
+ "source": [
245
+ "for i, row in books.head(10).iterrows():\n",
246
+ " isbn.append(str(row[\"isbn13\"]))\n",
247
+ "\n",
248
+ " sentences = str(row[\"description\"]).split(\".\")\n",
249
+ " predictions = pipe(sentences)\n",
250
+ " max_scores = calculate_max_emotion_scores(predictions)\n",
251
+ "\n",
252
+ " for label in emotion_labels:\n",
253
+ " # force conversion to Python float\n",
254
+ " emotion_scores[label].append(float(max_scores[label]))\n"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": null,
260
+ "id": "b1e1e2960a0314b3",
261
+ "metadata": {
262
+ "ExecuteTime": {
263
+ "end_time": "2025-09-16T09:43:27.634705Z",
264
+ "start_time": "2025-09-16T09:43:27.630600Z"
265
+ }
266
+ },
267
+ "outputs": [],
268
+ "source": [
269
+ "emotion_scores = {\n",
270
+ " label: [float(x) for x in scores]\n",
271
+ " for label, scores in emotion_scores.items()\n",
272
+ "}\n"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": null,
278
+ "id": "b4d6abd593a32daa",
279
+ "metadata": {
280
+ "ExecuteTime": {
281
+ "end_time": "2025-09-16T09:43:27.652229Z",
282
+ "start_time": "2025-09-16T09:43:27.644745Z"
283
+ }
284
+ },
285
+ "outputs": [],
286
+ "source": [
287
+ "emotion_scores"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": null,
293
+ "id": "632f787a4b7d3eaf",
294
+ "metadata": {
295
+ "ExecuteTime": {
296
+ "end_time": "2025-09-16T11:10:46.250826Z",
297
+ "start_time": "2025-09-16T10:58:12.182208Z"
298
+ }
299
+ },
300
+ "outputs": [],
301
+ "source": [
302
+ "import pandas as pd\n",
303
+ "import numpy as np\n",
304
+ "from tqdm import tqdm\n",
305
+ "from transformers import pipeline\n",
306
+ "\n",
307
+ "# Initialize the emotion analysis pipeline\n",
308
+ "pipe = pipeline(\"text-classification\", model=\"j-hartmann/emotion-english-distilroberta-base\", top_k=None)\n",
309
+ "\n",
310
+ "# Load your books data\n",
311
+ "books = pd.read_csv(\"books_with_categories.csv\") # Replace with your actual file name\n",
312
+ "\n",
313
+ "emotion_labels = [\"anger\", \"disgust\", \"fear\", \"joy\", \"sadness\", \"surprise\", \"neutral\"]\n",
314
+ "isbn = []\n",
315
+ "emotion_scores = {label: [] for label in emotion_labels}\n",
316
+ "\n",
317
+ "def calculate_max_emotion_scores(predictions):\n",
318
+ " \"\"\"Calculate maximum emotion scores across all sentences\"\"\"\n",
319
+ " per_emotion_scores = {label: [] for label in emotion_labels}\n",
320
+ "\n",
321
+ " for prediction in predictions:\n",
322
+ " # Create a dictionary for easy lookup by label\n",
323
+ " prediction_dict = {pred['label']: pred['score'] for pred in prediction}\n",
324
+ "\n",
325
+ " # Add scores for each emotion label\n",
326
+ " for label in emotion_labels:\n",
327
+ " score = prediction_dict.get(label, 0.0) # Default to 0 if label not found\n",
328
+ " per_emotion_scores[label].append(score)\n",
329
+ "\n",
330
+ " # Return maximum score for each emotion across all sentences\n",
331
+ " return {label: np.max(scores) if scores else 0.0 for label, scores in per_emotion_scores.items()}\n",
332
+ "\n",
333
+ "print(\"Processing emotions for books...\")\n",
334
+ "for i, row in tqdm(books.iterrows(), total=len(books)):\n",
335
+ " isbn.append(str(row[\"isbn13\"]))\n",
336
+ "\n",
337
+ " # Handle missing descriptions\n",
338
+ " description = str(row[\"description\"]) if pd.notna(row[\"description\"]) else \"\"\n",
339
+ "\n",
340
+ " if description and description != \"nan\":\n",
341
+ " # Split into sentences and filter out empty ones\n",
342
+ " sentences = [s.strip() for s in description.split(\".\") if s.strip()]\n",
343
+ "\n",
344
+ " if sentences:\n",
345
+ " try:\n",
346
+ " predictions = pipe(sentences)\n",
347
+ " max_scores = calculate_max_emotion_scores(predictions)\n",
348
+ " except Exception as e:\n",
349
+ " print(f\"Error processing book {row['isbn13']}: {e}\")\n",
350
+ " # Use default scores if processing fails\n",
351
+ " max_scores = {label: 0.0 for label in emotion_labels}\n",
352
+ " else:\n",
353
+ " # Empty description\n",
354
+ " max_scores = {label: 0.0 for label in emotion_labels}\n",
355
+ " else:\n",
356
+ " # No description available\n",
357
+ " max_scores = {label: 0.0 for label in emotion_labels}\n",
358
+ "\n",
359
+ " # Add scores to our lists\n",
360
+ " for label in emotion_labels:\n",
361
+ " emotion_scores[label].append(float(max_scores[label]))"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": null,
367
+ "id": "31dfb34d4f4aee9a",
368
+ "metadata": {
369
+ "ExecuteTime": {
370
+ "end_time": "2025-09-16T11:11:55.455696Z",
371
+ "start_time": "2025-09-16T11:11:55.422818Z"
372
+ }
373
+ },
374
+ "outputs": [],
375
+ "source": [
376
+ "# Create emotion DataFrame\n",
377
+ "emotion_df = pd.DataFrame(emotion_scores)\n",
378
+ "emotion_df['isbn13'] = isbn\n",
379
+ "\n",
380
+ "print(\"Emotion processing completed!\")\n",
381
+ "print(\"Sample emotion scores:\")\n",
382
+ "print(emotion_df.head(10))"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": null,
388
+ "id": "8db5e8f5cee59321",
389
+ "metadata": {
390
+ "ExecuteTime": {
391
+ "end_time": "2025-09-16T11:12:15.342595Z",
392
+ "start_time": "2025-09-16T11:12:14.895173Z"
393
+ }
394
+ },
395
+ "outputs": [],
396
+ "source": [
397
+ "books['isbn13'] = books['isbn13'].astype(str).str.replace('.0', '', regex=False)\n",
398
+ "emotion_df['isbn13'] = emotion_df['isbn13'].astype(str).str.replace('.0', '', regex=False)\n",
399
+ "\n",
400
+ "print(\"Data types before merge:\")\n",
401
+ "print(f\"Books isbn13 dtype: {books['isbn13'].dtype}\")\n",
402
+ "print(f\"Emotion isbn13 dtype: {emotion_df['isbn13'].dtype}\")\n",
403
+ "\n",
404
+ "# Merge emotion scores back to the original books DataFrame\n",
405
+ "books_with_emotions = books.merge(emotion_df, on='isbn13', how='left')\n",
406
+ "\n",
407
+ "# Save the combined DataFrame\n",
408
+ "books_with_emotions.to_csv(\"books_with_emotions.csv\", index=False)\n",
409
+ "\n",
410
+ "print(f\"Saved books with emotions to 'books_with_emotions.csv'\")\n",
411
+ "print(f\"Total books processed: {len(books_with_emotions)}\")\n",
412
+ "print(\"Available columns:\", books_with_emotions.columns.tolist())"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "id": "e1cc83da7893e926",
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": []
422
+ }
423
+ ],
424
+ "metadata": {
425
+ "kernelspec": {
426
+ "display_name": "Python 3",
427
+ "language": "python",
428
+ "name": "python3"
429
+ },
430
+ "language_info": {
431
+ "codemirror_mode": {
432
+ "name": "ipython",
433
+ "version": 2
434
+ },
435
+ "file_extension": ".py",
436
+ "mimetype": "text/x-python",
437
+ "name": "python",
438
+ "nbconvert_exporter": "python",
439
+ "pygments_lexer": "ipython2",
440
+ "version": "2.7.6"
441
+ }
442
+ },
443
+ "nbformat": 4,
444
+ "nbformat_minor": 5
445
+ }
tagged_description.txt ADDED
The diff for this file is too large to render. See raw diff
 
text-classification.ipynb ADDED
@@ -0,0 +1,591 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "initial_id",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2025-09-16T03:30:06.906158Z",
10
+ "start_time": "2025-09-16T03:30:06.897210Z"
11
+ }
12
+ },
13
+ "outputs": [],
14
+ "source": [
15
+ "import numpy as np\n",
16
+ "\n",
17
+ "category_mapping = {'Fiction' : 'Fiction',\n",
18
+ " 'Juvenile Fiction' : \"Children's Fiction\",\n",
19
+ " 'Biography & Autobiography' : 'Nonfiction',\n",
20
+ " 'History' : 'Nonfiction',\n",
21
+ " 'Literary Criticism' : 'Nonfiction',\n",
22
+ " 'Philosophy' : 'Nonfiction',\n",
23
+ " 'Religion' : 'Nonfiction',\n",
24
+ " 'Comics & Graphic Novels' : 'Fiction',\n",
25
+ " 'Juvenile Nonfiction' : \"Children's Nonfiction\",\n",
26
+ " 'Science' : 'Nonfiction',\n",
27
+ " 'Poetry' : 'Fiction',\n",
28
+ " }"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "id": "abd407fcfb12529f",
35
+ "metadata": {
36
+ "ExecuteTime": {
37
+ "end_time": "2025-09-15T09:11:18.779297Z",
38
+ "start_time": "2025-09-15T09:11:18.685368Z"
39
+ }
40
+ },
41
+ "outputs": [],
42
+ "source": [
43
+ "import pandas as pd\n",
44
+ "books = pd.read_csv(\"books_cleaned.csv\")"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "id": "8730b04764af7caa",
51
+ "metadata": {
52
+ "ExecuteTime": {
53
+ "end_time": "2025-09-15T09:12:06.202207Z",
54
+ "start_time": "2025-09-15T09:12:06.190052Z"
55
+ }
56
+ },
57
+ "outputs": [],
58
+ "source": [
59
+ "books['simple_categories'] = books['categories'].map(category_mapping)"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "id": "17b0fe2cfe81778b",
66
+ "metadata": {
67
+ "ExecuteTime": {
68
+ "end_time": "2025-09-15T09:13:56.419141Z",
69
+ "start_time": "2025-09-15T09:13:56.325655Z"
70
+ }
71
+ },
72
+ "outputs": [],
73
+ "source": [
74
+ "books"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": null,
80
+ "id": "410d16934dfe2383",
81
+ "metadata": {
82
+ "ExecuteTime": {
83
+ "end_time": "2025-09-15T09:39:48.441516Z",
84
+ "start_time": "2025-09-15T09:39:48.396466Z"
85
+ }
86
+ },
87
+ "outputs": [],
88
+ "source": [
89
+ "books[~(books['simple_categories'].isna())]"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "id": "a0d8dcd913296e3d",
96
+ "metadata": {
97
+ "ExecuteTime": {
98
+ "end_time": "2025-09-15T10:23:22.076926Z",
99
+ "start_time": "2025-09-15T10:20:53.043882Z"
100
+ }
101
+ },
102
+ "outputs": [],
103
+ "source": [
104
+ "!pip install hf_xet\n",
105
+ "from transformers import pipeline\n",
106
+ "\n",
107
+ "fiction_categories = ['Fiction', 'Nonfiction']\n",
108
+ "pipe = pipeline(\"zero-shot-classification\",model=\"facebook/bart-large-mnli\", device=\"cuda\")\n"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": null,
114
+ "id": "cd9edaa3ee8c1243",
115
+ "metadata": {
116
+ "ExecuteTime": {
117
+ "end_time": "2025-09-15T10:23:46.232544Z",
118
+ "start_time": "2025-09-15T10:23:43.525543Z"
119
+ }
120
+ },
121
+ "outputs": [],
122
+ "source": [
123
+ "!pip install --upgrade huggingface_hub\n"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "id": "83b78716648ebbe6",
130
+ "metadata": {
131
+ "ExecuteTime": {
132
+ "end_time": "2025-09-15T10:23:55.154934Z",
133
+ "start_time": "2025-09-15T10:23:53.226725Z"
134
+ }
135
+ },
136
+ "outputs": [],
137
+ "source": [
138
+ "!pip install \"huggingface_hub[hf_xet]\"\n"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "id": "8d02bd90c594fbac",
145
+ "metadata": {
146
+ "ExecuteTime": {
147
+ "end_time": "2025-09-15T10:24:14.628937Z",
148
+ "start_time": "2025-09-15T10:24:12.758899Z"
149
+ }
150
+ },
151
+ "outputs": [],
152
+ "source": [
153
+ "!pip show huggingface_hub\n",
154
+ "!pip show hf_xet\n"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": null,
160
+ "id": "83e5151bdc46709a",
161
+ "metadata": {
162
+ "ExecuteTime": {
163
+ "end_time": "2025-09-16T04:10:04.964668Z",
164
+ "start_time": "2025-09-16T04:10:01.587200Z"
165
+ }
166
+ },
167
+ "outputs": [],
168
+ "source": [
169
+ "from transformers import pipeline\n",
170
+ "import torch\n",
171
+ "import os\n",
172
+ "\n",
173
+ "print(\"Loading model... (this may take a few minutes on first run)\")\n",
174
+ "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
175
+ "if torch.cuda.is_available():\n",
176
+ " print(f\"GPU device: {torch.cuda.get_device_name(0)}\")\n",
177
+ "\n",
178
+ "# CRITICAL: Add GPU support and optimization parameters\n",
179
+ "try:\n",
180
+ " os.environ[\"HF_HUB_DOWNLOAD_TIMEOUT\"] = \"120\"\n",
181
+ "\n",
182
+ " pipe = pipeline(\n",
183
+ " \"zero-shot-classification\",\n",
184
+ " model=\"facebook/bart-large-mnli\",\n",
185
+ " device=0 if torch.cuda.is_available() else -1, # Use GPU if available\n",
186
+ " batch_size=64, # Internal pipeline batch size\n",
187
+ " max_length=512, # Truncate long texts\n",
188
+ " truncation=True,\n",
189
+ " use_auth_token=False,\n",
190
+ " revision=\"main\"\n",
191
+ " )\n",
192
+ "\n",
193
+ " print(\"✅ Model loaded successfully with GPU acceleration!\" if torch.cuda.is_available() else \"✅ Model loaded (CPU mode)\")\n",
194
+ "\n",
195
+ "except Exception as e:\n",
196
+ " print(f\"Error with facebook/bart-large-mnli: {e}\")\n",
197
+ " print(\"\\n🔄 Trying alternative model...\")\n",
198
+ "\n",
199
+ " try:\n",
200
+ " pipe = pipeline(\n",
201
+ " \"zero-shot-classification\",\n",
202
+ " model=\"typeform/distilbert-base-uncased-mnli\",\n",
203
+ " device=0 if torch.cuda.is_available() else -1, # GPU support\n",
204
+ " batch_size=64,\n",
205
+ " max_length=512,\n",
206
+ " truncation=True\n",
207
+ " )\n",
208
+ "\n",
209
+ " print(\"✅ Alternative model loaded successfully!\")\n",
210
+ "\n",
211
+ " except Exception as e2:\n",
212
+ " print(f\"❌ Error with alternative model: {e2}\")\n",
213
+ " print(\"Please check your internet connection and try again.\")\n"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "id": "80bc187fbfff3e10",
220
+ "metadata": {
221
+ "ExecuteTime": {
222
+ "end_time": "2025-09-16T04:15:12.296956Z",
223
+ "start_time": "2025-09-16T04:15:12.116659Z"
224
+ }
225
+ },
226
+ "outputs": [],
227
+ "source": [
228
+ "sequence = books.loc[books[\"simple_categories\"] == 'Fiction', 'description'].reset_index(drop=True)[0]"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": null,
234
+ "id": "8ba6836b2c958329",
235
+ "metadata": {
236
+ "ExecuteTime": {
237
+ "end_time": "2025-09-16T04:15:21.478795Z",
238
+ "start_time": "2025-09-16T04:15:14.044833Z"
239
+ }
240
+ },
241
+ "outputs": [],
242
+ "source": [
243
+ "pipe(sequence, fiction_categories)"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": null,
249
+ "id": "23f2c1d7a1c73945",
250
+ "metadata": {
251
+ "ExecuteTime": {
252
+ "end_time": "2025-09-16T03:30:19.621730Z",
253
+ "start_time": "2025-09-16T03:30:12.489364Z"
254
+ }
255
+ },
256
+ "outputs": [],
257
+ "source": [
258
+ "result = pipe(sequence, fiction_categories)\n",
259
+ "max_index = np.argmax(result[\"scores\"])\n",
+ "max_label = result[\"labels\"][max_index]\n",
260
+ "max_label"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": null,
266
+ "id": "eb1273971a44738c",
267
+ "metadata": {
268
+ "ExecuteTime": {
269
+ "end_time": "2025-09-16T04:15:21.672845Z",
270
+ "start_time": "2025-09-16T04:15:21.660563Z"
271
+ }
272
+ },
273
+ "outputs": [],
274
+ "source": [
275
+ "from tqdm import tqdm\n",
276
+ "import pandas as pd\n",
277
+ "import time\n",
278
+ "from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor\n",
279
+ "import multiprocessing as mp\n",
280
+ "\n",
281
+ "# SOLUTION 1: Batch Processing (Most Important!)\n",
282
+ "def generate_predictions(sequences, categories, batch_size=32):\n",
283
+ " \"\"\"Process multiple sequences at once - much faster!\"\"\"\n",
284
+ " predictions = []\n",
285
+ "\n",
286
+ " for i in tqdm(range(0, len(sequences), batch_size), desc=\"Processing batches\"):\n",
287
+ " batch = sequences[i:i+batch_size]\n",
288
+ "\n",
289
+ " # Process entire batch at once\n",
290
+ " batch_results = pipe(batch, categories)\n",
291
+ "\n",
292
+ " # Handle both single result and list of results\n",
293
+ " if isinstance(batch_results, list):\n",
294
+ " predictions.extend([result['labels'][0] for result in batch_results])\n",
295
+ " else:\n",
296
+ " predictions.append(batch_results['labels'][0])\n",
297
+ "\n",
298
+ " return predictions"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": null,
304
+ "id": "7d024a18309a521d",
305
+ "metadata": {
306
+ "ExecuteTime": {
307
+ "end_time": "2025-09-16T04:21:02.847544Z",
308
+ "start_time": "2025-09-16T04:15:23.714181Z"
309
+ }
310
+ },
311
+ "outputs": [],
312
+ "source": [
313
+ "# Get 300 nonfiction descriptions\n",
314
+ "nonfiction_books = books.loc[books[\"simple_categories\"] == 'Nonfiction', 'description'].reset_index(drop=True)[:300]\n",
315
+ "\n",
316
+ "# Truncate for speed\n",
317
+ "sequences = [desc[:400] for desc in nonfiction_books]\n",
318
+ "\n",
319
+ "# Process in batches of 20 (instead of 300 individual calls)\n",
320
+ "preddicted_cats = []\n",
+ "actual_cats = []\n",
+ "batch_size = 20\n",
321
+ "for i in tqdm(range(0, len(sequences), batch_size)):\n",
322
+ " batch = sequences[i:i+batch_size]\n",
323
+ "\n",
324
+ " # One model call for 20 books instead of 20 separate calls\n",
325
+ " results = pipe(batch, fiction_categories)\n",
326
+ "\n",
327
+ " # Extract predictions\n",
328
+ " if isinstance(results, list):\n",
329
+ " preddicted_cats += [r['labels'][0] for r in results]\n",
330
+ " else:\n",
331
+ " preddicted_cats += [results['labels'][0]]\n",
332
+ "\n",
333
+ " actual_cats += ['Nonfiction'] * len(batch)"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "id": "fdc40689dfadf1",
340
+ "metadata": {
341
+ "ExecuteTime": {
342
+ "end_time": "2025-09-16T04:21:08.483550Z",
343
+ "start_time": "2025-09-16T04:21:08.405904Z"
344
+ }
345
+ },
346
+ "outputs": [],
347
+ "source": [
348
+ "predicted_df = pd.DataFrame({\"actual_categories\": actual_cats, \"predicted_categories\": preddicted_cats})"
349
+ ]
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "execution_count": null,
354
+ "id": "ed0907a9093b94d0",
355
+ "metadata": {
356
+ "ExecuteTime": {
357
+ "end_time": "2025-09-16T04:21:16.539324Z",
358
+ "start_time": "2025-09-16T04:21:16.384515Z"
359
+ }
360
+ },
361
+ "outputs": [],
362
+ "source": [
363
+ "predicted_df.head()"
364
+ ]
365
+ },
366
+ {
367
+ "cell_type": "code",
368
+ "execution_count": null,
369
+ "id": "87d924edea28b476",
370
+ "metadata": {
371
+ "ExecuteTime": {
372
+ "end_time": "2025-09-16T04:21:19.825460Z",
373
+ "start_time": "2025-09-16T04:21:19.795117Z"
374
+ }
375
+ },
376
+ "outputs": [],
377
+ "source": [
378
+ "predicted_df['correct_prediction'] = (np.where(predicted_df['actual_categories'] == predicted_df['predicted_categories'], 1, 0)\n",
379
+ " )"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "execution_count": null,
385
+ "id": "6c25043f2e0d694a",
386
+ "metadata": {
387
+ "ExecuteTime": {
388
+ "end_time": "2025-09-16T04:21:22.040362Z",
389
+ "start_time": "2025-09-16T04:21:22.019264Z"
390
+ }
391
+ },
392
+ "outputs": [],
393
+ "source": [
394
+ "predicted_df['correct_prediction'].sum()/len(predicted_df)"
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "execution_count": null,
400
+ "id": "3c3611fc62b1d8df",
401
+ "metadata": {
402
+ "ExecuteTime": {
403
+ "end_time": "2025-09-16T04:21:24.159383Z",
404
+ "start_time": "2025-09-16T04:21:24.001792Z"
405
+ }
406
+ },
407
+ "outputs": [],
408
+ "source": [
409
+ "isbns = []\n",
410
+ "predicted_cats = []\n",
411
+ "\n",
412
+ "missing_cats = books.loc[books['simple_categories'].isna(), ['isbn13', 'description']].reset_index(drop=True)"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "id": "5a6ee7c312cc4605",
419
+ "metadata": {
420
+ "ExecuteTime": {
421
+ "end_time": "2025-09-16T04:48:29.368260Z",
422
+ "start_time": "2025-09-16T04:47:55.181816Z"
423
+ }
424
+ },
425
+ "outputs": [],
426
+ "source": [
427
+ "# Your current code (already run - don't re-run!)\n",
428
+ "sequences = [str(desc)[:200] if pd.notna(desc) else \"\" for desc in missing_cats[\"description\"]]\n",
429
+ "sequences = [seq for seq in sequences if seq.strip()] # This changed the length!\n",
430
+ "isbns = missing_cats[\"isbn13\"].tolist()\n",
431
+ "predicted_cats = generate_predictions(sequences, fiction_categories, batch_size=128)\n",
432
+ "\n",
433
+ "\n"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "execution_count": null,
439
+ "id": "4561a0670452fa3b",
440
+ "metadata": {
441
+ "ExecuteTime": {
442
+ "end_time": "2025-09-16T04:51:30.775050Z",
443
+ "start_time": "2025-09-16T04:51:30.573483Z"
444
+ }
445
+ },
446
+ "outputs": [],
447
+ "source": [
448
+ "# FIX: Get the correct ISBNs that match your filtered sequences\n",
449
+ "descriptions = missing_cats[\"description\"].tolist()\n",
450
+ "isbns_full = missing_cats[\"isbn13\"].tolist()\n",
451
+ "\n",
452
+ "matching_isbns = []\n",
453
+ "for i, desc in enumerate(descriptions):\n",
454
+ " processed_desc = str(desc)[:200] if pd.notna(desc) else \"\"\n",
455
+ " if processed_desc.strip(): # Same condition as your filter\n",
456
+ " matching_isbns.append(isbns_full[i])\n",
457
+ "\n",
458
+ "# Now create DataFrame with matching lengths\n",
459
+ "missing_predicted_df = pd.DataFrame({\n",
460
+ " \"isbn13\": matching_isbns[:len(predicted_cats)], # Safety check\n",
461
+ " \"predicted_categories\": predicted_cats\n",
462
+ "})\n",
463
+ "\n",
464
+ "print(f\"✅ DataFrame created successfully with {len(missing_predicted_df)} rows\")\n",
465
+ "print(f\"📊 Predictions by category:\")\n",
466
+ "print(missing_predicted_df['predicted_categories'].value_counts())\n",
467
+ "\n",
468
+ "# Save results\n",
469
+ "missing_predicted_df.to_csv('missing_categories_predictions.csv', index=False)\n",
470
+ "print(\"💾 Results saved to missing_categories_predictions.csv\")"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": null,
476
+ "id": "72fe9a8b4b28a1c6",
477
+ "metadata": {
478
+ "ExecuteTime": {
479
+ "end_time": "2025-09-16T04:52:15.607087Z",
480
+ "start_time": "2025-09-16T04:52:15.520116Z"
481
+ }
482
+ },
483
+ "outputs": [],
484
+ "source": [
485
+ "missing_predicted_df.head()"
486
+ ]
487
+ },
488
+ {
489
+ "cell_type": "code",
490
+ "execution_count": null,
491
+ "id": "8b1f7af8aebf289e",
492
+ "metadata": {
493
+ "ExecuteTime": {
494
+ "end_time": "2025-09-16T05:00:37.137998Z",
495
+ "start_time": "2025-09-16T05:00:36.885979Z"
496
+ }
497
+ },
498
+ "outputs": [],
499
+ "source": [
500
+ "books = pd.merge(books, missing_predicted_df, on=\"isbn13\", how=\"left\")\n",
501
+ "books[\"simple_categories\"] = np.where(books[\"simple_categories\"].isna(), books[\"predicted_categories\"], books[\"simple_categories\"])\n",
502
+ "books = books.drop(columns=\"predicted_categories\")"
503
+ ]
504
+ },
505
+ {
506
+ "cell_type": "code",
507
+ "execution_count": null,
508
+ "id": "fe5b161193dab1f",
509
+ "metadata": {
510
+ "ExecuteTime": {
511
+ "end_time": "2025-09-16T05:00:50.989276Z",
512
+ "start_time": "2025-09-16T05:00:50.952202Z"
513
+ }
514
+ },
515
+ "outputs": [],
516
+ "source": [
517
+ "books"
518
+ ]
519
+ },
520
+ {
521
+ "cell_type": "code",
522
+ "execution_count": null,
523
+ "id": "9d2e1a8dbbd5d6bc",
524
+ "metadata": {
525
+ "ExecuteTime": {
526
+ "end_time": "2025-09-16T05:01:27.850818Z",
527
+ "start_time": "2025-09-16T05:01:27.781563Z"
528
+ }
529
+ },
530
+ "outputs": [],
531
+ "source": [
532
+ "books[books[\"categories\"].str.lower().isin([\n",
533
+ " \"romance\",\n",
534
+ " \"science fiction\",\n",
535
+ " \"scifi\",\n",
536
+ " \"fantasy\",\n",
537
+ " \"horror\",\n",
538
+ " \"mystery\",\n",
539
+ " \"thriller\",\n",
540
+ " \"comedy\",\n",
541
+ " \"crime\",\n",
542
+ " \"historical\"\n",
543
+ "])]"
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "code",
548
+ "execution_count": null,
549
+ "id": "bd067ee0696cac0b",
550
+ "metadata": {
551
+ "ExecuteTime": {
552
+ "end_time": "2025-09-16T05:04:09.432347Z",
553
+ "start_time": "2025-09-16T05:04:09.246658Z"
554
+ }
555
+ },
556
+ "outputs": [],
557
+ "source": [
558
+ "books.to_csv(\"books_with_categories.csv\", index=False)"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": null,
564
+ "id": "f8879607442c3f0f",
565
+ "metadata": {},
566
+ "outputs": [],
567
+ "source": []
568
+ }
569
+ ],
570
+ "metadata": {
571
+ "kernelspec": {
572
+ "display_name": "Python 3",
573
+ "language": "python",
574
+ "name": "python3"
575
+ },
576
+ "language_info": {
577
+ "codemirror_mode": {
578
+ "name": "ipython",
579
+ "version": 2
580
+ },
581
+ "file_extension": ".py",
582
+ "mimetype": "text/x-python",
583
+ "name": "python",
584
+ "nbconvert_exporter": "python",
585
+ "pygments_lexer": "ipython2",
586
+ "version": "2.7.6"
587
+ }
588
+ },
589
+ "nbformat": 4,
590
+ "nbformat_minor": 5
591
+ }
vector-search.ipynb ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "initial_id",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2025-09-15T07:35:25.488414Z",
10
+ "start_time": "2025-09-15T07:35:25.460656Z"
11
+ }
12
+ },
13
+ "outputs": [],
14
+ "source": [
15
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
16
+ "\n",
17
+ "from langchain_text_splitters import CharacterTextSplitter\n",
18
+ "from langchain_openai import OpenAIEmbeddings\n",
19
+ "from langchain_chroma import Chroma\n",
20
+ "from langchain_core.documents import Document\n"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "id": "9e2d7510161fceb6",
27
+ "metadata": {
28
+ "ExecuteTime": {
29
+ "end_time": "2025-09-15T07:35:27.755330Z",
30
+ "start_time": "2025-09-15T07:35:27.736857Z"
31
+ }
32
+ },
33
+ "outputs": [],
34
+ "source": [
35
+ "from dotenv import load_dotenv\n",
36
+ "\n",
37
+ "import os\n",
38
+ "\n",
39
+ "# Load environment variables\n",
40
+ "load_dotenv()\n"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": null,
46
+ "id": "b1c5ca1012315fd2",
47
+ "metadata": {
48
+ "ExecuteTime": {
49
+ "end_time": "2025-09-15T07:35:30.169857Z",
50
+ "start_time": "2025-09-15T07:35:30.074451Z"
51
+ }
52
+ },
53
+ "outputs": [],
54
+ "source": [
55
+ "import pandas as pd\n",
56
+ "\n",
57
+ "books = pd.read_csv(\"books_cleaned.csv\")"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "id": "694a28505e311eea",
64
+ "metadata": {
65
+ "ExecuteTime": {
66
+ "end_time": "2025-09-15T07:35:32.747269Z",
67
+ "start_time": "2025-09-15T07:35:32.725973Z"
68
+ }
69
+ },
70
+ "outputs": [],
71
+ "source": [
72
+ "books"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "id": "eb17356cf0ecbbef",
79
+ "metadata": {
80
+ "ExecuteTime": {
81
+ "end_time": "2025-09-15T07:35:35.292093Z",
82
+ "start_time": "2025-09-15T07:35:35.243618Z"
83
+ }
84
+ },
85
+ "outputs": [],
86
+ "source": [
87
+ "books[\"tagged_description\"].to_csv(\"tagged_description.txt\",\n",
88
+ " index=False,\n",
89
+ " header=False)\n"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "id": "2db289c35716805c",
96
+ "metadata": {
97
+ "ExecuteTime": {
98
+ "end_time": "2025-09-15T07:35:42.877672Z",
99
+ "start_time": "2025-09-15T07:35:42.683378Z"
100
+ }
101
+ },
102
+ "outputs": [],
103
+ "source": [
104
+ "\n",
105
+ "# Load the file manually (more reliable)\n",
106
+ "with open(\"tagged_description.txt\", 'r', encoding='utf-8') as file:\n",
107
+ " content = file.read()\n",
108
+ "\n",
109
+ "# Create a document object\n",
110
+ "raw_documents = [Document(page_content=content)]\n",
111
+ "\n",
112
+ "# Split into chunks\n",
113
+ "text_splitter = CharacterTextSplitter(\n",
114
+ " chunk_size=1500, # Increased to avoid warnings\n",
115
+ " chunk_overlap=150,\n",
116
+ " separator=\"\\n\"\n",
117
+ ")\n",
118
+ "\n",
119
+ "documents = text_splitter.split_documents(raw_documents)\n",
120
+ "\n",
121
+ "print(f\"Successfully created {len(documents)} chunks\")\n",
122
+ "print(f\"First chunk preview:\\n{documents[0].page_content[:200]}...\")"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": null,
128
+ "id": "12d6dc1c1f518682",
129
+ "metadata": {
130
+ "ExecuteTime": {
131
+ "end_time": "2025-09-15T07:35:45.472985Z",
132
+ "start_time": "2025-09-15T07:35:45.467714Z"
133
+ }
134
+ },
135
+ "outputs": [],
136
+ "source": [
137
+ "documents[0]"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": null,
143
+ "id": "d73b0e5261855919",
144
+ "metadata": {
145
+ "ExecuteTime": {
146
+ "end_time": "2025-09-15T07:47:27.888830Z",
147
+ "start_time": "2025-09-15T07:36:56.075724Z"
148
+ }
149
+ },
150
+ "outputs": [],
151
+ "source": [
152
+ "!pip install sentence_transformers\n",
153
+ "embeddings = HuggingFaceEmbeddings(\n",
154
+ " model_name=\"all-MiniLM-L6-v2\", # Free, fast, and good quality\n",
155
+ " model_kwargs={'device': 'cpu'} # Use 'cuda' if you have a GPU\n",
156
+ ")\n",
157
+ "\n",
158
+ "db_books = Chroma.from_documents(\n",
159
+ " documents,\n",
160
+ " embedding=embeddings\n",
161
+ ")"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "markdown",
166
+ "id": "9473a4b393977d6f",
167
+ "metadata": {},
168
+ "source": []
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "id": "8c28a61479deb520",
174
+ "metadata": {
175
+ "ExecuteTime": {
176
+ "end_time": "2025-09-15T07:47:49.568125Z",
177
+ "start_time": "2025-09-15T07:47:49.337737Z"
178
+ }
179
+ },
180
+ "outputs": [],
181
+ "source": [
182
+ "query = \"A book to teach children about nature\"\n",
183
+ "docs = db_books.similarity_search(query, k = 10)\n",
184
+ "docs"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": null,
190
+ "id": "57cebcff1d436b6a",
191
+ "metadata": {
192
+ "ExecuteTime": {
193
+ "end_time": "2025-09-15T07:54:09.612026Z",
194
+ "start_time": "2025-09-15T07:54:09.538027Z"
195
+ }
196
+ },
197
+ "outputs": [],
198
+ "source": [
199
+ "# Extract and clean the ISBN\n",
200
+ "isbn_str = docs[0].page_content.split()[0].strip()\n",
201
+ "# Remove quotes and convert to float first, then int\n",
202
+ "isbn_clean = isbn_str.replace('\"', '').replace(\"'\", \"\")\n",
203
+ "isbn_int = int(float(isbn_clean)) # float first to handle .0, then int\n",
204
+ "\n",
205
+ "# Now search\n",
206
+ "result = books[books[\"isbn13\"] == isbn_int]"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": null,
212
+ "id": "4155cc001df44e93",
213
+ "metadata": {
214
+ "ExecuteTime": {
215
+ "end_time": "2025-09-15T07:54:49.157935Z",
216
+ "start_time": "2025-09-15T07:54:49.088922Z"
217
+ }
218
+ },
219
+ "outputs": [],
220
+ "source": [
221
+ "result"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "code",
226
+ "execution_count": null,
227
+ "id": "4c644a4b395fda08",
228
+ "metadata": {
229
+ "ExecuteTime": {
230
+ "end_time": "2025-09-15T08:23:46.545582Z",
231
+ "start_time": "2025-09-15T08:23:46.531998Z"
232
+ }
233
+ },
234
+ "outputs": [],
235
+ "source": [
236
+ "def retrieve_semantic_recommendations(\n",
237
+ " query: str,\n",
238
+ " top_k: int = 10,\n",
239
+ ") -> pd.DataFrame:\n",
240
+ " recs = db_books.similarity_search(query, k=50)\n",
241
+ "\n",
242
+ " books_list = []\n",
243
+ "\n",
244
+ " for i in range(0, len(recs)):\n",
245
+ " isbn_str = recs[i].page_content.strip('\"').split()[0]\n",
246
+ " books_list += [int(float(isbn_str))] # float() first, then int()\n",
247
+ "\n",
248
+ " return books[books[\"isbn13\"].isin(books_list)].head(top_k)"
249
+ ]
250
+ },
251
+ {
252
+ "cell_type": "code",
253
+ "execution_count": null,
254
+ "id": "b9eada846c702825",
255
+ "metadata": {
256
+ "ExecuteTime": {
257
+ "end_time": "2025-09-15T08:23:47.659278Z",
258
+ "start_time": "2025-09-15T08:23:47.501425Z"
259
+ }
260
+ },
261
+ "outputs": [],
262
+ "source": [
263
+ "retrieve_semantic_recommendations(\"A book to teach children about nature\")"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": null,
269
+ "id": "36d5bb5ac34f9b2d",
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": []
273
+ }
274
+ ],
275
+ "metadata": {
276
+ "kernelspec": {
277
+ "display_name": "Python 3",
278
+ "language": "python",
279
+ "name": "python3"
280
+ },
281
+ "language_info": {
282
+ "codemirror_mode": {
283
+ "name": "ipython",
284
+ "version": 2
285
+ },
286
+ "file_extension": ".py",
287
+ "mimetype": "text/x-python",
288
+ "name": "python",
289
+ "nbconvert_exporter": "python",
290
+ "pygments_lexer": "ipython2",
291
+ "version": "2.7.6"
292
+ }
293
+ },
294
+ "nbformat": 4,
295
+ "nbformat_minor": 5
296
+ }