DeathBlade020 committed on
Commit
ca9f575
·
verified ·
1 Parent(s): 78b4348

Upload 11 files

Browse files
app.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from dotenv import load_dotenv
4
+
5
+ from langchain_community.document_loaders import TextLoader
6
+ from langchain_openai import OpenAIEmbeddings
7
+ from langchain_text_splitters import CharacterTextSplitter
8
+ from langchain_community.vectorstores import Chroma
9
+
10
+
11
+ import gradio as gr
12
+ import os
13
+
14
# Load environment variables (e.g. OPENAI_API_KEY for OpenAIEmbeddings) from a .env file.
load_dotenv()

# Book metadata with precomputed emotion scores (joy, surprise, anger, fear, sadness).
books = pd.read_csv("books_with_emotions.csv")
# Ask for a larger cover image via the "&fife=w800" URL parameter. Rows whose
# thumbnail is NaN stay NaN here (NaN + str == NaN) and are replaced with the
# local placeholder image below.
books["large_thumbnail"] = books["thumbnail"] + "&fife=w800"
books["large_thumbnail"] = np.where(
    books["large_thumbnail"].isna(),
    "cover-not-found.jpg",
    books["large_thumbnail"],
)


# Directory where the Chroma vector store is persisted between runs.
persist_directory = "chroma_db_saved"
embedding = OpenAIEmbeddings()

# Reuse the on-disk vector store when it exists and is non-empty; otherwise
# build it once from the tagged descriptions and persist it for next time.
if os.path.exists(persist_directory) and os.listdir(persist_directory):
    print("🔄 Using existing Chroma DB from disk.")
    db_books = Chroma(persist_directory=persist_directory, embedding_function=embedding)
else:
    print("🆕 Creating new Chroma DB from documents.")
    # tagged_description.txt holds one entry per line; downstream code parses the
    # first whitespace-separated token of each document as an isbn13.
    raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()
    # chunk_size=0 with a newline separator keeps each line as its own document.
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=0, chunk_overlap=0)
    documents = text_splitter.split_documents(raw_documents)

    db_books = Chroma.from_documents(documents, embedding, persist_directory=persist_directory)
    # NOTE(review): with chromadb >= 0.4 persistence is automatic and persist()
    # is deprecated in newer langchain Chroma wrappers — confirm against the
    # pinned chromadb/langchain-community versions.
    db_books.persist()
40
+
41
def retrieve_semantic_recommendations(
    query: str,
    category: str = "All",
    tone: str = "All",
    initial_top_k: int = 50,
    final_top_k: int = 24,
) -> pd.DataFrame:
    """Return up to ``final_top_k`` book rows semantically matching ``query``.

    Args:
        query: Free-text description of the kind of book the user wants.
        category: A value of the ``simple_categories`` column, or "All" to
            disable category filtering.
        tone: One of "Happy", "Surprising", "Angry", "Suspenseful", "Sad",
            or "All"; a recognised tone re-ranks results by the matching
            emotion-score column.
        initial_top_k: Number of candidates fetched from the vector store
            before filtering.
        final_top_k: Maximum number of rows returned.

    Returns:
        A slice of the module-level ``books`` DataFrame.
    """
    recs = db_books.similarity_search(query, k=initial_top_k)
    # Each stored document begins with the book's isbn13, so the first
    # whitespace-separated token (after stripping CSV quoting) identifies it.
    matched_isbns = [int(rec.page_content.strip('"').split()[0]) for rec in recs]
    book_recs = books[books["isbn13"].isin(matched_isbns)]

    if category != "All":
        book_recs = book_recs[book_recs["simple_categories"] == category]
    book_recs = book_recs.head(final_top_k)

    # Map the UI tone label to the emotion-score column used for ranking.
    tone_to_column = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }
    emotion_column = tone_to_column.get(tone)
    if emotion_column is not None:
        # Sort into a new frame instead of ``inplace=True`` on a filtered
        # slice, which raises SettingWithCopyWarning and relies on undefined
        # chained-assignment behaviour in pandas.
        book_recs = book_recs.sort_values(by=emotion_column, ascending=False)

    return book_recs
70
+
71
+
72
def truncate_description(description, word_limit: int = 30) -> str:
    """Helper for recommend_books: first ``word_limit`` words plus an ellipsis.

    Descriptions read from CSV can be NaN (a float); the original code crashed
    on ``float.split()`` — return a placeholder instead.
    """
    if not isinstance(description, str):
        return "No description available..."
    return " ".join(description.split()[:word_limit]) + "..."


def format_authors_display(authors_raw) -> str:
    """Helper for recommend_books: render a semicolon-separated author field.

    "A;B" -> "A and B"; "A;B;C" -> "A, B, and C"; a single name is returned
    as-is; missing/non-string values become "Unknown".
    """
    if not isinstance(authors_raw, str) or not authors_raw.strip():
        return "Unknown"
    names = authors_raw.split(";")
    if len(names) == 2:
        return f"{names[0]} and {names[1]}"
    if len(names) > 2:
        return f"{', '.join(names[:-1])}, and {names[-1]}"
    return authors_raw


def recommend_books(
    query: str,
    category: str,
    tone: str
):
    """Build the gallery payload for the Gradio UI.

    Args:
        query: Free-text book description entered by the user.
        category: Selected category dropdown value ("All" disables filtering).
        tone: Selected emotional-tone dropdown value.

    Returns:
        A list of ``(image_url, caption)`` tuples consumable by gr.Gallery.
    """
    recommendations = retrieve_semantic_recommendations(query, category, tone)
    results = []

    for _, row in recommendations.iterrows():
        caption = (
            f"{row['title']} by {format_authors_display(row.get('authors', ''))}: "
            f"{truncate_description(row['description'])}"
        )
        results.append((row["large_thumbnail"], caption))
    return results
101
+
102
# Dropdown options: "All" disables filtering; real categories come from the data.
# dropna() guards against NaN categories — sorted() would otherwise raise a
# TypeError comparing float('nan') with strings.
categories = ["All"] + sorted(books["simple_categories"].dropna().unique())
tones = ["All"] + ["Happy", "Surprising", "Angry", "Suspenseful", "Sad"]

# Gradio dashboard: one query row (text + two dropdowns + button) feeding a
# gallery of (thumbnail, caption) recommendations.
with gr.Blocks() as dashboard:
    gr.Markdown("# Semantic book recommender")

    with gr.Row():
        user_query = gr.Textbox(label="Please enter a description of a book:",
                                placeholder="e.g., A story about forgiveness")
        category_dropdown = gr.Dropdown(choices=categories, label="Select a category:", value="All")
        tone_dropdown = gr.Dropdown(choices=tones, label="Select an emotional tone:", value="All")
        submit_button = gr.Button("Find recommendations")

    gr.Markdown("## Recommendations")
    output = gr.Gallery(label="Recommended books", columns=8, rows=3)

    submit_button.click(fn=recommend_books,
                        inputs=[user_query, category_dropdown, tone_dropdown],
                        outputs=output)


if __name__ == "__main__":
    # NOTE(review): share=True publishes a temporary public URL via Gradio's
    # tunnel — confirm that is intended for this deployment.
    dashboard.launch(share=True)
books_cleaned.csv ADDED
The diff for this file is too large to render. See raw diff
 
books_with_categories.csv ADDED
The diff for this file is too large to render. See raw diff
 
books_with_emotions.csv ADDED
The diff for this file is too large to render. See raw diff
 
cover-not-found.jpg ADDED
data_exploration.ipynb ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "176502f3",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "# Cell 1: Download the dataset from Kaggle using kagglehub\n",
11
+ "import kagglehub\n",
12
+ "\n",
13
+ "# Download latest version\n",
14
+ "path = kagglehub.dataset_download(\"dylanjcastillo/7k-books-with-metadata\")\n",
15
+ "\n",
16
+ "print(\"Path to dataset files:\", path)"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "2cddaba0",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "import pandas as pd\n",
27
+ "\n",
28
+ "path = r\"C:\\Users\\ravis\\.cache\\kagglehub\\datasets\\dylanjcastillo\\7k-books-with-metadata\\versions\\3\\books.csv\"\n",
29
+ "\n",
30
+ "books = pd.read_csv(path)"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "id": "0f0a325a",
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "books.head()"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": null,
46
+ "id": "64a81372",
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "# Cell 4: Add columns for missing descriptions and calculate the age of each book\n",
51
+ "import numpy as np\n",
52
+ "\n",
53
+ "books[\"missing_description\"] = np.where(books[\"description\"].isna(), 1, 0)\n",
54
+ "books[\"age_of_book\"] = 2023 - books[\"published_year\"]"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "id": "937762e4",
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "# Cell 5: Compute and visualize the Spearman correlation matrix for selected columns\n",
65
+ "columns = ['num_pages','age_of_book', 'missing_description', 'average_rating']\n",
66
+ "\n",
67
+ "correlation_matrix = books[columns].corr(method='spearman')\n",
68
+ "\n",
69
+ "import seaborn as sns\n",
70
+ "import matplotlib.pyplot as plt\n",
71
+ "\n",
72
+ "sns.set_theme(style=\"white\")\n",
73
+ "plt.figure(figsize=(8, 6))\n",
74
+ "\n",
75
+ "heatmap = sns.heatmap(correlation_matrix,\n",
76
+ " annot=True,\n",
77
+ " fmt=\".2f\",\n",
78
+ " cmap=\"coolwarm\",\n",
79
+ " cbar_kws={'label': 'Spearman Correlation'}\n",
80
+ " )\n",
81
+ "heatmap.set_title('Spearman Correlation Matrix', fontdict={'fontsize':16}, pad=12)\n",
82
+ "\n",
83
+ "plt.show()"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "id": "b071bcdd",
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "# Cell 6: Filter out rows with missing values in key columns\n",
94
+ "book_missing = books[\n",
95
+ " ~(books['description'].isna()) &\n",
96
+ " ~(books['num_pages'].isna()) &\n",
97
+ " ~(books['average_rating'].isna()) &\n",
98
+ " ~(books['published_year'].isna())\n",
99
+ " ]"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": null,
105
+ "id": "059ad7c0",
106
+ "metadata": {},
107
+ "outputs": [],
108
+ "source": [
109
+ "book_missing.shape"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": null,
115
+ "id": "56e5f02e",
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "book_missing['categories'].value_counts().reset_index().sort_values(\"count\", ascending=False)"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": null,
125
+ "id": "6637f49c",
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "# Cell 9: Add a column counting the number of words in each book's description\n",
130
+ "book_missing['words_in_description'] = book_missing['description'].str.split().str.len()"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "id": "406785b4",
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "book_missing.loc[book_missing['words_in_description'].between(1,4), 'description']"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "id": "c6edd620",
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "book_missing.loc[book_missing['words_in_description'].between(5,14), 'description']"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": null,
156
+ "id": "7b1d5305",
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": [
160
+ "book_missing.loc[book_missing['words_in_description'].between(15,24), 'description']"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": null,
166
+ "id": "44fc9f68",
167
+ "metadata": {},
168
+ "outputs": [],
169
+ "source": [
170
+ "book_missing.loc[book_missing['words_in_description'].between(25,34), 'description']"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": null,
176
+ "id": "62597c72",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "# Cell 14: Filter books with at least 25 words in the description and show the shape\n",
181
+ "book_missing_25_words = book_missing[book_missing['words_in_description'] >= 25]\n",
182
+ "book_missing_25_words.shape"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "id": "be102f7e",
189
+ "metadata": {},
190
+ "outputs": [],
191
+ "source": [
192
+ "# Cell 15: Create a new column combining title and subtitle (if available)\n",
193
+ "book_missing_25_words['title_and_subtitle'] = (\n",
194
+ " np.where(\n",
195
+ " book_missing_25_words['subtitle'].isna(), book_missing_25_words['title'],\n",
196
+ " book_missing_25_words[['title', 'subtitle']].astype(str).agg(': '.join, axis=1)\n",
197
+ " )\n",
198
+ ")"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": null,
204
+ "id": "d7fc57e4",
205
+ "metadata": {},
206
+ "outputs": [],
207
+ "source": [
208
+ "book_missing_25_words.head(4)"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": null,
214
+ "id": "1684a367",
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": [
218
+ "# Cell 17: Create a new column combining isbn13 and description for tagging\n",
219
+ "book_missing_25_words['tagged_description'] = book_missing_25_words[['isbn13', 'description']].astype(str).agg(' '.join, axis=1)"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "execution_count": null,
225
+ "id": "faf74e50",
226
+ "metadata": {},
227
+ "outputs": [],
228
+ "source": [
229
+ "book_missing_25_words.tagged_description"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": null,
235
+ "id": "ff617bea",
236
+ "metadata": {},
237
+ "outputs": [],
238
+ "source": [
239
+ "# Cell 19: Save the cleaned DataFrame to a CSV file, dropping some columns\n",
240
+ "(\n",
241
+ " book_missing_25_words\n",
242
+ " .drop([\"subtitle\", \"missing_description\", \"age_of_book\", \"words_in_description\"], axis=1)\n",
243
+ " .to_csv(\"books_cleaned.csv\", index = False)\n",
244
+ ")"
245
+ ]
246
+ }
247
+ ],
248
+ "metadata": {
249
+ "kernelspec": {
250
+ "display_name": "myenv",
251
+ "language": "python",
252
+ "name": "python3"
253
+ },
254
+ "language_info": {
255
+ "codemirror_mode": {
256
+ "name": "ipython",
257
+ "version": 3
258
+ },
259
+ "file_extension": ".py",
260
+ "mimetype": "text/x-python",
261
+ "name": "python",
262
+ "nbconvert_exporter": "python",
263
+ "pygments_lexer": "ipython3",
264
+ "version": "3.10.0"
265
+ }
266
+ },
267
+ "nbformat": 4,
268
+ "nbformat_minor": 5
269
+ }
requirements.txt ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohappyeyeballs==2.6.1
2
+ aiohttp==3.12.9
3
+ aiosignal==1.3.2
4
+ altair==5.5.0
5
+ annotated-types==0.7.0
6
+ anyio==4.8.0
7
+ asgiref==3.8.1
8
+ asttokens==3.0.0
9
+ async-timeout==4.0.3
10
+ attrs==25.3.0
11
+ backcall==0.2.0
12
+ backoff==2.2.1
13
+ bcrypt==4.3.0
14
+ beautifulsoup4==4.13.4
15
+ bleach==6.2.0
16
+ blinker==1.9.0
17
+ build==1.2.2.post1
18
+ cachetools==5.5.2
19
+ certifi==2025.1.31
20
+ cffi==1.17.1
21
+ charset-normalizer==3.4.1
22
+ chromadb==1.0.12
23
+ click==8.1.8
24
+ colorama==0.4.6
25
+ coloredlogs==15.0.1
26
+ comm==0.2.2
27
+ cryptography==44.0.2
28
+ dataclasses-json==0.6.7
29
+ debugpy==1.8.14
30
+ decorator==5.2.1
31
+ defusedxml==0.7.1
32
+ Deprecated==1.2.18
33
+ distro==1.9.0
34
+ dnspython==2.7.0
35
+ docopt==0.6.2
36
+ durationpy==0.10
37
+ ecdsa==0.19.1
38
+ exceptiongroup==1.2.2
39
+ executing==2.2.0
40
+ fastapi==0.115.12
41
+ fastjsonschema==2.21.1
42
+ filelock==3.18.0
43
+ flatbuffers==25.2.10
44
+ frozenlist==1.6.2
45
+ fsspec==2025.5.1
46
+ gitdb==4.0.12
47
+ GitPython==3.1.44
48
+ google-auth==2.40.3
49
+ googleapis-common-protos==1.70.0
50
+ greenlet==3.2.2
51
+ groq==0.26.0
52
+ grpcio==1.72.1
53
+ h11==0.16.0
54
+ httpcore==1.0.9
55
+ httptools==0.6.4
56
+ httpx==0.28.1
57
+ httpx-sse==0.4.0
58
+ huggingface-hub==0.32.4
59
+ humanfriendly==10.0
60
+ idna==3.10
61
+ importlib_metadata==8.7.0
62
+ importlib_resources==6.5.2
63
+ ipykernel==6.29.5
64
+ ipython==8.12.3
65
+ ipywidgets==8.1.7
66
+ jedi==0.19.2
67
+ Jinja2==3.1.6
68
+ jiter==0.10.0
69
+ joblib==1.5.1
70
+ jsonpatch==1.33
71
+ jsonpointer==3.0.0
72
+ jsonschema==4.24.0
73
+ jsonschema-specifications==2025.4.1
74
+ jupyter_client==8.6.3
75
+ jupyter_core==5.7.2
76
+ jupyterlab_pygments==0.3.0
77
+ jupyterlab_widgets==3.0.15
78
+ kubernetes==32.0.1
79
+ langchain==0.3.25
80
+ langchain-community==0.3.24
81
+ langchain-core==0.3.65
82
+ langchain-groq==0.3.2
83
+ langchain-openai==0.3.23
84
+ langchain-text-splitters==0.3.8
85
+ langchainhub==0.1.21
86
+ langsmith==0.3.45
87
+ limits==4.1
88
+ markdown-it-py==3.0.0
89
+ MarkupSafe==3.0.2
90
+ marshmallow==3.26.1
91
+ matplotlib-inline==0.1.7
92
+ mdurl==0.1.2
93
+ mistune==3.1.3
94
+ mmh3==5.1.0
95
+ mpmath==1.3.0
96
+ multidict==6.4.4
97
+ mypy_extensions==1.1.0
98
+ narwhals==1.41.0
99
+ nbclient==0.10.2
100
+ nbconvert==7.16.6
101
+ nbformat==5.10.4
102
+ nest-asyncio==1.6.0
103
+ networkx==3.4.2
104
+ numpy==2.2.5
105
+ oauthlib==3.2.2
106
+ onnxruntime==1.22.0
107
+ openai==1.82.1
108
+ opentelemetry-api==1.34.0
109
+ opentelemetry-exporter-otlp-proto-common==1.34.0
110
+ opentelemetry-exporter-otlp-proto-grpc==1.34.0
111
+ opentelemetry-instrumentation==0.55b0
112
+ opentelemetry-instrumentation-asgi==0.55b0
113
+ opentelemetry-instrumentation-fastapi==0.55b0
114
+ opentelemetry-proto==1.34.0
115
+ opentelemetry-sdk==1.34.0
116
+ opentelemetry-semantic-conventions==0.55b0
117
+ opentelemetry-util-http==0.55b0
118
+ orjson==3.10.18
119
+ overrides==7.7.0
120
+ packaging==24.2
121
+ pandas==2.3.0
122
+ pandocfilters==1.5.1
123
+ parso==0.8.4
124
+ pickleshare==0.7.5
125
+ pillow==11.2.1
126
+ pipreqs==0.5.0
127
+ platformdirs==4.3.7
128
+ posthog==4.4.0
129
+ prompt_toolkit==3.0.51
130
+ propcache==0.3.1
131
+ protobuf==5.29.5
132
+ psutil==7.0.0
133
+ psycopg==3.2.9
134
+ psycopg-binary==3.2.9
135
+ pure_eval==0.2.3
136
+ pyarrow==20.0.0
137
+ pyasn1==0.6.1
138
+ pyasn1_modules==0.4.2
139
+ pycparser==2.22
140
+ pydantic==2.10.6
141
+ pydantic-settings==2.8.1
142
+ pydantic_core==2.27.2
143
+ pydeck==0.9.1
144
+ Pygments==2.19.1
145
+ PyJWT==2.10.1
146
+ pymongo==4.11.2
147
+ PyPika==0.48.9
148
+ pyproject_hooks==1.2.0
149
+ pyreadline3==3.5.4
150
+ python-dateutil==2.9.0.post0
151
+ python-dotenv==1.0.1
152
+ python-jose==3.4.0
153
+ pytz==2025.1
154
+ pywin32==310
155
+ PyYAML==6.0.2
156
+ pyzmq==26.4.0
157
+ referencing==0.36.2
158
+ regex==2024.11.6
159
+ requests==2.32.3
160
+ requests-oauthlib==2.0.0
161
+ requests-toolbelt==1.0.0
162
+ rich==14.0.0
163
+ rpds-py==0.25.1
164
+ rsa==4.9
165
+ safetensors==0.5.3
166
+ scikit-learn==1.7.0
167
+ scipy==1.15.3
168
+ sentence-transformers==4.1.0
169
+ shellingham==1.5.4
170
+ six==1.17.0
171
+ slowapi==0.1.9
172
+ smmap==5.0.2
173
+ sniffio==1.3.1
174
+ soupsieve==2.7
175
+ SQLAlchemy==2.0.41
176
+ sqlmodel==0.0.24
177
+ stack-data==0.6.3
178
+ starlette==0.45.3
179
+ streamlit==1.45.1
180
+ sympy==1.14.0
181
+ tenacity==9.1.2
182
+ threadpoolctl==3.6.0
183
+ tiktoken==0.9.0
184
+ tinycss2==1.4.0
185
+ tokenizers==0.21.1
186
+ toml==0.10.2
187
+ tomli==2.2.1
188
+ torch==2.7.1
189
+ tornado==6.4.2
190
+ tqdm==4.67.1
191
+ traitlets==5.14.3
192
+ transformers==4.52.4
193
+ typer==0.16.0
194
+ types-requests==2.32.4.20250611
195
+ typing-inspect==0.9.0
196
+ typing-inspection==0.4.0
197
+ typing_extensions==4.12.2
198
+ tzdata==2025.2
199
+ urllib3==2.4.0
200
+ uvicorn==0.34.0
201
+ watchdog==6.0.0
202
+ watchfiles==1.0.5
203
+ wcwidth==0.2.13
204
+ webencodings==0.5.1
205
+ websocket-client==1.8.0
206
+ websockets==15.0.1
207
+ widgetsnbextension==4.0.14
208
+ wrapt==1.17.2
209
+ yarg==0.1.9
210
+ yarl==1.20.0
211
+ zipp==3.23.0
212
+ zstandard==0.23.0
sentiment_analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
tagged_description.txt ADDED
The diff for this file is too large to render. See raw diff
 
text_classification.ipynb ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "063cfaa8",
7
+ "metadata": {
8
+ "id": "063cfaa8"
9
+ },
10
+ "outputs": [],
11
+ "source": [
12
+ "import pandas as pd"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": null,
18
+ "id": "1c81ca58",
19
+ "metadata": {
20
+ "id": "1c81ca58"
21
+ },
22
+ "outputs": [],
23
+ "source": [
24
+ "books = pd.read_csv('books_cleaned.csv')"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "id": "8244b265",
31
+ "metadata": {
32
+ "colab": {
33
+ "base_uri": "https://localhost:8080/",
34
+ "height": 423
35
+ },
36
+ "id": "8244b265",
37
+ "outputId": "0af624a8-1577-4c66-e252-a29479ad8446"
38
+ },
39
+ "outputs": [],
40
+ "source": [
41
+ "books['categories'].value_counts().reset_index()"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "id": "aefb7553",
48
+ "metadata": {
49
+ "colab": {
50
+ "base_uri": "https://localhost:8080/",
51
+ "height": 425
52
+ },
53
+ "id": "aefb7553",
54
+ "outputId": "5aad706e-7f5c-4704-dc31-3fbb00a8da9c"
55
+ },
56
+ "outputs": [],
57
+ "source": [
58
+ "books['categories'].value_counts().reset_index().query('count > 50')"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "id": "f8635069",
65
+ "metadata": {
66
+ "id": "f8635069"
67
+ },
68
+ "outputs": [],
69
+ "source": [
70
+ "category_mapping = {\n",
71
+ " 'Fiction' : \"Fiction\",\n",
72
+ " 'Juvenile Fiction': \"Children's Fiction\",\n",
73
+ " 'Biography & Autobiography': \"Nonfiction\",\n",
74
+ " 'History': \"Nonfiction\",\n",
75
+ " 'Literary Criticism': \"Nonfiction\",\n",
76
+ " 'Philosophy': \"Nonfiction\",\n",
77
+ " 'Religion': \"Nonfiction\",\n",
78
+ " 'Comics & Graphic Novels': \"Fiction\",\n",
79
+ " 'Drama': \"Fiction\",\n",
80
+ " 'Juvenile Nonfiction': \"Children's Nonfiction\",\n",
81
+ " 'Science': \"Nonfiction\",\n",
82
+ " 'Poetry': \"Fiction\"\n",
83
+ " }\n",
84
+ "\n",
85
+ "books['simple_categories'] = books['categories'].map(category_mapping)"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "id": "7f1a4097",
92
+ "metadata": {
93
+ "colab": {
94
+ "base_uri": "https://localhost:8080/"
95
+ },
96
+ "id": "7f1a4097",
97
+ "outputId": "3a4a95b5-c920-4d85-a62d-8ca302670df1"
98
+ },
99
+ "outputs": [],
100
+ "source": [
101
+ "books[~(books['simple_categories'].isna())].shape"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": null,
107
+ "id": "09433430",
108
+ "metadata": {
109
+ "colab": {
110
+ "base_uri": "https://localhost:8080/"
111
+ },
112
+ "id": "09433430",
113
+ "outputId": "b8840759-5f44-4dc0-81a1-63c70b489653"
114
+ },
115
+ "outputs": [],
116
+ "source": [
117
+ "from transformers import pipeline\n",
118
+ "fiction_categories = [\n",
119
+ " \"Fiction\",\n",
120
+ " \"Nonfiction\"]\n",
121
+ "pipe = pipeline(\"zero-shot-classification\",\n",
122
+ " model=\"facebook/bart-large-mnli\",\n",
123
+ " device=\"cuda\")"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "id": "a44f3e64",
130
+ "metadata": {
131
+ "id": "a44f3e64"
132
+ },
133
+ "outputs": [],
134
+ "source": [
135
+ "sequence = books.loc[books['simple_categories']==\"Fiction\", 'description'].reset_index(drop=True)[0]"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "id": "3e3ff995",
142
+ "metadata": {
143
+ "colab": {
144
+ "base_uri": "https://localhost:8080/"
145
+ },
146
+ "id": "3e3ff995",
147
+ "outputId": "00ea0a49-3a9e-4d24-a621-92fb84de7d92"
148
+ },
149
+ "outputs": [],
150
+ "source": [
151
+ "pipe(sequence,fiction_categories)"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": null,
157
+ "id": "16b259eb",
158
+ "metadata": {
159
+ "colab": {
160
+ "base_uri": "https://localhost:8080/",
161
+ "height": 36
162
+ },
163
+ "id": "16b259eb",
164
+ "outputId": "3d8a6725-f246-49c9-cfce-ebd4c6e79151"
165
+ },
166
+ "outputs": [],
167
+ "source": [
168
+ "import numpy as np\n",
169
+ "max_index = np.argmax(pipe(sequence,fiction_categories)['scores'])\n",
170
+ "max_label = pipe(sequence,fiction_categories)['labels'][max_index]\n",
171
+ "\n",
172
+ "max_label"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": null,
178
+ "id": "bd1a160f",
179
+ "metadata": {
180
+ "id": "bd1a160f"
181
+ },
182
+ "outputs": [],
183
+ "source": [
184
+ "def generate_predictions(sequence, categories):\n",
185
+ " results = pipe(sequence, categories)\n",
186
+ " max_index = np.argmax(results['scores'])\n",
187
+ " max_label = results['labels'][max_index]\n",
188
+ " return max_label"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "id": "4945125a",
195
+ "metadata": {
196
+ "colab": {
197
+ "base_uri": "https://localhost:8080/"
198
+ },
199
+ "id": "4945125a",
200
+ "outputId": "bbd1a877-0c02-4860-d289-201ac5ff6f41"
201
+ },
202
+ "outputs": [],
203
+ "source": [
204
+ "from tqdm import tqdm\n",
205
+ "\n",
206
+ "actual_cats = []\n",
207
+ "predicted_cats = []\n",
208
+ "\n",
209
+ "for i in tqdm(range(0, 300)):\n",
210
+ " sequence = books.loc[books['simple_categories']==\"Fiction\", 'description'].reset_index(drop=True)[i]\n",
211
+ " predicted_cats.append(generate_predictions(sequence, fiction_categories))\n",
212
+ " actual_cats.append(\"Fiction\")"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": null,
218
+ "id": "efd30e84",
219
+ "metadata": {
220
+ "colab": {
221
+ "base_uri": "https://localhost:8080/"
222
+ },
223
+ "id": "efd30e84",
224
+ "outputId": "462d4f63-d2cd-4dc8-de04-eb9da6c55b20"
225
+ },
226
+ "outputs": [],
227
+ "source": [
228
+ "for i in tqdm(range(0, 300)):\n",
229
+ " sequence = books.loc[books['simple_categories']==\"Nonfiction\", 'description'].reset_index(drop=True)[i]\n",
230
+ " predicted_cats.append(generate_predictions(sequence, fiction_categories))\n",
231
+ " actual_cats.append(\"Nonfiction\")"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": null,
237
+ "id": "34322614",
238
+ "metadata": {
239
+ "id": "34322614"
240
+ },
241
+ "outputs": [],
242
+ "source": [
243
+ "predictions_df = pd.DataFrame({\n",
244
+ " 'actual': actual_cats,\n",
245
+ " 'predicted': predicted_cats\n",
246
+ "})"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": null,
252
+ "id": "fc41ebe7",
253
+ "metadata": {
254
+ "id": "fc41ebe7"
255
+ },
256
+ "outputs": [],
257
+ "source": [
258
+ "predictions_df['correct_prediction'] = (\n",
259
+ " np.where(predictions_df['actual'] == predictions_df['predicted'], 1, 0)\n",
260
+ ")"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": null,
266
+ "id": "325834c0",
267
+ "metadata": {
268
+ "colab": {
269
+ "base_uri": "https://localhost:8080/"
270
+ },
271
+ "id": "325834c0",
272
+ "outputId": "479b349e-c6e4-49cf-eda1-107fe595b57c"
273
+ },
274
+ "outputs": [],
275
+ "source": [
276
+ "predictions_df['correct_prediction'].sum() / predictions_df.shape[0]"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": null,
282
+ "id": "4f3834ac",
283
+ "metadata": {
284
+ "id": "4f3834ac"
285
+ },
286
+ "outputs": [],
287
+ "source": [
288
+ "isbns = []\n",
289
+ "predicted_cats = []\n",
290
+ "missing_cats = books.loc[books['simple_categories'].isna(), ['isbn13', \"description\"]].reset_index(drop=True)"
291
+ ]
292
+ },
293
+ {
294
+ "cell_type": "code",
295
+ "execution_count": null,
296
+ "id": "38a9529a",
297
+ "metadata": {
298
+ "colab": {
299
+ "base_uri": "https://localhost:8080/"
300
+ },
301
+ "id": "38a9529a",
302
+ "outputId": "190e56ad-fa21-4b98-fcb2-7896197bc349"
303
+ },
304
+ "outputs": [],
305
+ "source": [
306
+ "for i in tqdm(range(0, missing_cats.shape[0])):\n",
307
+ " sequence = missing_cats['description'][i]\n",
308
+ " pred = generate_predictions(sequence, fiction_categories)\n",
309
+ " predicted_cats.append(pred)\n",
310
+ " isbns.append(missing_cats['isbn13'][i])\n"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": null,
316
+ "id": "d76c7c58",
317
+ "metadata": {
318
+ "id": "d76c7c58"
319
+ },
320
+ "outputs": [],
321
+ "source": [
322
+ "missing_predictions_df = pd.DataFrame({\n",
323
+ " 'isbn13': isbns,\n",
324
+ " 'predicted': predicted_cats\n",
325
+ "})"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": null,
331
+ "id": "6e83ce7e",
332
+ "metadata": {
333
+ "id": "6e83ce7e"
334
+ },
335
+ "outputs": [],
336
+ "source": [
337
+ "missing_predictions_df"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": null,
343
+ "id": "17fbb19c",
344
+ "metadata": {
345
+ "colab": {
346
+ "base_uri": "https://localhost:8080/"
347
+ },
348
+ "id": "17fbb19c",
349
+ "outputId": "dfc3c2ef-0e99-4d76-f3eb-aca138c6782f"
350
+ },
351
+ "outputs": [],
352
+ "source": [
353
+ "books.columns"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "execution_count": null,
359
+ "id": "f0060e11",
360
+ "metadata": {
361
+ "id": "f0060e11"
362
+ },
363
+ "outputs": [],
364
+ "source": [
365
+ "asdf"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": null,
371
+ "id": "c5c6ad46",
372
+ "metadata": {
373
+ "id": "c5c6ad46"
374
+ },
375
+ "outputs": [],
376
+ "source": [
377
+ "books = pd.merge(books, missing_predictions_df, on='isbn13', how='left')\n",
378
+ "books['simple_categories'] = np.where(books['simple_categories'].isna(),books['predicted'], books['simple_categories'])\n",
379
+ "books = books.drop(columns=['predicted'])\n"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "execution_count": null,
385
+ "id": "83beb18f",
386
+ "metadata": {
387
+ "colab": {
388
+ "base_uri": "https://localhost:8080/",
389
+ "height": 271
390
+ },
391
+ "id": "83beb18f",
392
+ "outputId": "e7c6e296-748c-4f12-d818-e16b75083bff"
393
+ },
394
+ "outputs": [],
395
+ "source": [
396
+ "books.head(2)"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "code",
401
+ "execution_count": null,
402
+ "id": "1e8e9374",
403
+ "metadata": {
404
+ "id": "1e8e9374"
405
+ },
406
+ "outputs": [],
407
+ "source": [
408
+ "books[books[\"categories\"].str.lower().isin([\n",
409
+ " \"romance\",\n",
410
+ " \"science fiction\",\n",
411
+ " \"scifi\",\n",
412
+ " \"fantasy\",\n",
413
+ " \"horror\",\n",
414
+ " \"mystery\",\n",
415
+ " \"thriller\",\n",
416
+ " \"comedy\",\n",
417
+ " \"crime\",\n",
418
+ " \"historical\"\n",
419
+ "])]"
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": null,
425
+ "id": "DA0gYVkklR1e",
426
+ "metadata": {
427
+ "id": "DA0gYVkklR1e"
428
+ },
429
+ "outputs": [],
430
+ "source": [
431
+ "books.to_csv(\"books_with_categories.csv\", index=False)"
432
+ ]
433
+ }
434
+ ],
435
+ "metadata": {
436
+ "accelerator": "GPU",
437
+ "colab": {
438
+ "gpuType": "T4",
439
+ "provenance": []
440
+ },
441
+ "kernelspec": {
442
+ "display_name": "Python 3",
443
+ "name": "python3"
444
+ },
445
+ "language_info": {
446
+ "codemirror_mode": {
447
+ "name": "ipython",
448
+ "version": 3
449
+ },
450
+ "file_extension": ".py",
451
+ "mimetype": "text/x-python",
452
+ "name": "python",
453
+ "nbconvert_exporter": "python",
454
+ "pygments_lexer": "ipython3",
455
+ "version": "3.10.0"
456
+ }
457
+ },
458
+ "nbformat": 4,
459
+ "nbformat_minor": 5
460
+ }
vector_search.ipynb ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "8c67e153",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from langchain_community.document_loaders import TextLoader\n",
11
+ "from langchain_text_splitters import CharacterTextSplitter\n",
12
+ "from langchain_openai import OpenAIEmbeddings\n",
13
+ "from langchain_chroma import Chroma"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "id": "4e0a1fdc",
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "from dotenv import load_dotenv\n",
24
+ "load_dotenv()"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "id": "55bc2ba8",
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "import pandas as pd\n",
35
+ "\n",
36
+ "books = pd.read_csv(\"books_cleaned.csv\")"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": null,
42
+ "id": "28b3e45d",
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "books.shape"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "id": "331dede4",
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "books['tagged_description'].to_csv('tagged_description.txt', index=False, sep='\\n', header=False)"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "id": "47b0b3c2",
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "import os\n",
67
+ "\n",
68
+ "file_path = \"tagged_description.txt\"\n",
69
+ "\n",
70
+ "raw_documents = TextLoader(file_path, encoding=\"utf-8\").load()\n",
71
+ "text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator=\"\\n\")\n",
72
+ "documents = text_splitter.split_documents(raw_documents)"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "id": "2fc263ba",
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "documents[0]"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": null,
88
+ "id": "427fd244",
89
+ "metadata": {},
90
+ "outputs": [],
91
+ "source": [
92
+ "db_books = Chroma.from_documents(\n",
93
+ " documents,\n",
94
+ " OpenAIEmbeddings(),\n",
95
+ ")"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "id": "5f4476a6",
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "query = \"A book to teach children about nature\"\n",
106
+ "docs = db_books.similarity_search(query, k=3)\n",
107
+ "docs"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "id": "faaf7618",
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "books[books['isbn13'] == int(docs[0].page_content.split(\" \")[0].strip())]"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": null,
123
+ "id": "3a893785",
124
+ "metadata": {},
125
+ "outputs": [],
126
+ "source": [
127
+ "def retrieve_semantic_recommendations(query, top_k=3) -> pd.DataFrame:\n",
128
+ " recs = db_books.similarity_search(query, k=top_k)\n",
129
+ "\n",
130
+ " books_list = []\n",
131
+ " for i in range(0, len(recs)):\n",
132
+ " books_list += [int(recs[i].page_content.strip('\"').split()[0].strip())]\n",
133
+ " return books[books['isbn13'].isin(books_list)].head(top_k)"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": null,
139
+ "id": "6a557732",
140
+ "metadata": {},
141
+ "outputs": [],
142
+ "source": [
143
+ "retrieve_semantic_recommendations(query)"
144
+ ]
145
+ }
146
+ ],
147
+ "metadata": {
148
+ "kernelspec": {
149
+ "display_name": "myenv",
150
+ "language": "python",
151
+ "name": "python3"
152
+ },
153
+ "language_info": {
154
+ "codemirror_mode": {
155
+ "name": "ipython",
156
+ "version": 3
157
+ },
158
+ "file_extension": ".py",
159
+ "mimetype": "text/x-python",
160
+ "name": "python",
161
+ "nbconvert_exporter": "python",
162
+ "pygments_lexer": "ipython3",
163
+ "version": "3.10.0"
164
+ }
165
+ },
166
+ "nbformat": 4,
167
+ "nbformat_minor": 5
168
+ }