Spaces:

DeathBlade020
/

MoodBasedBookSearch

Sleeping

App Files Files Community

DeathBlade020 committed on Jun 22, 2025

Commit

ca9f575

verified ·

1 Parent(s): 78b4348

Upload 11 files

Browse files

Files changed (11) hide show

app.py +124 -0
books_cleaned.csv +0 -0
books_with_categories.csv +0 -0
books_with_emotions.csv +0 -0
cover-not-found.jpg +0 -0
data_exploration.ipynb +269 -0
requirements.txt +212 -0
sentiment_analysis.ipynb +0 -0
tagged_description.txt +0 -0
text_classification.ipynb +460 -0
vector_search.ipynb +168 -0

app.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import pandas as pd
+import numpy as np
+from dotenv import load_dotenv
+from langchain_community.document_loaders import TextLoader
+from langchain_openai import OpenAIEmbeddings
+from langchain_text_splitters import CharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+import gradio as gr
+import os
+load_dotenv()
+books = pd.read_csv("books_with_emotions.csv")
+books["large_thumbnail"] = books["thumbnail"] + "&fife=w800"
+books["large_thumbnail"] = np.where(
+    books["large_thumbnail"].isna(),
+    "cover-not-found.jpg",
+    books["large_thumbnail"],
+)
+persist_directory = "chroma_db_saved"
+embedding = OpenAIEmbeddings()
+if os.path.exists(persist_directory) and os.listdir(persist_directory):
+    print("🔄 Using existing Chroma DB from disk.")
+    db_books = Chroma(persist_directory=persist_directory, embedding_function=embedding)
+else:
+    print("🆕 Creating new Chroma DB from documents.")
+    raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()
+    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=0, chunk_overlap=0)
+    documents = text_splitter.split_documents(raw_documents)
+    db_books = Chroma.from_documents(documents, embedding, persist_directory=persist_directory)
+    db_books.persist()
+def retrieve_semantic_recommendations(
+        query: str,
+        category: str = "All",
+        tone: str = "All",
+        initial_top_k: int = 50,
+        final_top_k: int = 24,
+) -> pd.DataFrame:
+    recs = db_books.similarity_search(query, k=initial_top_k)
+    books_list = [int(rec.page_content.strip('"').split()[0]) for rec in recs]
+    book_recs = books[books["isbn13"].isin(books_list)].head(initial_top_k)
+    if category != "All":
+        book_recs = book_recs[book_recs["simple_categories"] == category].head(final_top_k)
+    else:
+        book_recs = book_recs.head(final_top_k)
+    if tone == "Happy":
+        book_recs.sort_values(by="joy", ascending=False, inplace=True)
+    elif tone == "Surprising":
+        book_recs.sort_values(by="surprise", ascending=False, inplace=True)
+    elif tone == "Angry":
+        book_recs.sort_values(by="anger", ascending=False, inplace=True)
+    elif tone == "Suspenseful":
+        book_recs.sort_values(by="fear", ascending=False, inplace=True)
+    elif tone == "Sad":
+        book_recs.sort_values(by="sadness", ascending=False, inplace=True)
+    return book_recs
+def recommend_books(
+        query: str,
+        category: str,
+        tone: str
+):
+    recommendations = retrieve_semantic_recommendations(query, category, tone)
+    results = []
+    for _, row in recommendations.iterrows():
+        description = row["description"]
+        truncated_desc_split = description.split()
+        truncated_description = " ".join(truncated_desc_split[:30]) + "..."
+        authors_raw = row.get("authors", "")
+        if not isinstance(authors_raw, str):
+            authors_raw = "Unknown"
+        authors_split = authors_raw.split(";")
+        if len(authors_split) == 2:
+            authors_str = f"{authors_split[0]} and {authors_split[1]}"
+        elif len(authors_split) > 2:
+            authors_str = f"{', '.join(authors_split[:-1])}, and {authors_split[-1]}"
+        else:
+            authors_str = authors_raw
+        caption = f"{row['title']} by {authors_str}: {truncated_description}"
+        results.append((row["large_thumbnail"], caption))
+    return results
+categories = ["All"] + sorted(books["simple_categories"].unique())
+tones = ["All"] + ["Happy", "Surprising", "Angry", "Suspenseful", "Sad"]
+with gr.Blocks() as dashboard:
+    gr.Markdown("# Semantic book recommender")
+    with gr.Row():
+        user_query = gr.Textbox(label = "Please enter a description of a book:",
+                                placeholder = "e.g., A story about forgiveness")
+        category_dropdown = gr.Dropdown(choices = categories, label = "Select a category:", value = "All")
+        tone_dropdown = gr.Dropdown(choices = tones, label = "Select an emotional tone:", value = "All")
+        submit_button = gr.Button("Find recommendations")
+    gr.Markdown("## Recommendations")
+    output = gr.Gallery(label = "Recommended books", columns = 8, rows = 3)
+    submit_button.click(fn = recommend_books,
+                        inputs = [user_query, category_dropdown, tone_dropdown],
+                        outputs = output)
+if __name__ == "__main__":
+    dashboard.launch(share = True)

books_cleaned.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

books_with_categories.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

books_with_emotions.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

cover-not-found.jpg ADDED Viewed

data_exploration.ipynb ADDED Viewed

	@@ -0,0 +1,269 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "176502f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 1: Download the dataset from Kaggle using kagglehub\n",
+    "import kagglehub\n",
+    "\n",
+    "# Download latest version\n",
+    "path = kagglehub.dataset_download(\"dylanjcastillo/7k-books-with-metadata\")\n",
+    "\n",
+    "print(\"Path to dataset files:\", path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2cddaba0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "path = r\"C:\\Users\\ravis\\.cache\\kagglehub\\datasets\\dylanjcastillo\\7k-books-with-metadata\\versions\\3\\books.csv\"\n",
+    "\n",
+    "books = pd.read_csv(path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f0a325a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "books.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "64a81372",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 4: Add columns for missing descriptions and calculate the age of each book\n",
+    "import numpy as np\n",
+    "\n",
+    "books[\"missing_description\"] = np.where(books[\"description\"].isna(), 1, 0)\n",
+    "books[\"age_of_book\"] = 2023 - books[\"published_year\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "937762e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 5: Compute and visualize the Spearman correlation matrix for selected columns\n",
+    "columns = ['num_pages','age_of_book', 'missing_description', 'average_rating']\n",
+    "\n",
+    "correlation_matrix = books[columns].corr(method='spearman')\n",
+    "\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "sns.set_theme(style=\"white\")\n",
+    "plt.figure(figsize=(8, 6))\n",
+    "\n",
+    "heatmap = sns.heatmap(correlation_matrix,\n",
+    "                      annot=True,\n",
+    "                      fmt=\".2f\",\n",
+    "                      cmap=\"coolwarm\",\n",
+    "                      cbar_kws={'label': 'Spearman Correlation'}\n",
+    "                      )\n",
+    "heatmap.set_title('Spearman Correlation Matrix', fontdict={'fontsize':16}, pad=12)\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b071bcdd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 6: Filter out rows with missing values in key columns\n",
+    "book_missing = books[\n",
+    "        ~(books['description'].isna()) &\n",
+    "        ~(books['num_pages'].isna()) &\n",
+    "        ~(books['average_rating'].isna()) &\n",
+    "        ~(books['published_year'].isna())\n",
+    "    ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "059ad7c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "book_missing.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "56e5f02e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "book_missing['categories'].value_counts().reset_index().sort_values(\"count\", ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6637f49c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 9: Add a column counting the number of words in each book's description\n",
+    "book_missing['words_in_description'] = book_missing['description'].str.split().str.len()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "406785b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "book_missing.loc[book_missing['words_in_description'].between(1,4), 'description']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6edd620",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "book_missing.loc[book_missing['words_in_description'].between(5,14), 'description']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7b1d5305",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "book_missing.loc[book_missing['words_in_description'].between(15,24), 'description']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "44fc9f68",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "book_missing.loc[book_missing['words_in_description'].between(25,34), 'description']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "62597c72",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 14: Filter books with at least 25 words in the description and show the shape\n",
+    "book_missing_25_words = book_missing[book_missing['words_in_description'] >= 25]\n",
+    "book_missing_25_words.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be102f7e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 15: Create a new column combining title and subtitle (if available)\n",
+    "book_missing_25_words['title_and_subtitle'] = (\n",
+    "    np.where(\n",
+    "        book_missing_25_words['subtitle'].isna(), book_missing_25_words['title'],\n",
+    "        book_missing_25_words[['title', 'subtitle']].astype(str).agg(': '.join, axis=1)\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d7fc57e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "book_missing_25_words.head(4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1684a367",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 17: Create a new column combining isbn13 and description for tagging\n",
+    "book_missing_25_words['tagged_description'] = book_missing_25_words[['isbn13', 'description']].astype(str).agg(' '.join, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "faf74e50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "book_missing_25_words.tagged_description"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ff617bea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cell 19: Save the cleaned DataFrame to a CSV file, dropping some columns\n",
+    "(\n",
+    "    book_missing_25_words\n",
+    "    .drop([\"subtitle\", \"missing_description\", \"age_of_book\", \"words_in_description\"], axis=1)\n",
+    "    .to_csv(\"books_cleaned.csv\", index = False)\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "myenv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,212 @@

+aiohappyeyeballs==2.6.1
+aiohttp==3.12.9
+aiosignal==1.3.2
+altair==5.5.0
+annotated-types==0.7.0
+anyio==4.8.0
+asgiref==3.8.1
+asttokens==3.0.0
+async-timeout==4.0.3
+attrs==25.3.0
+backcall==0.2.0
+backoff==2.2.1
+bcrypt==4.3.0
+beautifulsoup4==4.13.4
+bleach==6.2.0
+blinker==1.9.0
+build==1.2.2.post1
+cachetools==5.5.2
+certifi==2025.1.31
+cffi==1.17.1
+charset-normalizer==3.4.1
+chromadb==1.0.12
+click==8.1.8
+colorama==0.4.6
+coloredlogs==15.0.1
+comm==0.2.2
+cryptography==44.0.2
+dataclasses-json==0.6.7
+debugpy==1.8.14
+decorator==5.2.1
+defusedxml==0.7.1
+Deprecated==1.2.18
+distro==1.9.0
+dnspython==2.7.0
+docopt==0.6.2
+durationpy==0.10
+ecdsa==0.19.1
+exceptiongroup==1.2.2
+executing==2.2.0
+fastapi==0.115.12
+fastjsonschema==2.21.1
+filelock==3.18.0
+flatbuffers==25.2.10
+frozenlist==1.6.2
+fsspec==2025.5.1
+gitdb==4.0.12
+GitPython==3.1.44
+google-auth==2.40.3
+googleapis-common-protos==1.70.0
+greenlet==3.2.2
+groq==0.26.0
+grpcio==1.72.1
+h11==0.16.0
+httpcore==1.0.9
+httptools==0.6.4
+httpx==0.28.1
+httpx-sse==0.4.0
+huggingface-hub==0.32.4
+humanfriendly==10.0
+idna==3.10
+importlib_metadata==8.7.0
+importlib_resources==6.5.2
+ipykernel==6.29.5
+ipython==8.12.3
+ipywidgets==8.1.7
+jedi==0.19.2
+Jinja2==3.1.6
+jiter==0.10.0
+joblib==1.5.1
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.24.0
+jsonschema-specifications==2025.4.1
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyterlab_pygments==0.3.0
+jupyterlab_widgets==3.0.15
+kubernetes==32.0.1
+langchain==0.3.25
+langchain-community==0.3.24
+langchain-core==0.3.65
+langchain-groq==0.3.2
+langchain-openai==0.3.23
+langchain-text-splitters==0.3.8
+langchainhub==0.1.21
+langsmith==0.3.45
+limits==4.1
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+marshmallow==3.26.1
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mistune==3.1.3
+mmh3==5.1.0
+mpmath==1.3.0
+multidict==6.4.4
+mypy_extensions==1.1.0
+narwhals==1.41.0
+nbclient==0.10.2
+nbconvert==7.16.6
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.2
+numpy==2.2.5
+oauthlib==3.2.2
+onnxruntime==1.22.0
+openai==1.82.1
+opentelemetry-api==1.34.0
+opentelemetry-exporter-otlp-proto-common==1.34.0
+opentelemetry-exporter-otlp-proto-grpc==1.34.0
+opentelemetry-instrumentation==0.55b0
+opentelemetry-instrumentation-asgi==0.55b0
+opentelemetry-instrumentation-fastapi==0.55b0
+opentelemetry-proto==1.34.0
+opentelemetry-sdk==1.34.0
+opentelemetry-semantic-conventions==0.55b0
+opentelemetry-util-http==0.55b0
+orjson==3.10.18
+overrides==7.7.0
+packaging==24.2
+pandas==2.3.0
+pandocfilters==1.5.1
+parso==0.8.4
+pickleshare==0.7.5
+pillow==11.2.1
+pipreqs==0.5.0
+platformdirs==4.3.7
+posthog==4.4.0
+prompt_toolkit==3.0.51
+propcache==0.3.1
+protobuf==5.29.5
+psutil==7.0.0
+psycopg==3.2.9
+psycopg-binary==3.2.9
+pure_eval==0.2.3
+pyarrow==20.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.2
+pycparser==2.22
+pydantic==2.10.6
+pydantic-settings==2.8.1
+pydantic_core==2.27.2
+pydeck==0.9.1
+Pygments==2.19.1
+PyJWT==2.10.1
+pymongo==4.11.2
+PyPika==0.48.9
+pyproject_hooks==1.2.0
+pyreadline3==3.5.4
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-jose==3.4.0
+pytz==2025.1
+pywin32==310
+PyYAML==6.0.2
+pyzmq==26.4.0
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+requests-oauthlib==2.0.0
+requests-toolbelt==1.0.0
+rich==14.0.0
+rpds-py==0.25.1
+rsa==4.9
+safetensors==0.5.3
+scikit-learn==1.7.0
+scipy==1.15.3
+sentence-transformers==4.1.0
+shellingham==1.5.4
+six==1.17.0
+slowapi==0.1.9
+smmap==5.0.2
+sniffio==1.3.1
+soupsieve==2.7
+SQLAlchemy==2.0.41
+sqlmodel==0.0.24
+stack-data==0.6.3
+starlette==0.45.3
+streamlit==1.45.1
+sympy==1.14.0
+tenacity==9.1.2
+threadpoolctl==3.6.0
+tiktoken==0.9.0
+tinycss2==1.4.0
+tokenizers==0.21.1
+toml==0.10.2
+tomli==2.2.1
+torch==2.7.1
+tornado==6.4.2
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.52.4
+typer==0.16.0
+types-requests==2.32.4.20250611
+typing-inspect==0.9.0
+typing-inspection==0.4.0
+typing_extensions==4.12.2
+tzdata==2025.2
+urllib3==2.4.0
+uvicorn==0.34.0
+watchdog==6.0.0
+watchfiles==1.0.5
+wcwidth==0.2.13
+webencodings==0.5.1
+websocket-client==1.8.0
+websockets==15.0.1
+widgetsnbextension==4.0.14
+wrapt==1.17.2
+yarg==0.1.9
+yarl==1.20.0
+zipp==3.23.0
+zstandard==0.23.0

sentiment_analysis.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

tagged_description.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

text_classification.ipynb ADDED Viewed

	@@ -0,0 +1,460 @@

+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "063cfaa8",
+      "metadata": {
+        "id": "063cfaa8"
+      },
+      "outputs": [],
+      "source": [
+        "import pandas as pd"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "1c81ca58",
+      "metadata": {
+        "id": "1c81ca58"
+      },
+      "outputs": [],
+      "source": [
+        "books = pd.read_csv('books_cleaned.csv')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "8244b265",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 423
+        },
+        "id": "8244b265",
+        "outputId": "0af624a8-1577-4c66-e252-a29479ad8446"
+      },
+      "outputs": [],
+      "source": [
+        "books['categories'].value_counts().reset_index()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "aefb7553",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 425
+        },
+        "id": "aefb7553",
+        "outputId": "5aad706e-7f5c-4704-dc31-3fbb00a8da9c"
+      },
+      "outputs": [],
+      "source": [
+        "books['categories'].value_counts().reset_index().query('count > 50')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "f8635069",
+      "metadata": {
+        "id": "f8635069"
+      },
+      "outputs": [],
+      "source": [
+        "category_mapping = {\n",
+        "    'Fiction' : \"Fiction\",\n",
+        "    'Juvenile Fiction': \"Children's Fiction\",\n",
+        "    'Biography & Autobiography': \"Nonfiction\",\n",
+        "    'History': \"Nonfiction\",\n",
+        "    'Literary Criticism': \"Nonfiction\",\n",
+        "    'Philosophy': \"Nonfiction\",\n",
+        "    'Religion': \"Nonfiction\",\n",
+        "    'Comics & Graphic Novels': \"Fiction\",\n",
+        "    'Drama': \"Fiction\",\n",
+        "    'Juvenile Nonfiction': \"Children's Nonfiction\",\n",
+        "    'Science': \"Nonfiction\",\n",
+        "    'Poetry': \"Fiction\"\n",
+        "    }\n",
+        "\n",
+        "books['simple_categories'] = books['categories'].map(category_mapping)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "7f1a4097",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "7f1a4097",
+        "outputId": "3a4a95b5-c920-4d85-a62d-8ca302670df1"
+      },
+      "outputs": [],
+      "source": [
+        "books[~(books['simple_categories'].isna())].shape"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "09433430",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "09433430",
+        "outputId": "b8840759-5f44-4dc0-81a1-63c70b489653"
+      },
+      "outputs": [],
+      "source": [
+        "from transformers import pipeline\n",
+        "fiction_categories = [\n",
+        "    \"Fiction\",\n",
+        "    \"Nonfiction\"]\n",
+        "pipe = pipeline(\"zero-shot-classification\",\n",
+        "                model=\"facebook/bart-large-mnli\",\n",
+        "                device=\"cuda\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "a44f3e64",
+      "metadata": {
+        "id": "a44f3e64"
+      },
+      "outputs": [],
+      "source": [
+        "sequence = books.loc[books['simple_categories']==\"Fiction\", 'description'].reset_index(drop=True)[0]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "3e3ff995",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "3e3ff995",
+        "outputId": "00ea0a49-3a9e-4d24-a621-92fb84de7d92"
+      },
+      "outputs": [],
+      "source": [
+        "pipe(sequence,fiction_categories)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "16b259eb",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 36
+        },
+        "id": "16b259eb",
+        "outputId": "3d8a6725-f246-49c9-cfce-ebd4c6e79151"
+      },
+      "outputs": [],
+      "source": [
+        "import numpy as np\n",
+        "max_index = np.argmax(pipe(sequence,fiction_categories)['scores'])\n",
+        "max_label = pipe(sequence,fiction_categories)['labels'][max_index]\n",
+        "\n",
+        "max_label"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "bd1a160f",
+      "metadata": {
+        "id": "bd1a160f"
+      },
+      "outputs": [],
+      "source": [
+        "def generate_predictions(sequence, categories):\n",
+        "    results = pipe(sequence, categories)\n",
+        "    max_index = np.argmax(results['scores'])\n",
+        "    max_label = results['labels'][max_index]\n",
+        "    return max_label"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "4945125a",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "4945125a",
+        "outputId": "bbd1a877-0c02-4860-d289-201ac5ff6f41"
+      },
+      "outputs": [],
+      "source": [
+        "from tqdm import tqdm\n",
+        "\n",
+        "actual_cats = []\n",
+        "predicted_cats = []\n",
+        "\n",
+        "for i in tqdm(range(0, 300)):\n",
+        "    sequence = books.loc[books['simple_categories']==\"Fiction\", 'description'].reset_index(drop=True)[i]\n",
+        "    predicted_cats.append(generate_predictions(sequence, fiction_categories))\n",
+        "    actual_cats.append(\"Fiction\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "efd30e84",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "efd30e84",
+        "outputId": "462d4f63-d2cd-4dc8-de04-eb9da6c55b20"
+      },
+      "outputs": [],
+      "source": [
+        "for i in tqdm(range(0, 300)):\n",
+        "    sequence = books.loc[books['simple_categories']==\"Nonfiction\", 'description'].reset_index(drop=True)[i]\n",
+        "    predicted_cats.append(generate_predictions(sequence, fiction_categories))\n",
+        "    actual_cats.append(\"Nonfiction\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "34322614",
+      "metadata": {
+        "id": "34322614"
+      },
+      "outputs": [],
+      "source": [
+        "predictions_df = pd.DataFrame({\n",
+        "    'actual': actual_cats,\n",
+        "    'predicted': predicted_cats\n",
+        "})"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "fc41ebe7",
+      "metadata": {
+        "id": "fc41ebe7"
+      },
+      "outputs": [],
+      "source": [
+        "predictions_df['correct_prediction'] = (\n",
+        "    np.where(predictions_df['actual'] == predictions_df['predicted'], 1, 0)\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "325834c0",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "325834c0",
+        "outputId": "479b349e-c6e4-49cf-eda1-107fe595b57c"
+      },
+      "outputs": [],
+      "source": [
+        "predictions_df['correct_prediction'].sum() / predictions_df.shape[0]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "4f3834ac",
+      "metadata": {
+        "id": "4f3834ac"
+      },
+      "outputs": [],
+      "source": [
+        "isbns = []\n",
+        "predicted_cats = []\n",
+        "missing_cats = books.loc[books['simple_categories'].isna(), ['isbn13', \"description\"]].reset_index(drop=True)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "38a9529a",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "38a9529a",
+        "outputId": "190e56ad-fa21-4b98-fcb2-7896197bc349"
+      },
+      "outputs": [],
+      "source": [
+        "for i in tqdm(range(0, missing_cats.shape[0])):\n",
+        "    sequence = missing_cats['description'][i]\n",
+        "    pred = generate_predictions(sequence, fiction_categories)\n",
+        "    predicted_cats.append(pred)\n",
+        "    isbns.append(missing_cats['isbn13'][i])\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "d76c7c58",
+      "metadata": {
+        "id": "d76c7c58"
+      },
+      "outputs": [],
+      "source": [
+        "missing_predictions_df = pd.DataFrame({\n",
+        "    'isbn13': isbns,\n",
+        "    'predicted': predicted_cats\n",
+        "})"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "6e83ce7e",
+      "metadata": {
+        "id": "6e83ce7e"
+      },
+      "outputs": [],
+      "source": [
+        "missing_predictions_df"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "17fbb19c",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "17fbb19c",
+        "outputId": "dfc3c2ef-0e99-4d76-f3eb-aca138c6782f"
+      },
+      "outputs": [],
+      "source": [
+        "books.columns"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "f0060e11",
+      "metadata": {
+        "id": "f0060e11"
+      },
+      "outputs": [],
+      "source": [
+        "asdf"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "c5c6ad46",
+      "metadata": {
+        "id": "c5c6ad46"
+      },
+      "outputs": [],
+      "source": [
+        "books = pd.merge(books, missing_predictions_df, on='isbn13', how='left')\n",
+        "books['simple_categories'] = np.where(books['simple_categories'].isna(),books['predicted'], books['simple_categories'])\n",
+        "books = books.drop(columns=['predicted'])\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "83beb18f",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 271
+        },
+        "id": "83beb18f",
+        "outputId": "e7c6e296-748c-4f12-d818-e16b75083bff"
+      },
+      "outputs": [],
+      "source": [
+        "books.head(2)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "1e8e9374",
+      "metadata": {
+        "id": "1e8e9374"
+      },
+      "outputs": [],
+      "source": [
+        "books[books[\"categories\"].str.lower().isin([\n",
+        "    \"romance\",\n",
+        "    \"science fiction\",\n",
+        "    \"scifi\",\n",
+        "    \"fantasy\",\n",
+        "    \"horror\",\n",
+        "    \"mystery\",\n",
+        "    \"thriller\",\n",
+        "    \"comedy\",\n",
+        "    \"crime\",\n",
+        "    \"historical\"\n",
+        "])]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "DA0gYVkklR1e",
+      "metadata": {
+        "id": "DA0gYVkklR1e"
+      },
+      "outputs": [],
+      "source": [
+        "books.to_csv(\"books_with_categories.csv\", index=False)"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.0"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}

vector_search.ipynb ADDED Viewed

	@@ -0,0 +1,168 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8c67e153",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.document_loaders import TextLoader\n",
+    "from langchain_text_splitters import CharacterTextSplitter\n",
+    "from langchain_openai import OpenAIEmbeddings\n",
+    "from langchain_chroma import Chroma"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4e0a1fdc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dotenv import load_dotenv\n",
+    "load_dotenv()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55bc2ba8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "books = pd.read_csv(\"books_cleaned.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "28b3e45d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "books.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "331dede4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "books['tagged_description'].to_csv('tagged_description.txt', index=False, sep='\\n', header=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47b0b3c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "file_path = \"tagged_description.txt\"\n",
+    "\n",
+    "raw_documents = TextLoader(file_path, encoding=\"utf-8\").load()\n",
+    "text_splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator=\"\\n\")\n",
+    "documents = text_splitter.split_documents(raw_documents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2fc263ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "documents[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "427fd244",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db_books = Chroma.from_documents(\n",
+    "    documents,\n",
+    "    OpenAIEmbeddings(),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f4476a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"A book to teach children about nature\"\n",
+    "docs = db_books.similarity_search(query, k=3)\n",
+    "docs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "faaf7618",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "books[books['isbn13'] == int(docs[0].page_content.split(\" \")[0].strip())]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3a893785",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def retrieve_semantic_recommendations(query, top_k=3) -> pd.DataFrame:\n",
+    "    recs = db_books.similarity_search(query, k=top_k)\n",
+    "\n",
+    "    books_list = []\n",
+    "    for i in range(0, len(recs)):\n",
+    "        books_list += [int(recs[i].page_content.strip('\"').split()[0].strip())]\n",
+    "    return books[books['isbn13'].isin(books_list)].head(top_k)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a557732",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "retrieve_semantic_recommendations(query)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "myenv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}