# %% Imports -- one cell, stdlib first, then third-party
import os

import pandas as pd
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

# %% One-time environment setup
# HuggingFaceEmbeddings (used below) needs the sentence_transformers package.
# Install it once per environment rather than on every Restart & Run All;
# `%pip` (unlike `!pip`) targets the environment of the running kernel.
# %pip install sentence_transformers

# %% Configuration -- everything a reader might want to tune
BOOKS_CSV = "books_cleaned.csv"
TAGGED_DESCRIPTION_TXT = "tagged_description.txt"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # Free, fast, and good quality
EMBEDDING_DEVICE = "cpu"  # Use "cuda" if you have a GPU

# Load environment variables
load_dotenv()

# %% Load the cleaned book catalogue
books = pd.read_csv(BOOKS_CSV)

# %%
books.head()

# %% Export the tagged descriptions (no index, no header row)
# Kept as an on-disk artifact; the documents below are built straight from the
# DataFrame, so nothing in this notebook reads the file back.
books["tagged_description"].to_csv(
    TAGGED_DESCRIPTION_TXT,
    index=False,
    header=False,
)

# %% Build exactly one document per book
# Downstream cells read the ISBN from the first token of each retrieved
# document, so a document must never span several books. Each description is
# therefore passed to the splitter as its own text, and chunk_size is set to
# the longest description so that no description is ever cut into pieces.
# This also avoids needing the `Document` class, which this notebook never
# imports.
book_texts = [
    text
    for text in books["tagged_description"].dropna().astype(str)
    if text.strip()
]
assert book_texts, "books['tagged_description'] contains no usable text"

text_splitter = CharacterTextSplitter(
    chunk_size=max(len(text) for text in book_texts),
    chunk_overlap=0,
    separator="\n",
)
documents = text_splitter.create_documents(book_texts)

# Safety net for the one-document-per-book invariant the ISBN lookup relies on.
assert len(documents) == len(book_texts), "expected exactly one document per book"

print(f"Successfully created {len(documents)} chunks")
print(f"First chunk preview:\n{documents[0].page_content[:200]}...")

# %%
documents[0]

# %% Embed every document and index it in Chroma
# The recorded run of the original cell (which also ran the pip install) took
# about ten minutes according to its ExecuteTime metadata, so expect this to
# be the slow step.
# NOTE(review): no persist_directory is passed, so the index is presumably
# rebuilt in memory on every run -- confirm, and consider persisting it.
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={"device": EMBEDDING_DEVICE},
)

db_books = Chroma.from_documents(
    documents,
    embedding=embeddings,
)

# %% Smoke-test query against the index
query = "A book to teach children about nature"
docs = db_books.similarity_search(query, k=10)
docs
# %% Helper: read the ISBN that prefixes an indexed document
def _parse_leading_isbn(page_content: str) -> int:
    """Return the ISBN found in the first whitespace-separated token.

    Assumes each indexed document starts with the book's numeric ISBN followed
    by whitespace (this is how the cells below have always parsed it). Quotes
    around the text are ignored, and a float-formatted value such as
    ``"9780002005883.0"`` is accepted.

    Args:
        page_content: Text of one retrieved document.

    Returns:
        The leading ISBN as an ``int``.

    Raises:
        ValueError: If the text is empty or its first token is not numeric.
    """
    # Drop surrounding whitespace and quote characters before tokenising.
    tokens = page_content.strip("\"' \t\r\n").split()
    if not tokens:
        raise ValueError("document is empty; no leading ISBN to parse")

    token = tokens[0].strip("\"'")
    try:
        # Parse plain digits exactly; fall back to float() only for "123.0".
        return int(token) if token.isdecimal() else int(float(token))
    except (ValueError, OverflowError) as exc:
        raise ValueError(f"document does not start with an ISBN: {token!r}") from exc


# %% Look up the catalogue row for the best match of the smoke-test query
isbn_int = _parse_leading_isbn(docs[0].page_content)

# Now search
result = books[books["isbn13"] == isbn_int]

# %%
result

# %% Recommendation entry point
def retrieve_semantic_recommendations(
    query: str,
    top_k: int = 10,
    initial_top_k: int = 50,
) -> pd.DataFrame:
    """Return the catalogue rows of the books most similar to ``query``.

    Runs a similarity search on ``db_books``, reads the ISBN that prefixes each
    hit, and selects the matching rows of ``books`` by ``isbn13``. Rows come
    back in the order of the similarity search results (first hit first), not
    in their order inside ``books``, so ``top_k`` keeps the best-ranked books.

    Args:
        query: Free-text description of the desired book.
        top_k: Maximum number of rows to return.
        initial_top_k: Number of documents requested from the vector store
            before matching against ``books``.

    Returns:
        A DataFrame with at most ``top_k`` rows of ``books`` (original index
        labels preserved); empty when nothing matches.
    """
    recs = db_books.similarity_search(query, k=initial_top_k)

    # Map each ISBN to the position of its first (best) hit.
    rank_of = {}
    for rec in recs:
        try:
            isbn = _parse_leading_isbn(rec.page_content)
        except ValueError:
            # A hit without a leading ISBN cannot be matched to a book; skip it
            # instead of failing the whole request.
            continue
        rank_of.setdefault(isbn, len(rank_of))

    candidates = books[books["isbn13"].isin(list(rank_of))]

    # Reorder by similarity rank; a stable sort keeps catalogue order for rows
    # that share an ISBN.
    ordered = candidates.sort_values(
        "isbn13",
        key=lambda column: column.map(rank_of),
        kind="stable",
    )
    return ordered.head(top_k)


# %%
retrieve_semantic_recommendations("A book to teach children about nature")
"python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }