{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T07:35:25.488414Z",
     "start_time": "2025-09-15T07:35:25.460656Z"
    }
   },
   "outputs": [],
   "source": [
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "\n",
    "from langchain_text_splitters import CharacterTextSplitter\n",
    "from langchain_openai import OpenAIEmbeddings\n",
    "from langchain_chroma import Chroma\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e2d7510161fceb6",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T07:35:27.755330Z",
     "start_time": "2025-09-15T07:35:27.736857Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# Load environment variables\n",
    "load_dotenv()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1c5ca1012315fd2",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T07:35:30.169857Z",
     "start_time": "2025-09-15T07:35:30.074451Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Read the books table from the CSV file in the working directory\n",
    "books_csv = \"books_cleaned.csv\"\n",
    "books = pd.read_csv(books_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "694a28505e311eea",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T07:35:32.747269Z",
     "start_time": "2025-09-15T07:35:32.725973Z"
    }
   },
   "outputs": [],
   "source": [
    "# Preview the first rows instead of rendering the whole table\n",
    "books.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb17356cf0ecbbef",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T07:35:35.292093Z",
     "start_time": "2025-09-15T07:35:35.243618Z"
    }
   },
   "outputs": [],
   "source": [
    "# Write only the tagged_description column to a text file,\n",
    "# leaving out the index and the header row.\n",
    "tagged_descriptions = books[\"tagged_description\"]\n",
    "tagged_descriptions.to_csv(\n",
    "    \"tagged_description.txt\",\n",
    "    header=False,\n",
    "    index=False,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2db289c35716805c",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T07:35:42.877672Z",
     "start_time": "2025-09-15T07:35:42.683378Z"
    }
   },
   "outputs": [],
   "source": [
    "# Read the exported descriptions back in, one non-blank line per entry.\n",
    "# The lookup cells further down treat the first whitespace-delimited token of\n",
    "# every document as that book's ISBN, so each description has to end up in a\n",
    "# document of its own instead of being merged into larger chunks.\n",
    "# NOTE(review): assumes no description contains an embedded newline - confirm.\n",
    "with open(\"tagged_description.txt\", 'r', encoding='utf-8') as file:\n",
    "    descriptions = [line.strip() for line in file if line.strip()]\n",
    "\n",
    "# Each text handed to the splitter is a single line, so it contains no newline\n",
    "# separator and is kept whole; chunk_size only decides when the splitter warns\n",
    "# about an unusually long description.\n",
    "text_splitter = CharacterTextSplitter(\n",
    "    chunk_size=1500,\n",
    "    chunk_overlap=0,\n",
    "    separator=\"\\n\",\n",
    ")\n",
    "\n",
    "# create_documents() builds the document objects itself, so this cell no longer\n",
    "# depends on the `Document` class, which the notebook never imports.\n",
    "documents = text_splitter.create_documents(descriptions)\n",
    "\n",
    "print(f\"Successfully created {len(documents)} chunks\")\n",
    "if documents:\n",
    "    print(f\"First chunk preview:\\n{documents[0].page_content[:200]}...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12d6dc1c1f518682",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T07:35:45.472985Z",
     "start_time": "2025-09-15T07:35:45.467714Z"
    }
   },
   "outputs": [],
   "source": [
    "# Inspect the first document produced by the splitter\n",
    "documents[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d73b0e5261855919",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T07:47:27.888830Z",
     "start_time": "2025-09-15T07:36:56.075724Z"
    }
   },
   "outputs": [],
   "source": [
    "# %pip (unlike !pip) installs into the environment of the running kernel\n",
    "%pip install sentence_transformers\n",
    "\n",
    "embeddings = HuggingFaceEmbeddings(\n",
    "    model_name=\"all-MiniLM-L6-v2\",  # Free, fast, and good quality\n",
    "    model_kwargs={'device': 'cpu'}   # Use 'cuda' if you have a GPU\n",
    ")\n",
    "\n",
    "# Embed the documents and build the Chroma vector store from them\n",
    "db_books = Chroma.from_documents(\n",
    "    documents,\n",
    "    embedding=embeddings\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9473a4b393977d6f",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c28a61479deb520",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T07:47:49.568125Z",
     "start_time": "2025-09-15T07:47:49.337737Z"
    }
   },
   "outputs": [],
   "source": [
    "# Fetch the ten documents most similar to a sample query\n",
    "query = \"A book to teach children about nature\"\n",
    "docs = db_books.similarity_search(query, k=10)\n",
    "docs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57cebcff1d436b6a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T07:54:09.612026Z",
     "start_time": "2025-09-15T07:54:09.538027Z"
    }
   },
   "outputs": [],
   "source": [
    "# Take the first whitespace-delimited token of the best match as its ISBN\n",
    "top_hit_text = docs[0].page_content\n",
    "isbn_str = top_hit_text.split()[0].strip()\n",
    "\n",
    "# Drop any double or single quote characters from the token\n",
    "isbn_clean = isbn_str\n",
    "for quote_char in ('\"', \"'\"):\n",
    "    isbn_clean = isbn_clean.replace(quote_char, \"\")\n",
    "\n",
    "# Parse as float before truncating to int so a trailing .0 is accepted\n",
    "isbn_int = int(float(isbn_clean))\n",
    "\n",
    "# Select the rows of books whose isbn13 equals that number\n",
    "result = books.loc[books[\"isbn13\"] == isbn_int]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4155cc001df44e93",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T07:54:49.157935Z",
     "start_time": "2025-09-15T07:54:49.088922Z"
    }
   },
   "outputs": [],
   "source": [
    "# Show the rows of `books` selected by the ISBN lookup\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c644a4b395fda08",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T08:23:46.545582Z",
     "start_time": "2025-09-15T08:23:46.531998Z"
    }
   },
   "outputs": [],
   "source": [
    "def retrieve_semantic_recommendations(\n",
    "    query: str,\n",
    "    top_k: int = 10,\n",
    "    initial_top_k: int = 50,\n",
    ") -> pd.DataFrame:\n",
    "    \"\"\"Return up to `top_k` rows of `books` for the documents most similar to `query`.\n",
    "\n",
    "    Args:\n",
    "        query: Free-text description of the kind of book wanted.\n",
    "        top_k: Maximum number of rows to return.\n",
    "        initial_top_k: Number of documents requested from the vector store\n",
    "            before they are matched against `books`. At least `top_k`\n",
    "            documents are always requested.\n",
    "\n",
    "    Returns:\n",
    "        The matching rows of `books`, ordered from the most similar document\n",
    "        to the least similar one. The frame is empty when nothing matches.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the leading token of a document is not a number.\n",
    "    \"\"\"\n",
    "    recs = db_books.similarity_search(query, k=max(initial_top_k, top_k))\n",
    "\n",
    "    # Map every ISBN to the position of its first (best-ranked) occurrence.\n",
    "    rank_by_isbn = {}\n",
    "    for rec in recs:\n",
    "        tokens = rec.page_content.strip('\"').split()\n",
    "        if not tokens:\n",
    "            continue  # blank document: there is no ISBN to look up\n",
    "        # float() first, then int(), so a token ending in .0 still parses\n",
    "        rank_by_isbn.setdefault(int(float(tokens[0])), len(rank_by_isbn))\n",
    "\n",
    "    matches = books[books[\"isbn13\"].isin(list(rank_by_isbn))]\n",
    "\n",
    "    # isin() keeps the row order of `books`; re-sort by similarity rank so that\n",
    "    # head(top_k) keeps the best hits rather than the earliest rows.\n",
    "    ranked = matches.sort_values(\n",
    "        by=\"isbn13\",\n",
    "        key=lambda isbns: isbns.map(rank_by_isbn),\n",
    "        kind=\"stable\",\n",
    "    )\n",
    "    return ranked.head(top_k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9eada846c702825",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-15T08:23:47.659278Z",
     "start_time": "2025-09-15T08:23:47.501425Z"
    }
   },
   "outputs": [],
   "source": [
    "# Try the helper on a sample query\n",
    "retrieve_semantic_recommendations(\"A book to teach children about nature\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36d5bb5ac34f9b2d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}