In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma



In [None]:
import os

from dotenv import load_dotenv

# Load environment variables (python-dotenv reads them from a .env file)
load_dotenv()


In [None]:
import pandas as pd

# Load books_cleaned.csv (path relative to the working directory) into a DataFrame.
books = pd.read_csv(filepath_or_buffer="books_cleaned.csv")

In [None]:
# Preview the first rows only instead of rendering the whole DataFrame.
books.head()

In [None]:
# Export only the tagged_description column as plain text: no index column
# and no header row.
books["tagged_description"].to_csv(
    path_or_buf="tagged_description.txt", header=False, index=False
)


In [None]:

# Read the exported descriptions back as one UTF-8 string.
with open("tagged_description.txt", "r", encoding="utf-8") as file:
    content = file.read()

# Split on newlines into chunks of at most ~1500 characters, with up to 150
# characters carried over between neighbouring chunks.
# NOTE(review): with this chunk size a single chunk can hold several lines,
# while the later cells read only the first token of each chunk as the ISBN —
# confirm that is intended.
text_splitter = CharacterTextSplitter(
    chunk_size=1500,  # Increased to avoid warnings
    chunk_overlap=150,
    separator="\n",
)

# create_documents() wraps each chunk in a Document itself, so no separate
# Document import is needed (the name was never imported in this notebook and
# the previous Document(page_content=...) call raised NameError).
documents = text_splitter.create_documents([content])

print(f"Successfully created {len(documents)} chunks")
print(f"First chunk preview:\n{documents[0].page_content[:200]}...")

In [None]:
# Inspect the first chunk produced by the splitter.
documents[0]

In [None]:
!pip install sentence_transformers
embeddings = HuggingFaceEmbeddings(
 model_name="all-MiniLM-L6-v2", # Free, fast, and good quality
 model_kwargs={'device': 'cpu'} # Use 'cuda' if you have a GPU
)

db_books = Chroma.from_documents(
 documents,
 embedding=embeddings
)

In [None]:
# Run a sample query against the vector store and keep the 10 returned chunks.
query = "A book to teach children about nature"
docs = db_books.similarity_search(query=query, k=10)
docs

In [None]:
# Take the first whitespace-delimited token of the first search result as the ISBN.
isbn_str = docs[0].page_content.split(maxsplit=1)[0].strip()
# Drop any single or double quote characters left in the token.
isbn_clean = isbn_str.translate(str.maketrans("", "", "\"'"))
# Parse via float so a trailing ".0" is accepted, then narrow to int.
isbn_int = int(float(isbn_clean))

# Select the row(s) of `books` whose isbn13 equals the parsed value.
result = books.loc[books["isbn13"] == isbn_int]

In [None]:
# Rows of `books` whose isbn13 equals the ISBN parsed from the first search result.
result

In [None]:
def retrieve_semantic_recommendations(
    query: str,
    top_k: int = 10,
    initial_top_k: int = 50,
) -> pd.DataFrame:
    """Return up to ``top_k`` rows of ``books`` matching ``query``.

    The module-level ``db_books`` vector store is asked for ``initial_top_k``
    chunks. The first whitespace-delimited token of each chunk is parsed as an
    ISBN-13, and the rows of the module-level ``books`` frame carrying those
    ISBNs are returned.

    Rows are ordered by the position of their ISBN in the search results
    (presumably most similar first — confirm against the vector store),
    not by their position in ``books``.

    Args:
        query: Free-text description of the desired book.
        top_k: Maximum number of rows to return.
        initial_top_k: Number of chunks requested from the vector store.

    Returns:
        A slice of ``books`` with at most ``top_k`` rows; empty when nothing
        matches.

    Raises:
        ValueError: If the first token of a chunk is not numeric.
    """
    recs = db_books.similarity_search(query, k=initial_top_k)

    # Map each ISBN to the rank of its first appearance in the search results.
    rank_by_isbn = {}
    for rec in recs:
        tokens = rec.page_content.split()
        if not tokens:
            continue  # chunk without any text: nothing to parse
        # to_csv may wrap a line in quotes; float() first tolerates a ".0" suffix.
        isbn = int(float(tokens[0].strip("\"'")))
        rank_by_isbn.setdefault(isbn, len(rank_by_isbn))

    if not rank_by_isbn:
        return books.iloc[0:0]

    matches = books[books["isbn13"].isin(list(rank_by_isbn))]
    # Reorder by search rank; the stable sort keeps the original order of rows
    # that share an ISBN.
    ranks = matches["isbn13"].map(rank_by_isbn)
    order = ranks.to_numpy().argsort(kind="stable")
    return matches.iloc[order].head(top_k)

In [None]:
# Try the recommender on a sample query; the returned DataFrame is displayed.
retrieve_semantic_recommendations(query="A book to teach children about nature")