import logging
from typing import Optional

import numpy as np
import pandas as pd
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

logger = logging.getLogger(__name__)

# Minimum number of nearest documents pulled from the vector store per query.
_CANDIDATE_POOL_SIZE = 50

# Catalogue that search hits are resolved against. Loaded at import time; the
# script entry point below rebinds it to the cleaned dataset.
books = pd.read_csv('data/book_with_categories.csv')


def _extract_isbn(page_content: str) -> Optional[int]:
    """Return the leading integer token of a tagged description, or None.

    The text is expected to look like ``<isbn13> <description>``, optionally
    wrapped in double quotes (left over from CSV quoting).
    """
    tokens = page_content.strip('"').split()
    if not tokens:
        return None
    try:
        return int(tokens[0])
    except ValueError:
        return None


def retrieve_semantic_recommendations(query: str, top_k: int = 10, db_books=None) -> pd.DataFrame:
    """
    Retrieve top-k book recommendations based on semantic similarity to the query.

    The vector store is asked for the nearest documents, the ISBN that starts
    each document's text is parsed out, and the matching rows of the
    module-level ``books`` DataFrame are returned ordered from most to least
    similar.

    Args:
        query (str): The search query.
        top_k (int): Number of recommendations to return.
        db_books: Vector store holding the book-description embeddings. Any
            object exposing ``similarity_search(query, k=...)`` whose results
            have a ``page_content`` string works (the script below builds a
            Chroma store).

    Returns:
        pd.DataFrame: Up to ``top_k`` rows of ``books`` (original index kept),
        ranked by similarity to ``query``.

    Raises:
        ValueError: If ``db_books`` is None.
    """
    if db_books is None:
        raise ValueError("db_books vector store is required")

    # Fetch at least ``top_k`` candidates so a large ``top_k`` is not silently
    # capped by the default pool size.
    recs = db_books.similarity_search(query, k=max(_CANDIDATE_POOL_SIZE, top_k))

    # Map each ISBN to the rank of its first (best) hit. Hits that do not start
    # with an ISBN (e.g. continuation lines of a multi-line description) are
    # skipped instead of aborting the whole query.
    rank_by_isbn = {}
    for rec in recs:
        isbn = _extract_isbn(rec.page_content)
        if isbn is None:
            logger.warning(
                "Skipping search hit without a leading ISBN: %.60r",
                rec.page_content,
            )
            continue
        rank_by_isbn.setdefault(isbn, len(rank_by_isbn))

    matches = books[books["isbn13"].isin(list(rank_by_isbn))]

    # ``isin`` keeps the catalogue's row order; re-sort by search rank so that
    # ``head(top_k)`` really returns the most similar books.
    ranked = matches.sort_values(
        "isbn13",
        key=lambda col: col.map(rank_by_isbn),
        kind="stable",
    )
    return ranked.head(top_k)


if __name__ == "__main__":
    # Rebinds the module-level ``books`` so the lookup in
    # retrieve_semantic_recommendations uses the cleaned dataset.
    books = pd.read_csv('data/book_cleaned.csv')

    # Dump one tagged description per line for TextLoader to read back.
    # NOTE(review): some newer Python/pandas versions may reject "\n" as a CSV
    # delimiter - confirm this call on the target runtime.
    books['tagged_description'].to_csv(
        'data/tagged_description.txt',
        sep='\n',
        index=False,
        header=False,
    )

    raw_docs = TextLoader('data/tagged_description.txt', encoding='utf-8').load()

    # chunk_size=1 with a newline separator is intended to make every line of
    # the file its own document.
    text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
    documents = text_splitter.split_documents(raw_docs)

    embedding = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
    db_books = Chroma.from_documents(
        documents,
        embedding=embedding)

    query = "A book to teach children about nature"
    recommendations = retrieve_semantic_recommendations(query, top_k=10, db_books=db_books)
    print(recommendations)