# Book-Recommender-System / vector_search.py
# Auto-deployed from GitHub Actions (commit 3b8e0d6)
import pandas as pd
import numpy as np
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
# Module-level catalogue; the default lookup table filtered by
# retrieve_semantic_recommendations. Expected to contain an `isbn13` column.
books = pd.read_csv('data/book_with_categories.csv')
def retrieve_semantic_recommendations(query: str, top_k: int = 10, db_books=None,
                                      books_df=None, search_k: int = 50) -> pd.DataFrame:
    """
    Retrieve top-k book recommendations ranked by semantic similarity to the query.

    Args:
        query (str): The search query.
        top_k (int): Number of recommendations to return.
        db_books: Vector store (e.g. Chroma) of tagged-description embeddings;
            each document's text is expected to start with the book's isbn13.
        books_df (pd.DataFrame, optional): Catalogue to filter. Defaults to the
            module-level ``books`` DataFrame. Must have an ``isbn13`` column.
        search_k (int): Number of candidates to pull from the vector store
            before filtering down to ``top_k`` (previously hard-coded to 50).

    Returns:
        pd.DataFrame: Up to ``top_k`` rows of ``books_df``, ordered by
        similarity rank (best match first).

    Raises:
        ValueError: If ``db_books`` is not provided.
    """
    if db_books is None:
        raise ValueError("db_books vector store is required")
    if books_df is None:
        books_df = books  # fall back to the module-level catalogue

    recs = db_books.similarity_search(query, k=search_k)

    # The first whitespace-separated token of each document is the isbn13;
    # the leading quote comes from pandas' CSV quoting when the text dump
    # was written. Skip malformed lines instead of crashing the whole query.
    isbns = []
    for rec in recs:
        token = rec.page_content.strip('"').split()[0]
        try:
            isbns.append(int(token))
        except ValueError:
            continue

    matches = books_df[books_df["isbn13"].isin(isbns)].copy()
    # BUG FIX: a plain .isin() filter keeps the DataFrame's own row order,
    # so head(top_k) used to return catalogue-order rows rather than the
    # best matches. Re-rank by similarity order before truncating.
    rank = {isbn: i for i, isbn in enumerate(isbns)}
    matches["_sim_rank"] = matches["isbn13"].map(rank)
    return (matches.sort_values("_sim_rank")
                   .drop(columns="_sim_rank")
                   .head(top_k))
if __name__ == "__main__":
    # Demo pipeline: dump tagged descriptions to a text file, embed them into
    # an in-memory Chroma store, then run one sample query end to end.
    # NOTE(review): this rebinds the module-level `books` (loaded above from
    # book_with_categories.csv) to a different CSV — confirm that is intended.
    books = pd.read_csv('data/book_cleaned.csv')

    # One tagged description per line, no header/index, so the first token of
    # every line is the isbn13 the retriever parses back out.
    books['tagged_description'].to_csv('data/tagged_description.txt',
                                      sep='\n',
                                      index=False,
                                      header=False)

    # Reload the dump and split it into one Document per line.
    splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
    documents = splitter.split_documents(
        TextLoader('data/tagged_description.txt', encoding='utf-8').load())

    # Embed locally with a sentence-transformer and index in Chroma.
    db_books = Chroma.from_documents(
        documents,
        embedding=SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2"))

    query = "A book to teach children about nature"
    recommendations = retrieve_semantic_recommendations(query, top_k=10, db_books=db_books)
    print(recommendations)