File size: 2,145 Bytes
779489a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import numpy as np

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings

# Module-level catalog; `retrieve_semantic_recommendations` reads this global
# to map matched ISBNs back to full rows.
# NOTE(review): the __main__ block below reassigns `books` from a different
# file ('data/book_cleaned.csv') — confirm which CSV is canonical.
books = pd.read_csv('data/book_with_categories.csv')

def retrieve_semantic_recommendations(query: str, top_k: int = 10, db_books=None) -> pd.DataFrame:
    """
    Retrieve the top-k book recommendations ranked by semantic similarity.

    Args:
        query (str): The search query.
        top_k (int): Number of recommendations to return.
        db_books: Chroma vector store of book-description embeddings; each
            document's page_content starts with the book's isbn13
            (optionally wrapped in double quotes).

    Returns:
        pd.DataFrame: Rows from the module-level ``books`` DataFrame for the
        top-k matches, ordered best-match first.

    Raises:
        ValueError: If no vector store is supplied.
    """
    if db_books is None:
        raise ValueError("db_books vector store is required")

    # Over-fetch so enough candidates survive any ISBNs absent from `books`;
    # max() keeps requests with top_k > 50 from being silently truncated.
    recs = db_books.similarity_search(query, k=max(top_k, 50))

    # Each document begins with the (possibly quoted) isbn13 token.
    isbns = [int(rec.page_content.strip('"').split()[0]) for rec in recs]

    # Rank map: lower index == more similar; keep the best occurrence only.
    rank = {}
    for position, isbn in enumerate(isbns):
        rank.setdefault(isbn, position)

    # `.isin` filtering discards similarity order (it yields DataFrame order),
    # so restore the ranking explicitly before taking the head.
    matches = books[books["isbn13"].isin(rank)].copy()
    matches = matches.sort_values(by="isbn13", key=lambda s: s.map(rank))
    return matches.head(top_k)

if __name__ == "__main__":
    # Build the vector store from scratch, then run one sample query.
    books = pd.read_csv('data/book_cleaned.csv')

    # Dump one tagged description per line so each line becomes one document.
    books['tagged_description'].to_csv(
        'data/tagged_description.txt',
        sep='\n',
        index=False,
        header=False,
    )

    # Reload the dump and split it into per-line documents.
    raw_documents = TextLoader('data/tagged_description.txt', encoding='utf-8').load()
    splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
    line_docs = splitter.split_documents(raw_documents)

    # Embed locally with a sentence-transformer model and index in Chroma.
    embedder = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
    vector_store = Chroma.from_documents(line_docs, embedding=embedder)

    query = "A book to teach children about nature"
    results = retrieve_semantic_recommendations(query, top_k=10, db_books=vector_store)
    print(results)