"""Gradio dashboard that recommends books from a free-text description.

At import time the module loads the book table from
``data/books_with_emotions.csv``, indexes the lines of
``data/tagged_description.txt`` in a Chroma vector store using OpenAI
embeddings, and builds the Gradio UI.  Running the file as a script launches
the dashboard.
"""

import gradio as gr
import numpy as np
import pandas as pd
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

# =======================
# Load and preprocess books
# =======================
books = pd.read_csv("data/books_with_emotions.csv")

# Use a larger cover variant when a thumbnail URL exists; otherwise fall back
# to a local placeholder image.
books["large_thumbnail"] = np.where(
    books["thumbnail"].notna(),
    books["thumbnail"] + "&fife=w800",
    "cover-not-found.jpg",
)

# =======================
# Prepare Chroma vector DB
# =======================
raw_documents = TextLoader("data/tagged_description.txt").load()
# NOTE(review): the tiny chunk_size combined with the newline separator is
# presumably meant to yield one document per line of the file -- confirm
# against the splitter's documentation.
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)
db_books = Chroma.from_documents(documents, OpenAIEmbeddings())

# Maps the tone label shown in the UI to the column of ``books`` that is used
# to sort recommendations for that tone.
_TONE_SORT_COLUMNS = {
    "Happy": "joy",
    "Surprising": "surprise",
    "Angry": "anger",
    "Suspenseful": "fear",
    "Sad": "sadness",
}

# Number of description words kept in a gallery caption.
_CAPTION_WORD_LIMIT = 30


# =======================
# Semantic retrieval logic
# =======================
def retrieve_semantic_recommendations(
    query: str,
    category: str = "All",
    tone: str = "All",
    initial_top_k: int = 50,
    final_top_k: int = 16,
) -> pd.DataFrame:
    """Retrieve recommendations based on semantics, category and emotional tone.

    Args:
        query: Free-text description passed to the vector store search.
        category: Value of ``simple_categories`` to keep, or ``"All"`` to
            skip category filtering.
        tone: One of the keys of ``_TONE_SORT_COLUMNS``; any other value
            (e.g. ``"All"``) leaves the similarity ordering untouched.
        initial_top_k: Number of hits requested from the vector store.
        final_top_k: Maximum number of rows returned.

    Returns:
        Rows of ``books`` matching the retrieved hits, at most ``final_top_k``
        of them, ordered by the selected tone column (descending) when a tone
        is given and by retrieval rank otherwise.

    Raises:
        ValueError: If a retrieved document does not start with an integer.
        IndexError: If a retrieved document is empty after quote stripping.
    """
    recs = db_books.similarity_search(query, k=initial_top_k)

    # The first whitespace-delimited token of each document (after stripping
    # surrounding double quotes) is matched against ``isbn13``.
    isbns = [int(rec.page_content.strip('"').split()[0]) for rec in recs]

    # Remember the position of each ISBN in the search result so the rows can
    # be put back in that order; ``isin`` alone yields them in CSV order,
    # which would make the final ``head`` truncate arbitrarily.
    rank = {isbn: pos for pos, isbn in enumerate(dict.fromkeys(isbns))}

    # Filter books by ISBN, then restore the retrieval order.
    book_recs = books[books["isbn13"].isin(list(rank))]
    order = book_recs["isbn13"].map(rank).to_numpy().argsort(kind="stable")
    book_recs = book_recs.iloc[order].head(initial_top_k)

    # Filter by category.
    if category != "All":
        book_recs = book_recs[book_recs["simple_categories"] == category]

    # Sort by emotional tone.
    if tone in _TONE_SORT_COLUMNS:
        book_recs = book_recs.sort_values(by=_TONE_SORT_COLUMNS[tone], ascending=False)

    return book_recs.head(final_top_k)


# =======================
# Recommendation formatting
# =======================
def _format_authors(raw_authors) -> str:
    """Turn a ``;``-separated author field into a readable author list."""
    # Missing values come out of pandas as NaN (a float), which has no
    # ``split``; show a placeholder instead of crashing the request.
    if not isinstance(raw_authors, str) or not raw_authors:
        return "Unknown author"

    authors = raw_authors.split(";")
    if len(authors) == 1:
        return authors[0]
    if len(authors) == 2:
        return f"{authors[0]} and {authors[1]}"
    return f"{', '.join(authors[:-1])}, and {authors[-1]}"


def _truncate_description(raw_description) -> str:
    """Return the first ``_CAPTION_WORD_LIMIT`` words followed by ``...``."""
    # A missing description (NaN) is treated as empty text.
    words = raw_description.split() if isinstance(raw_description, str) else []
    return " ".join(words[:_CAPTION_WORD_LIMIT]) + "..."


def recommend_books(query: str, category: str, tone: str) -> list[tuple[str, str]]:
    """Build the gallery entries for a user query.

    Args:
        query: Free-text description entered by the user.
        category: Selected category, or ``"All"``.
        tone: Selected emotional tone, or ``"All"``.

    Returns:
        A list of ``(image, caption)`` tuples, one per recommended book. The
        list is empty when the query is blank.
    """
    # Nothing to search for; skip the vector store round trip.
    if not query or not query.strip():
        return []

    recommendations = retrieve_semantic_recommendations(query, category, tone)

    results = []
    for _, row in recommendations.iterrows():
        truncated_description = _truncate_description(row["description"])
        authors_str = _format_authors(row["authors"])
        caption = f"{row['title']} by {authors_str}: {truncated_description}"
        results.append((row["large_thumbnail"], caption))
    return results


# =======================
# Build Gradio dashboard
# =======================
# Drop missing categories first: sorting a mix of NaN and strings raises.
categories = ["All"] + sorted(books["simple_categories"].dropna().unique())
tones = ["All", "Happy", "Surprising", "Angry", "Suspenseful", "Sad"]

with gr.Blocks() as dashboard:
    gr.Markdown("# 📚 Semantic Book Recommender")

    with gr.Row():
        user_query = gr.Textbox(
            label="Please enter a description of a book:",
            placeholder="e.g., A story about forgiveness",
        )
        category_dropdown = gr.Dropdown(
            choices=categories,
            label="Select a category:",
            value="All",
        )
        tone_dropdown = gr.Dropdown(
            choices=tones,
            label="Select an emotional tone:",
            value="All",
        )
        submit_button = gr.Button("Find recommendations")

    gr.Markdown("## 🧠 Recommendations")
    output = gr.Gallery(label="Recommended books", columns=8, rows=2)

    submit_button.click(
        fn=recommend_books,
        inputs=[user_query, category_dropdown, tone_dropdown],
        outputs=output,
    )


if __name__ == "__main__":
    dashboard.launch()