import logging import os import sys from typing import Dict, List, Optional # Add the project root to the Python path sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))) import faiss import numpy as np import pandas as pd import src.book_recommender.core.config as config logger = logging.getLogger(__name__) class BookRecommender: """ A content-based book recommender system that uses a FAISS index for efficient similarity search. This class encapsulates the logic for building a searchable index of book embeddings and retrieving recommendations based on semantic similarity. """ def __init__(self, book_data: pd.DataFrame, embeddings: np.ndarray): """ Initializes the recommender, builds the FAISS index, and prepares data. Args: book_data (pd.DataFrame): DataFrame containing book metadata. Must include 'title_lower' column for indexing. embeddings (np.ndarray): A 2D NumPy array of book embeddings. """ if len(book_data) != len(embeddings): raise ValueError("Mismatch between number of books and number of embeddings.") self.book_data = book_data self.embeddings = embeddings.astype("float32") faiss.normalize_L2(self.embeddings) self.index = faiss.IndexFlatL2(config.EMBEDDING_DIMENSION) self.index.add(self.embeddings) self.title_to_index = pd.Series(self.book_data.index, index=self.book_data["title_lower"]).to_dict() logger.info(f"Recommender initialized with FAISS index containing {self.index.ntotal} vectors.") def get_recommendations_from_vector( self, vector: np.ndarray, top_k: int = config.DEFAULT_TOP_K, similarity_threshold: float = config.MIN_SIMILARITY_THRESHOLD, ignore_index: Optional[int] = None, ) -> List[Dict]: """ Finds and returns top_k book recommendations for a given embedding vector. Args: vector (np.ndarray): The embedding vector to find recommendations for. top_k (int): The number of recommendations to return. similarity_threshold (float): The minimum similarity score. ignore_index (int, optional): An index to ignore in the results (e.g., the query book itself). Returns: A list of dictionaries, each containing details of a recommended book. """ if vector.ndim == 1: vector = vector.reshape(1, -1) faiss.normalize_L2(vector) distances, indices = self.index.search(vector, top_k + (1 if ignore_index is not None else 0)) # Filter out the ignore_index if present valid_mask = indices[0] != ignore_index valid_indices = indices[0][valid_mask] valid_distances = distances[0][valid_mask] # Calculate similarity scores # cosine_sim = 1 - (L2_dist^2) / 2 similarity_scores = 1 - (valid_distances**2) / 2 # Filter by threshold threshold_mask = similarity_scores >= similarity_threshold final_indices = valid_indices[threshold_mask] final_scores = similarity_scores[threshold_mask] if len(final_indices) == 0: return [] # Batch retrieve book data # We use iloc[final_indices] to get all rows at once recommended_books_df = self.book_data.iloc[final_indices] recommendations = [] # Iterate over the subset DataFrame and the corresponding scores for (idx, row), score in zip(recommended_books_df.iterrows(), final_scores): recommendations.append( { "id": row["id"], "title": row["title"], "authors": row.get("authors", "N/A"), "description": row.get("description", ""), "genres": row.get("genres", ""), "tags": row.get("tags", ""), "rating": row.get("rating", "N/A"), "cover_image_url": row.get("cover_image_url", None), "similarity": float(score), } ) return recommendations def get_recommendations( self, title: str, top_k: int = 5, similarity_threshold: float = config.MIN_SIMILARITY_THRESHOLD, ) -> List[Dict]: """ Finds and returns top_k book recommendations for a given title using FAISS. Args: title (str): The title of the book to get recommendations for. top_k (int): The number of recommendations to return. similarity_threshold (float): The minimum similarity score for a book to be considered a recommendation. Returns: A list of dictionaries, where each dictionary contains the details of a recommended book (title, authors, similarity, etc.). Returns an empty list if the title is not found or no books meet the similarity threshold. """ book_index = self.title_to_index.get(title.lower()) if book_index is None: logger.warning(f"Title '{title}' not found in the dataset.") return [] book_vector = self.embeddings[book_index] recommendations = self.get_recommendations_from_vector( book_vector, top_k, similarity_threshold, ignore_index=book_index ) logger.info(f"Found {len(recommendations)} recommendations for '{title}'.") return recommendations if __name__ == "__main__": import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from src.book_recommender.core import config as config_main logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") if not (os.path.exists(config_main.PROCESSED_DATA_PATH) and os.path.exists(config_main.EMBEDDINGS_PATH)): print("Processed data or embeddings not found.") print("Please run 'python src/data_processor.py' and 'python src/embedder.py' first.") else: logger.info(f"Loading book metadata from {config_main.PROCESSED_DATA_PATH}...") book_data_df = pd.read_parquet(config_main.PROCESSED_DATA_PATH) logger.info(f"Loading book embeddings from {config_main.EMBEDDINGS_PATH}...") embeddings_arr = np.load(config_main.EMBEDDINGS_PATH) recommender = BookRecommender(book_data=book_data_df, embeddings=embeddings_arr) book_titles = recommender.book_data["title"].tolist() if book_titles: test_title = book_titles[0] print(f"--- Getting recommendations for: '{test_title}' ---") recs = recommender.get_recommendations(test_title, top_k=5) if recs: for i, rec in enumerate(recs): print(f"{i+1}. {rec['title']} by {rec['authors']} (Similarity: {rec['similarity']:.2f})") else: print("Could not find any recommendations.") else: print("No book titles found in the dataset.")