# deepshelf-api/src/book_recommender/ml/recommender.py
import logging
import os
import sys
from typing import Dict, List, Optional
# Add the project root to the Python path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
import faiss
import numpy as np
import pandas as pd
import src.book_recommender.core.config as config
logger = logging.getLogger(__name__)
class BookRecommender:
"""
A content-based book recommender system that uses a FAISS index for
efficient similarity search.
This class encapsulates the logic for building a searchable index of book
embeddings and retrieving recommendations based on semantic similarity.
"""
def __init__(self, book_data: pd.DataFrame, embeddings: np.ndarray):
"""
Initializes the recommender, builds the FAISS index, and prepares data.
Args:
book_data (pd.DataFrame): DataFrame containing book metadata.
Must include a 'title_lower' column used for title lookups.
embeddings (np.ndarray): A 2D NumPy array of book embeddings.
"""
if len(book_data) != len(embeddings):
raise ValueError("Mismatch between number of books and number of embeddings.")
# Reset to a positional index so DataFrame rows, the embeddings array, and FAISS ids stay aligned.
self.book_data = book_data.reset_index(drop=True)
self.embeddings = embeddings.astype("float32")
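# Normalize embeddings to unit length; for unit vectors, L2 distance is monotonically related to cosine similarity.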
faiss.normalize_L2(self.embeddings)
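# IndexFlatL2 performs exact (brute-force) nearest-neighbour search over all stored vectors.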
self.index = faiss.IndexFlatL2(config.EMBEDDING_DIMENSION)
self.index.add(self.embeddings)
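# Map lowercased titles to row positions for O(1) lookups in get_recommendations().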
self.title_to_index = pd.Series(self.book_data.index, index=self.book_data["title_lower"]).to_dict()
logger.info(f"Recommender initialized with FAISS index containing {self.index.ntotal} vectors.")
def get_recommendations_from_vector(
self,
vector: np.ndarray,
top_k: int = config.DEFAULT_TOP_K,
similarity_threshold: float = config.MIN_SIMILARITY_THRESHOLD,
ignore_index: Optional[int] = None,
) -> List[Dict]:
"""
Finds and returns top_k book recommendations for a given embedding vector.
Args:
vector (np.ndarray): The embedding vector to find recommendations for.
top_k (int): The number of recommendations to return.
similarity_threshold (float): The minimum cosine similarity a result must reach to be included.
ignore_index (int, optional): An index to ignore in the results (e.g., the query book itself).
Returns:
A list of dictionaries, each containing details of a recommended book.
"""
if vector.ndim == 1:
vector = vector.reshape(1, -1)
# FAISS expects a float32 array; work on a copy so the caller's vector is not normalized in place.
vector = np.array(vector, dtype="float32", copy=True)
faiss.normalize_L2(vector)
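# Ask FAISS for one extra neighbour when the query book itself will be filtered out below.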
distances, indices = self.index.search(vector, top_k + (1 if ignore_index is not None else 0))
# Drop any -1 padding (returned when the index holds fewer vectors than requested)
# and, if requested, the query book itself.
valid_mask = indices[0] != -1
if ignore_index is not None:
valid_mask &= indices[0] != ignore_index
valid_indices = indices[0][valid_mask]
valid_distances = distances[0][valid_mask]
# Calculate similarity scores
# FAISS IndexFlatL2 returns *squared* L2 distances; for unit vectors
# ||a - b||^2 = 2 - 2 * (a . b), so cosine_sim = 1 - squared_dist / 2.
similarity_scores = 1 - valid_distances / 2
# Filter by threshold, then cap at top_k in case the extra over-fetched
# neighbour survived the filtering above.
threshold_mask = similarity_scores >= similarity_threshold
final_indices = valid_indices[threshold_mask][:top_k]
final_scores = similarity_scores[threshold_mask][:top_k]
if len(final_indices) == 0:
return []
# Batch retrieve book data
# We use iloc[final_indices] to get all rows at once
recommended_books_df = self.book_data.iloc[final_indices]
recommendations = []
# Iterate over the subset DataFrame and the corresponding scores
for (idx, row), score in zip(recommended_books_df.iterrows(), final_scores):
recommendations.append(
{
"id": row["id"],
"title": row["title"],
"authors": row.get("authors", "N/A"),
"description": row.get("description", ""),
"genres": row.get("genres", ""),
"tags": row.get("tags", ""),
"rating": row.get("rating", "N/A"),
"cover_image_url": row.get("cover_image_url", None),
"similarity": float(score),
}
)
return recommendations
def get_recommendations(
self,
title: str,
top_k: int = config.DEFAULT_TOP_K,
similarity_threshold: float = config.MIN_SIMILARITY_THRESHOLD,
) -> List[Dict]:
"""
Finds and returns top_k book recommendations for a given title using FAISS.
Args:
title (str): The title of the book to get recommendations for.
top_k (int): The number of recommendations to return.
similarity_threshold (float): The minimum similarity score for a book
to be considered a recommendation.
Returns:
A list of dictionaries, where each dictionary contains the details
of a recommended book (title, authors, similarity, etc.).
Returns an empty list if the title is not found or no books meet
the similarity threshold.
"""
book_index = self.title_to_index.get(title.lower())
if book_index is None:
logger.warning(f"Title '{title}' not found in the dataset.")
return []
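# Stored embeddings are already L2-normalized, so the row can be used directly as a query vector.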
book_vector = self.embeddings[book_index]
recommendations = self.get_recommendations_from_vector(
book_vector, top_k, similarity_threshold, ignore_index=book_index
)
logger.info(f"Found {len(recommendations)} recommendations for '{title}'.")
return recommendations
if __name__ == "__main__":
# The project root was already added to sys.path at import time, so the
# top-level `config` import can be reused here directly.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
if not (os.path.exists(config.PROCESSED_DATA_PATH) and os.path.exists(config.EMBEDDINGS_PATH)):
print("Processed data or embeddings not found.")
print("Please run 'python src/data_processor.py' and 'python src/embedder.py' first.")
else:
logger.info(f"Loading book metadata from {config.PROCESSED_DATA_PATH}...")
book_data_df = pd.read_parquet(config.PROCESSED_DATA_PATH)
logger.info(f"Loading book embeddings from {config.EMBEDDINGS_PATH}...")
embeddings_arr = np.load(config.EMBEDDINGS_PATH)
recommender = BookRecommender(book_data=book_data_df, embeddings=embeddings_arr)
book_titles = recommender.book_data["title"].tolist()
if book_titles:
test_title = book_titles[0]
print(f"--- Getting recommendations for: '{test_title}' ---")
recs = recommender.get_recommendations(test_title, top_k=5)
if recs:
for i, rec in enumerate(recs):
print(f"{i+1}. {rec['title']} by {rec['authors']} (Similarity: {rec['similarity']:.2f})")
else:
print("Could not find any recommendations.")
else:
print("No book titles found in the dataset.")