File size: 7,209 Bytes
cdb73a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import logging
import os
import sys
from typing import Dict, List, Optional

# Add the project root to the Python path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))

import faiss
import numpy as np
import pandas as pd

import src.book_recommender.core.config as config

logger = logging.getLogger(__name__)


class BookRecommender:
    """
    A content-based book recommender system that uses a FAISS index for
    efficient similarity search.

    This class encapsulates the logic for building a searchable index of book
    embeddings and retrieving recommendations based on semantic similarity.
    """

    def __init__(self, book_data: pd.DataFrame, embeddings: np.ndarray):
        """
        Initializes the recommender, builds the FAISS index, and prepares data.

        Args:
            book_data (pd.DataFrame): DataFrame containing book metadata.
                                      Must include 'title_lower' column for indexing.
            embeddings (np.ndarray): A 2D NumPy array of book embeddings whose
                                     second dimension must equal config.EMBEDDING_DIMENSION.

        Raises:
            ValueError: If the number of rows in book_data and embeddings differ,
                        or if the embeddings do not match the configured dimension.
        """
        if len(book_data) != len(embeddings):
            raise ValueError("Mismatch between number of books and number of embeddings.")
        # Fail fast with a clear message instead of an opaque FAISS assertion later.
        if embeddings.ndim != 2 or embeddings.shape[1] != config.EMBEDDING_DIMENSION:
            raise ValueError(
                f"Embeddings must be 2D with dimension {config.EMBEDDING_DIMENSION}."
            )

        self.book_data = book_data
        # astype() always copies, so the in-place normalization below does not
        # mutate the caller's array. FAISS requires float32.
        self.embeddings = embeddings.astype("float32")

        # With unit-length vectors, L2 distance is a monotone function of cosine
        # similarity: ||a-b||^2 = 2 - 2*cos(a, b).
        faiss.normalize_L2(self.embeddings)

        self.index = faiss.IndexFlatL2(config.EMBEDDING_DIMENSION)
        self.index.add(self.embeddings)

        # Map lowercased titles to positional row indices for O(1) lookup.
        self.title_to_index = pd.Series(self.book_data.index, index=self.book_data["title_lower"]).to_dict()
        logger.info(f"Recommender initialized with FAISS index containing {self.index.ntotal} vectors.")

    def get_recommendations_from_vector(
        self,
        vector: np.ndarray,
        top_k: int = config.DEFAULT_TOP_K,
        similarity_threshold: float = config.MIN_SIMILARITY_THRESHOLD,
        ignore_index: Optional[int] = None,
    ) -> List[Dict]:
        """
        Finds and returns top_k book recommendations for a given embedding vector.

        Args:
            vector (np.ndarray): The embedding vector to find recommendations for.
                                 Any float dtype is accepted; the input is not mutated.
            top_k (int): The number of recommendations to return.
            similarity_threshold (float): The minimum similarity score.
            ignore_index (int, optional): An index to ignore in the results (e.g., the query book itself).

        Returns:
            A list of up to top_k dictionaries, each containing details of a
            recommended book. Empty if no candidate meets the threshold.
        """
        # Copy + cast so faiss.normalize_L2 (in-place, float32-only) neither
        # mutates the caller's array nor fails on float64 input.
        query = np.ascontiguousarray(vector, dtype="float32").copy()
        if query.ndim == 1:
            query = query.reshape(1, -1)

        faiss.normalize_L2(query)

        # Request one extra candidate so that dropping ignore_index still
        # leaves up to top_k results.
        n_candidates = top_k + (1 if ignore_index is not None else 0)
        distances, indices = self.index.search(query, n_candidates)

        # Drop the query book itself, and the -1 padding FAISS emits when the
        # index holds fewer than n_candidates vectors. Results are sorted by
        # distance, so truncating to top_k keeps the best matches.
        valid_mask = (indices[0] != ignore_index) & (indices[0] >= 0)
        valid_indices = indices[0][valid_mask][:top_k]
        valid_distances = distances[0][valid_mask][:top_k]

        # IndexFlatL2 returns SQUARED L2 distances, so for unit vectors:
        # cosine_sim = 1 - squared_dist / 2. (The previous 1 - d**2/2 squared twice.)
        similarity_scores = 1 - valid_distances / 2

        # Filter by threshold
        threshold_mask = similarity_scores >= similarity_threshold
        final_indices = valid_indices[threshold_mask]
        final_scores = similarity_scores[threshold_mask]

        if len(final_indices) == 0:
            return []

        # Batch retrieve book data
        # We use iloc[final_indices] to get all rows at once
        recommended_books_df = self.book_data.iloc[final_indices]

        recommendations = []
        # Iterate over the subset DataFrame and the corresponding scores
        for (idx, row), score in zip(recommended_books_df.iterrows(), final_scores):
            recommendations.append(
                {
                    "id": row["id"],
                    "title": row["title"],
                    "authors": row.get("authors", "N/A"),
                    "description": row.get("description", ""),
                    "genres": row.get("genres", ""),
                    "tags": row.get("tags", ""),
                    "rating": row.get("rating", "N/A"),
                    "cover_image_url": row.get("cover_image_url", None),
                    "similarity": float(score),
                }
            )

        return recommendations

    def get_recommendations(
        self,
        title: str,
        top_k: int = 5,
        similarity_threshold: float = config.MIN_SIMILARITY_THRESHOLD,
    ) -> List[Dict]:
        """
        Finds and returns top_k book recommendations for a given title using FAISS.

        Args:
            title (str): The title of the book to get recommendations for.
                         Lookup is case-insensitive.
            top_k (int): The number of recommendations to return.
            similarity_threshold (float): The minimum similarity score for a book
                                          to be considered a recommendation.

        Returns:
            A list of dictionaries, where each dictionary contains the details
            of a recommended book (title, authors, similarity, etc.).
            Returns an empty list if the title is not found or no books meet
            the similarity threshold.
        """
        book_index = self.title_to_index.get(title.lower())

        if book_index is None:
            logger.warning(f"Title '{title}' not found in the dataset.")
            return []

        book_vector = self.embeddings[book_index]

        # Exclude the query book itself from its own recommendations.
        recommendations = self.get_recommendations_from_vector(
            book_vector, top_k, similarity_threshold, ignore_index=book_index
        )

        logger.info(f"Found {len(recommendations)} recommendations for '{title}'.")
        return recommendations


if __name__ == "__main__":
    # NOTE: `sys` is already imported at module level; the redundant inner
    # `import sys` has been removed.
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    from src.book_recommender.core import config as config_main

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    # Both artifacts must exist before the recommender can be built.
    if not (os.path.exists(config_main.PROCESSED_DATA_PATH) and os.path.exists(config_main.EMBEDDINGS_PATH)):
        print("Processed data or embeddings not found.")
        print("Please run 'python src/data_processor.py' and 'python src/embedder.py' first.")
    else:
        logger.info(f"Loading book metadata from {config_main.PROCESSED_DATA_PATH}...")
        book_data_df = pd.read_parquet(config_main.PROCESSED_DATA_PATH)

        logger.info(f"Loading book embeddings from {config_main.EMBEDDINGS_PATH}...")
        embeddings_arr = np.load(config_main.EMBEDDINGS_PATH)

        recommender = BookRecommender(book_data=book_data_df, embeddings=embeddings_arr)

        # Smoke-test the recommender with the first title in the dataset.
        book_titles = recommender.book_data["title"].tolist()
        if book_titles:
            test_title = book_titles[0]
            print(f"--- Getting recommendations for: '{test_title}' ---")
            recs = recommender.get_recommendations(test_title, top_k=5)

            if recs:
                for i, rec in enumerate(recs):
                    print(f"{i+1}. {rec['title']} by {rec['authors']} (Similarity: {rec['similarity']:.2f})")
            else:
                print("Could not find any recommendations.")
        else:
            print("No book titles found in the dataset.")