import logging
import os
import pickle
import sys
from functools import lru_cache

# Add the project root to the Python path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from slowapi import Limiter
from slowapi.util import get_remote_address

import src.book_recommender.core.config as config
from src.book_recommender.core.exceptions import DataNotFoundError
from src.book_recommender.ml.clustering import (
    cluster_books,
    get_cluster_names,
)
from src.book_recommender.ml.embedder import (
    load_model as embedder_load_model,
)
from src.book_recommender.ml.recommender import BookRecommender

logger = logging.getLogger(__name__)

# Rate limiting keyed by client IP; applied per-route once attached to the app.
limiter = Limiter(key_func=get_remote_address, default_limits=["10/minute"])

CLUSTER_CACHE_PATH = config.PROCESSED_DATA_DIR / "cluster_cache.pkl"
MODEL_CACHE_PATH = config.PROCESSED_DATA_DIR / "model_cache"
# Memoize the loaded instance per process so repeated dependency calls are cheap.
@lru_cache(maxsize=1)
def get_recommender() -> BookRecommender:
    """Load and cache a BookRecommender instance (fast: reads preprocessed files)."""
    try:
        logger.info("Loading book data and embeddings...")
        book_data_df = pd.read_parquet(config.PROCESSED_DATA_PATH)
        embeddings_arr = np.load(config.EMBEDDINGS_PATH)
        recommender = BookRecommender(book_data=book_data_df, embeddings=embeddings_arr)
        logger.info(f"Recommender ready | {len(book_data_df)} books loaded")
        return recommender
    except FileNotFoundError as e:
        logger.error(f"Data file not found: {e}")
        raise DataNotFoundError(
            f"Missing data files. Check '{config.PROCESSED_DATA_PATH}' and '{config.EMBEDDINGS_PATH}'"
        ) from e
    except Exception as e:
        logger.error(f"Error initializing recommender: {e}")
        raise
# Memoized so the model object is constructed at most once per process.
@lru_cache(maxsize=1)
def get_sentence_transformer_model() -> SentenceTransformer:
    """
    Load the model via the centralized, robust loader in embedder.py.

    The loader checks for a local cache and downloads the model if it is missing.
    """
    logger.info("Requesting embedding model...")
    return embedder_load_model(config.EMBEDDING_MODEL)
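# Example (illustrative only, not executed at import time): embedding a
# free-text query with the cached model. encode() and normalize_embeddings
# are standard SentenceTransformer API; the query string is made up.
#
#     model = get_sentence_transformer_model()
#     query_vec = model.encode(["a sweeping fantasy epic"], normalize_embeddings=True)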
def get_clusters_data() -> tuple[np.ndarray, dict, pd.DataFrame]:
    """
    Get cluster data with mtime-based caching.

    Loads from the pickle cache when it is newer than the embeddings file;
    otherwise regenerates the clusters and rewrites the cache.
    """
    logger.info("Loading cluster data...")

    if CLUSTER_CACHE_PATH.exists():
        try:
            # The cache is valid only if it was written after the embeddings.
            cache_mtime = os.path.getmtime(CLUSTER_CACHE_PATH)
            embeddings_mtime = os.path.getmtime(config.EMBEDDINGS_PATH)
            if cache_mtime > embeddings_mtime:
                logger.info("Loading clusters from cache...")
                with open(CLUSTER_CACHE_PATH, "rb") as f:
                    clusters_arr, names, book_data_df = pickle.load(f)
                logger.info(f"Clusters loaded from cache | {len(names)} clusters")
                return clusters_arr, names, book_data_df
            logger.info("Cache outdated, regenerating...")
        except Exception as e:
            logger.warning(f"Cache load failed: {e}, regenerating...")

    logger.info("Generating clusters (this takes ~30 seconds)...")
    recommender = get_recommender()
    book_data_df = recommender.book_data.copy()
    embeddings_arr = recommender.embeddings

    clusters_arr, _ = cluster_books(embeddings_arr, n_clusters=config.NUM_CLUSTERS)
    book_data_df["cluster_id"] = clusters_arr
    names = get_cluster_names(book_data_df, clusters_arr)

    # Best-effort cache write: a failure here only costs regeneration time later.
    try:
        with open(CLUSTER_CACHE_PATH, "wb") as f:
            pickle.dump((clusters_arr, names, book_data_df), f)
        logger.info(f"Clusters cached to {CLUSTER_CACHE_PATH}")
    except Exception as e:
        logger.warning(f"Failed to cache clusters: {e}")

    logger.info(f"Clusters ready | {len(names)} clusters generated")
    return clusters_arr, names, book_data_df
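
# ----------------------------------------------------------------------------
# Usage sketch (illustrative only): how these providers might be wired into a
# FastAPI app together with the slowapi limiter defined above. The endpoint
# path, the `recommend()` method on BookRecommender, and the uvicorn port are
# assumptions made for the example; the slowapi wiring itself follows the
# library's documented pattern.
if __name__ == "__main__":
    import uvicorn
    from fastapi import Depends, FastAPI, Request
    from slowapi import _rate_limit_exceeded_handler
    from slowapi.errors import RateLimitExceeded

    app = FastAPI()
    # slowapi expects the limiter on app.state plus a handler for 429 responses.
    app.state.limiter = limiter
    app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

    @app.get("/recommend")
    @limiter.limit("10/minute")  # per-route limit; default_limits also applies
    def recommend(
        request: Request,  # slowapi resolves the client IP from the Request
        title: str,
        recommender: BookRecommender = Depends(get_recommender),
    ):
        # `recommend` is a hypothetical method name on BookRecommender.
        return recommender.recommend(title)

    uvicorn.run(app, host="0.0.0.0", port=8000)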