File size: 3,980 Bytes
cdb73a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import logging
import os
import pickle
import sys
from functools import lru_cache

# Add the project root to the Python path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from slowapi import Limiter
from slowapi.util import get_remote_address

import src.book_recommender.core.config as config
from src.book_recommender.core.exceptions import DataNotFoundError
from src.book_recommender.ml.clustering import (
    cluster_books,
    get_cluster_names,
)
from src.book_recommender.ml.embedder import (
    load_model as embedder_load_model,
)
from src.book_recommender.ml.recommender import BookRecommender

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Shared slowapi rate limiter: requests are bucketed by the key returned from
# get_remote_address, with a default limit of 10 requests per minute.
limiter = Limiter(key_func=get_remote_address, default_limits=["10/minute"])

# Pickle file in which get_clusters_data() persists its clustering results.
CLUSTER_CACHE_PATH = config.PROCESSED_DATA_DIR / "cluster_cache.pkl"
# NOTE(review): not referenced by the functions in this module; presumably a
# location for a locally cached embedding model -- confirm against its users.
MODEL_CACHE_PATH = config.PROCESSED_DATA_DIR / "model_cache"


@lru_cache(maxsize=1)
def get_recommender() -> BookRecommender:
    """Load the book data and embeddings and build the shared BookRecommender.

    The result is memoized with ``lru_cache(maxsize=1)``, so the files are read
    only on the first successful call. ``lru_cache`` does not cache raised
    exceptions, so a failed call is retried on the next invocation.

    Returns:
        A BookRecommender built from the parquet file at
        ``config.PROCESSED_DATA_PATH`` and the array file at
        ``config.EMBEDDINGS_PATH``.

    Raises:
        DataNotFoundError: If a FileNotFoundError occurs while loading the data
            or constructing the recommender. The original error is attached as
            ``__cause__``.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        logger.info("Loading book data and embeddings...")
        book_data_df = pd.read_parquet(config.PROCESSED_DATA_PATH)
        embeddings_arr = np.load(config.EMBEDDINGS_PATH)

        recommender = BookRecommender(book_data=book_data_df, embeddings=embeddings_arr)
        logger.info(f"Recommender ready | {len(book_data_df)} books loaded")
        return recommender
    except FileNotFoundError as e:
        logger.error(f"Data file not found: {e}")
        # Chain explicitly so the traceback reports the missing file as the
        # direct cause rather than as an error "during handling" of another.
        raise DataNotFoundError(
            f"Missing data files. Check '{config.PROCESSED_DATA_PATH}' and '{config.EMBEDDINGS_PATH}'"
        ) from e
    except Exception as e:
        logger.error(f"Error initializing recommender: {e}")
        raise


@lru_cache(maxsize=1)
def get_sentence_transformer_model() -> SentenceTransformer:
    """Return the shared embedding model, loading it on first use.

    Delegates to the embedder module's ``load_model`` with
    ``config.EMBEDDING_MODEL``. That loader is documented as checking for a
    local cache and downloading the model if it is missing. The returned model
    is memoized, so later calls reuse the same instance.
    """
    logger.info("Requesting embedding model...")
    model = embedder_load_model(config.EMBEDDING_MODEL)
    return model


def _load_cluster_cache() -> "tuple[np.ndarray, dict, pd.DataFrame] | None":
    """Return the cached ``(clusters, names, book_data)`` tuple, or None if unusable.

    The cache is used only when the file exists and its modification time is
    strictly newer than that of the embeddings file. Any error while checking
    or reading it is logged as a warning and treated as a cache miss.
    """
    if not CLUSTER_CACHE_PATH.exists():
        return None

    try:
        cache_mtime = os.path.getmtime(CLUSTER_CACHE_PATH)
        embeddings_mtime = os.path.getmtime(config.EMBEDDINGS_PATH)

        if cache_mtime <= embeddings_mtime:
            logger.info("Cache outdated, regenerating...")
            return None

        logger.info("Loading clusters from cache...")
        # NOTE(security): pickle.load can execute arbitrary code. This is only
        # acceptable while the cache file is written solely by this application
        # and its directory is not writable by untrusted parties.
        with open(CLUSTER_CACHE_PATH, "rb") as f:
            clusters_arr, names, book_data_df = pickle.load(f)
        logger.info(f"Clusters loaded from cache | {len(names)} clusters")
        return clusters_arr, names, book_data_df
    except Exception as e:
        # Best effort: a corrupt or unreadable cache must never block startup.
        logger.warning(f"Cache load failed: {e}, regenerating...")
        return None


def _store_cluster_cache(payload: tuple) -> None:
    """Pickle ``payload`` to CLUSTER_CACHE_PATH; failures are logged, not raised."""
    try:
        with open(CLUSTER_CACHE_PATH, "wb") as f:
            pickle.dump(payload, f)
        logger.info(f"Clusters cached to {CLUSTER_CACHE_PATH}")
    except Exception as e:
        # Caching is an optimization only; the computed result is still returned.
        logger.warning(f"Failed to cache clusters: {e}")


@lru_cache(maxsize=1)
def get_clusters_data() -> tuple[np.ndarray, dict, pd.DataFrame]:
    """Return cluster assignments, cluster names and the annotated book data.

    A fresh on-disk cache is used when available. Otherwise the clusters are
    computed from the recommender's embeddings, written back to the cache on a
    best-effort basis, and returned. The result is also memoized in-process
    with ``lru_cache(maxsize=1)``.

    Returns:
        A tuple ``(clusters_arr, names, book_data_df)``. On the generation
        path, ``clusters_arr`` is the first element returned by
        ``cluster_books``, ``names`` is the result of ``get_cluster_names``, and
        ``book_data_df`` is a copy of the recommender's book data with a
        ``cluster_id`` column added. On a cache hit, it is whatever tuple was
        previously pickled.

    Raises:
        Exception: Whatever ``get_recommender``, ``cluster_books`` or
            ``get_cluster_names`` raise. Cache read and write errors are
            logged and swallowed.
    """
    logger.info("Loading cluster data...")

    cached = _load_cluster_cache()
    if cached is not None:
        return cached

    logger.info("Generating clusters (this takes ~30 seconds)...")
    recommender = get_recommender()
    # Copy so the added column does not mutate the recommender's own frame.
    book_data_df = recommender.book_data.copy()
    embeddings_arr = recommender.embeddings

    # cluster_books returns a pair; only the first element is needed here.
    clusters_arr, _ = cluster_books(embeddings_arr, n_clusters=config.NUM_CLUSTERS)
    book_data_df["cluster_id"] = clusters_arr
    names = get_cluster_names(book_data_df, clusters_arr)

    _store_cluster_cache((clusters_arr, names, book_data_df))

    logger.info(f"Clusters ready | {len(names)} clusters generated")
    return clusters_arr, names, book_data_df