File size: 1,998 Bytes
cdb73a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import logging
import os
import pickle
import sys
import numpy as np
import pandas as pd

# Add project root: put the parent of this file's directory at the front of
# sys.path. This has to run before the `src.` imports below — presumably so
# the `src` package resolves when this file is executed directly as a script.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

import src.book_recommender.core.config as config
from src.book_recommender.ml.clustering import cluster_books, get_cluster_names

# Configure logging: INFO level, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

def precompute_clusters() -> None:
    """Cluster the book embeddings and cache the result on disk.

    Steps:
      1. Load the processed book table from ``config.PROCESSED_DATA_PATH``
         (parquet) and the embeddings from ``config.EMBEDDINGS_PATH``.
      2. Run ``cluster_books`` with ``config.NUM_CLUSTERS`` clusters.
      3. Store the cluster assignment in a ``cluster_id`` column and derive
         cluster names with ``get_cluster_names``.
      4. Pickle ``(clusters_arr, names, book_data_df)`` to
         ``cluster_cache.pkl`` inside ``config.PROCESSED_DATA_DIR``.

    If either input file is missing, an error is logged and the function
    returns without doing anything. An error while writing the cache is
    logged (with traceback) and not re-raised. Errors raised while loading
    or clustering are not handled here and propagate to the caller.

    The cache is first written to a temporary sibling file and only moved
    over ``cluster_cache.pkl`` once the dump has completed, so an interrupted
    or failed dump cannot leave a truncated cache (or clobber a previously
    valid one).
    """
    logger.info("--- Starting Cluster Pre-computation ---")

    # 1. Load Data
    required_paths = (config.PROCESSED_DATA_PATH, config.EMBEDDINGS_PATH)
    if not all(os.path.exists(path) for path in required_paths):
        logger.error("Data files missing. Run data processor and embedder first.")
        return

    logger.info("Loading book data from %s...", config.PROCESSED_DATA_PATH)
    book_data_df = pd.read_parquet(config.PROCESSED_DATA_PATH)

    logger.info("Loading embeddings from %s...", config.EMBEDDINGS_PATH)
    embeddings_arr = np.load(config.EMBEDDINGS_PATH)

    # 2. Cluster
    n_clusters = config.NUM_CLUSTERS
    logger.info("Clustering %s books into %s clusters...", len(book_data_df), n_clusters)

    # The second element of the returned pair is not needed here.
    clusters_arr, _ = cluster_books(embeddings_arr, n_clusters=n_clusters)

    # 3. Name Clusters
    book_data_df["cluster_id"] = clusters_arr
    names = get_cluster_names(book_data_df, clusters_arr)

    # 4. Save Cache
    cache_path = config.PROCESSED_DATA_DIR / "cluster_cache.pkl"
    # Temporary file lives next to the final one so os.replace stays on the
    # same filesystem.
    tmp_path = os.fspath(cache_path) + ".tmp"
    logger.info("Saving cache to %s...", cache_path)

    try:
        with open(tmp_path, "wb") as f:
            # Must match the tuple structure expected by api/dependencies.py
            # (clusters_arr, names, book_data_df)
            pickle.dump((clusters_arr, names, book_data_df), f)
        # Publish the finished file in one step; readers never see a
        # half-written pickle.
        os.replace(tmp_path, cache_path)
    except Exception as e:
        # Deliberately broad: saving is best-effort and callers of this
        # function have never had to handle exceptions from it.
        logger.exception("Failed to save cache: %s", e)
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass  # nothing was written (or it was already moved into place)
        except OSError as cleanup_err:
            logger.warning("Could not remove temporary file %s: %s", tmp_path, cleanup_err)
    else:
        logger.info("Successfully pre-computed and cached clusters.")

# Run the pre-computation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    precompute_clusters()