Spaces:
Sleeping
Sleeping
| import sys | |
| import os | |
| sys.path.append(os.getcwd()) | |
| import pandas as pd | |
| import numpy as np | |
| import logging | |
| from tqdm import tqdm | |
| from src.services.recommend_service import RecommendationService | |
| from src.data.stores.metadata_store import metadata_store | |
| from src.core.diversity_metrics import compute_diversity_metrics | |
# Root-logger config for the eval run: timestamped, logger-name-tagged INFO output.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def _get_category(isbn: str) -> str:
    """Return the stripped simple category for *isbn*, or "Unknown".

    Looks the ISBN up in the metadata store. Defensive on two fronts:
    - the store returning None for an unknown ISBN (would otherwise raise
      AttributeError on ``.get``) — NOTE(review): assumed possible; confirm
      against ``metadata_store.get_book_metadata``'s contract;
    - an empty/None "simple_categories" value, which maps to "Unknown".
    """
    meta = metadata_store.get_book_metadata(str(isbn)) or {}
    return (meta.get("simple_categories", "") or "Unknown").strip()
def evaluate_baseline(sample_n=1000):
    """Evaluate the recommendation service on the held-out test set.

    Computes, over up to ``sample_n`` sampled users:
    - Hit Rate@10 (exact-ISBN match, relaxed to title match when the title
      map is available),
    - MRR@5 (strict: only ranks 1-5 contribute),
    - P3 diversity metrics on the top-10 (category coverage and ILSD).

    Results are only logged; nothing is returned.

    Args:
        sample_n: Maximum number of test users to evaluate. Falsy or
            >= len(test set) evaluates everyone. Sampling is seeded
            (random_state=42) so runs are comparable.
    """
    logger.info("Initializing Evaluation...")
    # 1. Load Test Data
    test_df = pd.read_csv('data/rec/test.csv')
    # Sample users
    if sample_n and sample_n < len(test_df):
        eval_df = test_df.sample(n=sample_n, random_state=42)
    else:
        eval_df = test_df
    logger.info(f"Evaluating on {len(eval_df)} users...")

    # 2. Init Service
    service = RecommendationService()
    service.load_resources()

    # Load ISBN -> Title map so different editions of the same title can
    # count as a hit (relaxed evaluation). Best-effort: on failure we fall
    # back to exact ISBN matching only.
    isbn_to_title = {}
    try:
        books_df = pd.read_csv('data/books_processed.csv', usecols=['isbn13', 'title'])
        # ISBNs parsed as floats stringify with a trailing ".0" — strip it.
        books_df['isbn13'] = books_df['isbn13'].astype(str).str.replace(r'\.0$', '', regex=True)
        isbn_to_title = pd.Series(books_df.title.values, index=books_df.isbn13.values).to_dict()
        logger.info("Loaded ISBN-Title map for relaxed evaluation.")
    except Exception as e:
        logger.warning(f"Could not load books for evaluation: {e}")

    # 3. Predict & Metric
    hits = 0
    mrr_sum = 0.0
    # P3: Diversity metrics (aggregate over all users)
    diversity_cov_sum = 0.0
    diversity_ilsd_sum = 0.0
    diversity_count = 0
    for idx, (_, row) in tqdm(enumerate(eval_df.iterrows()), total=len(eval_df), desc="Evaluating"):
        user_id = row['user_id']
        target_isbn = row['isbn']
        try:
            # We disable favorite filtering for evaluation to handle
            # potential data leakage in test set splits.
            recs = service.get_recommendations(user_id, top_k=50, filter_favorites=False)
            if not recs:
                if idx < 5:
                    logger.warning(f"Empty recs for user {user_id}")
                continue
            rec_isbns = [r[0] for r in recs]

            # Check Hit — 1. Exact ISBN match
            hit = False
            rank = -1
            if target_isbn in rec_isbns:
                rank = rec_isbns.index(target_isbn)
                hit = True

            # 2. Relaxed title match (only if exact match failed)
            if not hit:
                target_title = isbn_to_title.get(str(target_isbn), "").lower().strip()
                if target_title:
                    for r_idx, r_isbn in enumerate(rec_isbns):
                        r_title = isbn_to_title.get(str(r_isbn), "").lower().strip()
                        if r_title and r_title == target_title:
                            rank = r_idx
                            hit = True
                            break

            # P3: Diversity metrics on top-10 (accumulated for every user
            # with recommendations, hit or miss).
            if rec_isbns:
                d = compute_diversity_metrics(rec_isbns, _get_category, top_k=10)
                diversity_cov_sum += d["category_coverage"]
                diversity_ilsd_sum += d["ilsd"]
                diversity_count += 1

            if hit:
                # HR@10 — rank is 0-indexed, so ranks 0..9 count.
                if rank < 10:
                    hits += 1
                # MRR@5 (strict) — only 1-indexed ranks 1..5 contribute.
                if (rank + 1) <= 5:
                    mrr_sum += 1.0 / (rank + 1)
            else:
                # Log a few misses for debugging type/format mismatches.
                if idx < 5:
                    logger.info(f"MISS USER {user_id}: Target {target_isbn} not in top {len(rec_isbns)} recs.")
                    logger.info(f"Top 5 Recs: {rec_isbns[:5]}")
                    logger.info(f"Type check - Target: {type(target_isbn)}, Recs: {type(rec_isbns[0]) if rec_isbns else 'N/A'}")
        except Exception as e:
            # Per-user failures must not abort the whole evaluation.
            logger.error(f"Error for user {user_id}: {e}")
            continue

    # 4. Report — guard all denominators so an empty eval set cannot raise
    # ZeroDivisionError (div_n was already guarded; users now too).
    n_users = max(len(eval_df), 1)
    hr_10 = hits / n_users
    mean_mrr = mrr_sum / n_users
    div_n = max(diversity_count, 1)
    logger.info("==============================")
    logger.info(" EVALUATION RESULTS (Strict)")
    logger.info("==============================")
    logger.info(f"Users Evaluated: {len(eval_df)}")
    logger.info(f"Hit Rate@10: {hr_10:.4f}")
    logger.info(f"MRR@5: {mean_mrr:.4f}")
    logger.info(f"P3 Category Coverage@10: {diversity_cov_sum / div_n:.4f}")
    logger.info(f"P3 ILSD@10: {diversity_ilsd_sum / div_n:.4f}")
    logger.info("==============================")
# Script entry point: small sample for a fast sanity check.
if __name__ == "__main__":
    evaluate_baseline(sample_n=500)  # Fast check