File size: 5,256 Bytes
fe617ac
 
 
 
 
 
 
 
 
6ad997d
65b86c6
fe617ac
 
 
 
65b86c6
 
 
 
 
fe617ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71a564a
 
 
 
 
 
 
 
 
 
fe617ac
 
 
 
65b86c6
 
 
 
 
fe617ac
 
 
 
 
 
 
 
71a564a
65b86c6
 
 
fe617ac
 
 
 
 
 
 
 
71a564a
 
 
 
fe617ac
 
71a564a
 
 
 
 
 
 
 
 
 
 
 
 
 
65b86c6
 
 
 
 
 
 
71a564a
fe617ac
 
 
71a564a
fe617ac
 
65b86c6
fe617ac
71a564a
 
 
 
 
fe617ac
 
 
 
 
 
 
65b86c6
 
fe617ac
65b86c6
fe617ac
 
 
65b86c6
 
 
fe617ac
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import sys
import os
sys.path.append(os.getcwd())

import pandas as pd
import numpy as np
import logging
from tqdm import tqdm
from src.services.recommend_service import RecommendationService
from src.data.stores.metadata_store import metadata_store
from src.core.diversity_metrics import compute_diversity_metrics

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def _get_category(isbn: str) -> str:
    """Return the simple category for a book, falling back to "Unknown".

    Args:
        isbn: Book identifier; coerced to ``str`` before the metadata lookup.

    Returns:
        The stripped ``simple_categories`` value from the metadata store,
        or "Unknown" when the record is missing or has no category.
    """
    meta = metadata_store.get_book_metadata(str(isbn))
    # BUGFIX: the store may return None/{} for unseen ISBNs; the original
    # `meta.get(...)` would raise AttributeError on None.
    if not meta:
        return "Unknown"
    return (meta.get("simple_categories", "") or "Unknown").strip()

def evaluate_baseline(sample_n=1000):
    """Evaluate the recommendation service on the held-out test split.

    Reports HR@10, strict MRR@5, and P3 diversity metrics (category
    coverage and ILSD over the top-10) for a seeded random sample of
    test users. A hit is counted on an exact ISBN match or, failing
    that, a relaxed title-equality match (different edition, same book).

    Args:
        sample_n: Number of test rows to sample (random_state=42). Pass a
            falsy value or anything >= the test-set size to use all rows.
    """
    logger.info("Initializing Evaluation...")

    # 1. Load test data and optionally subsample for speed.
    test_df = pd.read_csv('data/rec/test.csv')
    if sample_n and sample_n < len(test_df):
        eval_df = test_df.sample(n=sample_n, random_state=42)
    else:
        eval_df = test_df
    logger.info(f"Evaluating on {len(eval_df)} users...")

    # 2. Init service and the ISBN -> title map used by the relaxed match.
    service = RecommendationService()
    service.load_resources()
    isbn_to_title = {}
    try:
        books_df = pd.read_csv('data/books_processed.csv', usecols=['isbn13', 'title'])
        # Strip the ".0" artifact pandas' numeric inference appends to ISBNs.
        books_df['isbn13'] = books_df['isbn13'].astype(str).str.replace(r'\.0$', '', regex=True)
        isbn_to_title = pd.Series(books_df.title.values, index=books_df.isbn13.values).to_dict()
        logger.info("Loaded ISBN-Title map for relaxed evaluation.")
    except Exception as e:
        # Best-effort: evaluation still runs with exact matching only.
        logger.warning(f"Could not load books for evaluation: {e}")

    # 3. Predict & accumulate metrics.
    hits = 0
    mrr_sum = 0.0
    # P3: Diversity metrics (aggregated over users with non-empty recs).
    diversity_cov_sum = 0.0
    diversity_ilsd_sum = 0.0
    diversity_count = 0

    for idx, (_, row) in tqdm(enumerate(eval_df.iterrows()), total=len(eval_df), desc="Evaluating"):
        user_id = row['user_id']
        target_isbn = row['isbn']

        try:
            # Favorite filtering is disabled: the test target may appear in
            # the user's history due to leakage in the split.
            recs = service.get_recommendations(user_id, top_k=50, filter_favorites=False)
            if not recs:
                if idx < 5:
                    logger.warning(f"Empty recs for user {user_id}")
                continue

            rec_isbns = [r[0] for r in recs]
            # BUGFIX: normalize both sides to str. The CSV can yield int
            # ISBNs while the service returns strings, which silently broke
            # the exact match (the relaxed path already str()-casts, and the
            # miss log's "Type check" line shows this mismatch was suspected).
            rec_keys = [str(r) for r in rec_isbns]
            target_key = str(target_isbn)

            hit = False
            rank = -1  # 0-indexed position of the target within the recs

            # 1. Exact ISBN match.
            if target_key in rec_keys:
                rank = rec_keys.index(target_key)
                hit = True

            # 2. Relaxed title match (only if the exact match failed).
            if not hit:
                target_title = isbn_to_title.get(target_key, "").lower().strip()
                if target_title:
                    for r_idx, r_key in enumerate(rec_keys):
                        r_title = isbn_to_title.get(r_key, "").lower().strip()
                        if r_title and r_title == target_title:
                            rank = r_idx
                            hit = True
                            break

            # P3: Diversity metrics on the top-10 (recs are non-empty here).
            d = compute_diversity_metrics(rec_isbns, _get_category, top_k=10)
            diversity_cov_sum += d["category_coverage"]
            diversity_ilsd_sum += d["ilsd"]
            diversity_count += 1

            if hit:
                if rank < 10:  # HR@10 (rank is 0-indexed)
                    hits += 1
                if (rank + 1) <= 5:  # MRR@5, strict: only ranks 1..5 contribute
                    mrr_sum += 1.0 / (rank + 1)
            elif idx < 5:
                # Debug the first few misses only, to keep the log readable.
                logger.info(f"MISS USER {user_id}: Target {target_isbn} not in top {len(rec_isbns)} recs.")
                logger.info(f"Top 5 Recs: {rec_isbns[:5]}")
                logger.info(f"Type check - Target: {type(target_isbn)}, Recs: {type(rec_isbns[0]) if rec_isbns else 'N/A'}")

        except Exception as e:
            # Per-user failures must not abort the whole evaluation run.
            logger.error(f"Error for user {user_id}: {e}")
            continue

    # 4. Report. Guard the denominators against an empty eval set.
    n_users = max(len(eval_df), 1)
    hr_10 = hits / n_users
    mean_mrr = mrr_sum / n_users
    div_n = max(diversity_count, 1)
    logger.info("==============================")
    logger.info("  EVALUATION RESULTS (Strict)")
    logger.info("==============================")
    logger.info(f"Users Evaluated: {len(eval_df)}")
    logger.info(f"Hit Rate@10:   {hr_10:.4f}")
    logger.info(f"MRR@5:         {mean_mrr:.4f}")
    logger.info(f"P3 Category Coverage@10: {diversity_cov_sum / div_n:.4f}")
    logger.info(f"P3 ILSD@10:              {diversity_ilsd_sum / div_n:.4f}")
    logger.info("==============================")

if __name__ == "__main__":
    # Small sample for a quick sanity run; raise sample_n for a full eval.
    evaluate_baseline(sample_n=500) # Fast check