"""Offline evaluation helpers for top-K recommenders.

Provides per-user ranking metrics (precision, recall, NDCG, hit rate), an
aggregate evaluation loop over a test set, and two list-level diagnostics
(catalogue coverage and intra-list diversity).
"""
import logging

import numpy as np
import pandas as pd

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
log = logging.getLogger(__name__)


def _mean_or_zero(values) -> float:
    """Return the arithmetic mean of *values* as a float, or 0.0 when empty."""
    # np.mean([]) returns NaN and emits a RuntimeWarning; the rest of this
    # module reports 0.0 for "nothing to measure", so follow that convention.
    return float(np.mean(values)) if len(values) > 0 else 0.0


def precision_at_k(recommended, relevant, k: int) -> float:
    """Return the fraction of the top-*k* recommendations that are relevant.

    The denominator is always *k*, even when fewer than *k* items were
    recommended. Returns 0.0 when *k* is not positive or *relevant* is empty.
    """
    if k <= 0 or len(relevant) == 0:
        return 0.0
    hits = sum(1 for item in recommended[:k] if item in relevant)
    return hits / k


def recall_at_k(recommended, relevant, k: int) -> float:
    """Return the fraction of relevant items found in the top-*k* list.

    Returns 0.0 when *k* is not positive or *relevant* is empty.
    """
    if k <= 0 or len(relevant) == 0:
        return 0.0
    hits = sum(1 for item in recommended[:k] if item in relevant)
    return hits / len(relevant)


def ndcg_at_k(recommended, relevant, k: int) -> float:
    """Return binary-relevance NDCG for the top-*k* recommendations.

    Each relevant item at zero-based rank ``i`` contributes
    ``1 / log2(i + 2)``. The ideal DCG assumes ``min(len(relevant), k)``
    relevant items packed at the top of the list. Returns 0.0 when *k* is
    not positive or *relevant* is empty.
    """
    if k <= 0 or len(relevant) == 0:
        return 0.0
    dcg = sum(
        1.0 / np.log2(rank + 2)
        for rank, item in enumerate(recommended[:k])
        if item in relevant
    )
    n_relevant = min(len(relevant), k)
    idcg = sum(1.0 / np.log2(rank + 2) for rank in range(n_relevant))
    return dcg / idcg if idcg > 0 else 0.0


def hit_rate_at_k(recommended, relevant, k: int) -> float:
    """Return 1.0 if any top-*k* recommendation is relevant, else 0.0.

    Returns 0.0 when *k* is not positive or *relevant* is empty, matching
    the other metric functions.
    """
    # Without this guard a negative k would slice from the end of the list.
    if k <= 0 or len(relevant) == 0:
        return 0.0
    return float(any(item in relevant for item in recommended[:k]))


def evaluate_model(model, test_df, train_df, k_values=None, n_users=None) -> dict:
    """Evaluate *model* on the test interactions and average metrics per K.

    Only users that appear in both *train_df* and *test_df* (column
    ``user_idx``) are evaluated. For each such user the model is asked once
    for ``max(k_values)`` recommendations via ``model.recommend(user, k=...)``
    and every metric is computed at each requested K against that user's set
    of test ``item_idx`` values.

    Args:
        model: Object exposing ``recommend(user_idx, k=...)`` returning a
            sliceable ranked collection of item ids.
        test_df: DataFrame with ``user_idx`` and ``item_idx`` columns.
        train_df: DataFrame with a ``user_idx`` column.
        k_values: Cut-offs to evaluate; defaults to ``[5, 10, 20]``.
        n_users: If given, evaluate only the first *n_users* eligible users.

    Returns:
        Dict with ``precision@K``, ``recall@K``, ``ndcg@K`` and ``hit_rate@K``
        for every K, plus ``n_users_evaluated``. Metrics are 0.0 when no user
        could be evaluated.
    """
    if k_values is None:
        k_values = [5, 10, 20]
    max_k = max(k_values)

    train_users = set(train_df["user_idx"].unique())
    test_users = test_df[test_df["user_idx"].isin(train_users)]["user_idx"].unique()
    if n_users is not None:
        test_users = test_users[:n_users]
    log.info(f"Evaluating {len(test_users):,} users at K={k_values}")

    ground_truth = (
        test_df[test_df["user_idx"].isin(test_users)]
        .groupby("user_idx")["item_idx"]
        .apply(set)
        .to_dict()
    )

    metric_fns = {
        "precision": precision_at_k,
        "recall": recall_at_k,
        "ndcg": ndcg_at_k,
        "hit_rate": hit_rate_at_k,
    }
    results = {k: {name: [] for name in metric_fns} for k in k_values}
    skipped = 0

    for user_idx in test_users:
        relevant = ground_truth.get(user_idx, set())
        if not relevant:
            skipped += 1
            continue
        try:
            recs = model.recommend(user_idx, k=max_k)
        except Exception:
            # Best-effort: a user the model cannot score is skipped rather
            # than aborting the run, but the cause is kept for debugging.
            log.debug("recommend() failed for user %s", user_idx, exc_info=True)
            skipped += 1
            continue
        for k in k_values:
            for name, metric in metric_fns.items():
                results[k][name].append(metric(recs, relevant, k))

    if skipped > 0:
        log.warning(f"Skipped {skipped} users")

    summary = {}
    for k in k_values:
        for name in metric_fns:
            summary[f"{name}@{k}"] = _mean_or_zero(results[k][name])
    summary["n_users_evaluated"] = int(len(test_users) - skipped)
    return summary


def catalog_coverage(model, test_users, n_items, k=10) -> float:
    """Return the share of the catalogue recommended to at least one user.

    Collects the distinct items across every user's top-*k* list and divides
    by *n_items*. Users for which recommendation fails are ignored. Returns
    0.0 when *n_items* is not positive.
    """
    if n_items <= 0:
        return 0.0
    recommended_items = set()
    for user in test_users:
        try:
            recommended_items.update(model.recommend(user, k=k))
        except Exception:
            # Best-effort: skip users the model cannot handle.
            log.debug("recommend() failed for user %s", user, exc_info=True)
            continue
    return len(recommended_items) / n_items


def intra_list_diversity(model, test_users, item_feats, k=10, sample=200) -> float:
    """Return the mean pairwise popularity spread within recommendation lists.

    For each of the first *sample* users, looks up ``popularity_score`` for
    every recommended item (0.0 when the item is absent from *item_feats*)
    and averages the absolute difference over all unordered item pairs. The
    result is the mean of those per-user values, or 0.0 when no list with at
    least two items could be produced.

    Args:
        model: Object exposing ``recommend(user_idx, k=...)``.
        test_users: Iterable of user ids.
        item_feats: DataFrame with ``item_idx`` and ``popularity_score``.
        k: Length of each recommendation list.
        sample: Maximum number of users to inspect.
    """
    feat_map = item_feats.set_index("item_idx")["popularity_score"].to_dict()
    diversities = []
    for user in list(test_users)[:sample]:
        try:
            recs = model.recommend(user, k=k)
        except Exception:
            # Best-effort: skip users the model cannot handle.
            log.debug("recommend() failed for user %s", user, exc_info=True)
            continue
        scores = np.array([feat_map.get(item, 0.0) for item in recs])
        if len(scores) < 2:
            continue
        diffs = np.abs(scores[:, None] - scores[None, :])
        # Strict upper triangle: each unordered pair once, no self-pairs.
        diversities.append(diffs[np.triu_indices(len(scores), k=1)].mean())
    return _mean_or_zero(diversities)