Spaces:

Israelbliz
/

Recommendation-Agent

Running

File size: 6,414 Bytes

a971a56

"""End-to-end test of the Task B recommender on real data.

Picks a user, generates k=10 recommendations, and reports whether any of
the user's actual held-out test items appeared in the top-10 (Hit Rate).

This is the first time you'll see the Hit Rate signal — the same metric
judges score on. Real data, real test, real number.

Usage:
    python -m scripts.test_task_b
    python -m scripts.test_task_b --user <user_id>
    python -m scripts.test_task_b --cross-domain     # recommend in domains they haven't tried
    python -m scripts.test_task_b --cold-start        # use a synthetic Naija persona
"""
from __future__ import annotations

import argparse
import logging

import pandas as pd

from core.config import settings
from core.persona import PersonaEngine, UserPersona
from core.nigerian import naija_persona_examples
from task_b_recommender.agent import RecommendationAgent, detect_mode

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")


def _pick_cross_domain_user(train: pd.DataFrame, test: pd.DataFrame) -> str:
    """Pick a user with multi-domain history AND held-out test items."""
    users_in_test = set(test["user_id"])
    counts = (train.groupby("user_id")
                   .agg(n=("rating", "size"), d=("domain", "nunique"))
                   .reset_index())
    counts = counts[counts["user_id"].isin(users_in_test)]
    counts = counts[counts["d"] >= 2]
    if counts.empty:
        raise SystemExit("No cross-domain user has test reviews.")
    return counts.nlargest(1, "n").iloc[0]["user_id"]


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--user", type=str, default=None)
    ap.add_argument("--cross-domain", action="store_true",
                    help="Recommend in domains the user has NOT engaged with")
    ap.add_argument("--cold-start", action="store_true",
                    help="Use a synthetic Naija demo persona instead of a real user")
    ap.add_argument("--k", type=int, default=10)
    args = ap.parse_args()

    reviews_path = settings.processed_dir / "reviews.parquet"
    if not reviews_path.exists():
        raise SystemExit("Run `python data/prepare_data.py` first.")

    reviews = pd.read_parquet(reviews_path)
    train = reviews[reviews["split"] == "train"]
    test = reviews[reviews["split"] == "test"]

    # ── Build persona ──────────────────────────────────────────────────────
    if args.cold_start:
        demo = naija_persona_examples()[0]   # Tunde — Lagos software engineer
        print(f"Cold-start persona: {demo['name']}\n")
        print(f"Description: {demo['description']}\n")
        persona = UserPersona(
            user_id="custom_cold_start",
            n_reviews=0, avg_rating=4.0, std_rating=0.5,
            avg_review_length=80.0, std_review_length=20.0,
            verified_rate=1.0, domains=["Books"], n_domains=1,
            rating_distribution={4: 0.6, 5: 0.3, 3: 0.1},
            top_terms=[],
            tone="", preferred_themes=demo["stated_preferences"],
            common_complaints=demo["deal_breakers"],
            voice_one_liner=demo["description"],
            history_samples=[],
        )
        held_out = pd.DataFrame()
    else:
        user_id = args.user or _pick_cross_domain_user(train, test)
        if not args.user:
            print(f"Auto-selected cross-domain user: {user_id}\n")

        print(f"Building persona for {user_id}...")
        engine = PersonaEngine()
        persona = engine.from_dataframe(user_id, train)
        persona = engine.enrich(persona)

        held_out = test[test["user_id"] == user_id]
        print(f"User has {len(held_out)} held-out test items")

    # ── Generate recommendations ───────────────────────────────────────────
    mode = detect_mode(persona, requested_cross_domain=args.cross_domain)
    print(f"\nMode: {mode}")
    print(f"Generating {args.k} recommendations...\n")

    # use_review_enrichment=False → pure HyDE for cold-start, no aggregator.
    # This keeps the rerank prompt small and isolates HyDE's effect.
    agent = RecommendationAgent(use_review_enrichment=False)
    recs = agent.run(persona, k=args.k, cross_domain=args.cross_domain)

    # ── Display ────────────────────────────────────────────────────────────
    print("=" * 70)
    print("PERSONA")
    print("=" * 70)
    if persona.n_reviews > 0:
        print(f"User: {persona.user_id}")
        print(f"Avg rating: {persona.avg_rating:.2f}  Reviews: {persona.n_reviews}")
        print(f"Domains: {', '.join(persona.domains)}")
        print(f"Voice: {persona.voice_one_liner}")
    else:
        print(f"Cold-start: {persona.voice_one_liner}")

    print("\n" + "=" * 70)
    print(f"TOP {args.k} RECOMMENDATIONS")
    print("=" * 70)
    if not recs:
        print("(no recommendations returned)")
    held_out_ids = set(held_out["parent_asin"]) if not held_out.empty else set()
    hits = 0
    for r in recs:
        is_hit = r.item_id in held_out_ids
        marker = "  🎯 HIT" if is_hit else ""
        if is_hit:
            hits += 1
        print(f"\n#{r.rank}  [{r.domain}]  {r.title[:80]}{marker}")
        print(f"     Why: {r.reasoning}")

    # ── Hit Rate report ────────────────────────────────────────────────────
    if not held_out.empty:
        print("\n" + "=" * 70)
        print("EVALUATION (vs held-out test items)")
        print("=" * 70)
        print(f"User's actual held-out items: {len(held_out)}")
        print(f"Hits in top-{args.k}:           {hits}")
        print(f"Hit Rate@{args.k}:              {1.0 if hits > 0 else 0.0:.2f}")
        if hits > 0:
            print(f"\n🎉 The system recommended {hits} item(s) the user actually engaged with.")
        else:
            print(f"\n(No hits this run — held-out items are tiny needles in a 64k-item haystack.")
            print(f" Real evaluation will average across 500 users to get a stable score.)")

    print()


if __name__ == "__main__":
    main()