"""Quick verification — load processed data, build a persona, print it. Run after `python data/prepare_data.py` to confirm the persona engine works on your real Amazon data before we plug in the LLM agents. python scripts/verify_persona.py python scripts/verify_persona.py --enrich # also runs LLM enrichment python scripts/verify_persona.py --user # specific user """ from __future__ import annotations import argparse import logging import pandas as pd from core.config import settings from core.persona import PersonaEngine logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") def main(): ap = argparse.ArgumentParser() ap.add_argument("--user", type=str, default=None, help="Specific user_id; otherwise picks a cross-domain user with the most reviews") ap.add_argument("--enrich", action="store_true", help="Also run LLM qualitative enrichment (requires OPENAI_API_KEY)") args = ap.parse_args() reviews_path = settings.processed_dir / "reviews.parquet" if not reviews_path.exists(): raise SystemExit( f"Could not find {reviews_path}. Run `python data/prepare_data.py` first." ) reviews = pd.read_parquet(reviews_path) train = reviews[reviews["split"] == "train"] print(f"Loaded {len(train):,} training reviews across {train['user_id'].nunique():,} users\n") if args.user: user_id = args.user else: # Pick the cross-domain user with the most reviews counts = train.groupby("user_id").agg(n=("rating", "size"), d=("domain", "nunique")).reset_index() counts = counts[counts["d"] >= 2].sort_values("n", ascending=False) if counts.empty: user_id = train["user_id"].value_counts().idxmax() else: user_id = counts.iloc[0]["user_id"] print(f"Auto-selected cross-domain user: {user_id}\n") engine = PersonaEngine() if args.enrich else PersonaEngine.__new__(PersonaEngine) if not args.enrich: # Quantitative-only mode — skip LLM init engine.top_terms_k = 20 engine.history_samples_k = 8 engine.llm = None # type: ignore persona = engine.from_dataframe(user_id, train) if args.enrich: print("Running LLM enrichment...\n") persona = engine.enrich(persona) print("=" * 70) print(persona.to_prompt_block()) print("=" * 70) print(f"\nHistory samples ({len(persona.history_samples)}):") for i, s in enumerate(persona.history_samples[:3], 1): snippet = s["text"][:150].replace("\n", " ") print(f" [{i}] {s['rating']}★ ({s['domain']}): {snippet}...") if __name__ == "__main__": main()