"""
eval/run_eval.py

CLI runner for the PC Pal evaluation framework.

Usage
-----
    python eval/run_eval.py                                     # Interactive mode
    python eval/run_eval.py --precomputed                       # Use precomputed rubric scores
    python eval/run_eval.py --conversation conv-xxx             # Single conversation
    python eval/run_eval.py --data-dir data/conversations       # Scan a directory
    python eval/run_eval.py --file eval/sample_conversations.json  # Specific file
"""

import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path

# Ensure the eval/ directory is on the path so we can import siblings
_EVAL_DIR = Path(__file__).parent.resolve()
sys.path.insert(0, str(_EVAL_DIR))

from evaluate import load_conversations, evaluate_conversation  # noqa: E402
from rubrics import RUBRICS, PRECOMPUTED_SCORES  # noqa: E402


# ---------------------------------------------------------------------------
# Output helpers
# ---------------------------------------------------------------------------

def _separator(char="-", width=70):
    """Print a horizontal rule made of ``char`` repeated ``width`` times."""
    print(char * width)


def _print_table(results):
    """Print a summary table of all evaluation results.

    Each element of ``results`` is read for its ``conversation_id`` and the
    ``avg_rubric_score`` / ``total_flags`` / ``warnings`` / ``criticals``
    entries of its ``summary`` dict.
    """
    _separator("=")
    print(f"{'ID':<30} {'Avg Rubric':>10} {'Flags':>6} {'Warn':>5} {'CRIT':>5}")
    _separator()
    for r in results:
        s = r["summary"]
        # str() keeps the alignment identical for numbers while avoiding a
        # TypeError if the average is None (e.g. nothing was scored).
        print(
            f"{r['conversation_id']:<30} "
            f"{str(s['avg_rubric_score']):>10} "
            f"{s['total_flags']:>6} "
            f"{s['warnings']:>5} "
            f"{s['criticals']:>5}"
        )
    _separator("=")


def _print_result_detail(result):
    """Print detailed output for a single conversation.

    Shows the structural metrics (with any flag), one line per rubric in
    ``RUBRICS`` order, and the summary block including flag details.
    """
    _separator("=")
    print(f"Conversation: {result['name']} ({result['conversation_id']})")
    _separator()

    print("\nStructural Metrics:")
    for m in result["structural_metrics"]:
        flag_str = f"  [{m['flag']}]" if m.get("flag") else ""
        print(f"  {m['metric']:<25} {str(m['value']):<10}{flag_str}")

    print("\nRubric Scores:")
    rubric_scores = result["rubric_scores"]
    for rubric in RUBRICS:
        name = rubric["name"]
        entry = rubric_scores.get(name)
        if entry is None:
            # Rubric was never scored for this conversation.
            score_str = "N/A"
            notes_str = ""
        elif isinstance(entry, dict):
            score_str = str(entry.get("score", "?"))
            notes_str = f" — {entry.get('notes', '')}"
        else:
            # Bare score value without notes.
            score_str = str(entry)
            notes_str = ""
        print(f"  {name:<20} {score_str}{notes_str}")

    s = result["summary"]
    print("\nSummary:")
    print(f"  Avg Rubric Score : {s['avg_rubric_score']}/5")
    print(f"  Total Flags      : {s['total_flags']} (Warnings: {s['warnings']}, Criticals: {s['criticals']})")
    if s["flag_details"]:
        print("  Flag Details:")
        for fd in s["flag_details"]:
            print(f"    - {fd}")
    _separator("=")


# ---------------------------------------------------------------------------
# Interactive rubric scoring
# ---------------------------------------------------------------------------

def _prompt_rubric_scores(conversation_id, conversation):
    """Interactively ask the user to score each rubric dimension.

    Returns a dict mapping rubric name to ``{"score": int, "notes": str}``.
    ``conversation`` is accepted for interface symmetry but is not read here.
    """
    print(f"\nScoring rubrics for: {conversation_id}")
    print("Rate each dimension 1-5 (or press Enter to skip / mark as N/A).\n")
    scores = {}
    for rubric in RUBRICS:
        name = rubric["name"]
        desc = rubric["description"]
        scale = rubric["scale"]
        print(f"  {name}")
        print(f"    {desc}")
        print(f"    Scale: {scale}")
        # Re-prompt until the input is empty (skip) or an integer in 1..5.
        while True:
            raw = input("    Score (1-5): ").strip()
            if raw == "":
                # NOTE(review): a skipped rubric is recorded as score 0, not
                # omitted; confirm evaluate_conversation excludes 0 from the
                # average, otherwise skips drag the average down.
                scores[name] = {"score": 0, "notes": "Not scored"}
                break
            try:
                val = int(raw)
            except ValueError:
                print("    Please enter a valid integer.")
                continue
            if 1 <= val <= 5:
                notes = input("    Notes (optional): ").strip()
                scores[name] = {"score": val, "notes": notes}
                break
            print("    Please enter a number between 1 and 5.")
    return scores


# ---------------------------------------------------------------------------
# Save helpers
# ---------------------------------------------------------------------------

def _safe_filename_part(value):
    """Return ``value`` as text that is safe to embed in a file name.

    Anything other than alphanumerics, ``-``, ``_`` and ``.`` is replaced by
    ``_`` so an ID containing a path separator cannot escape the results dir.
    """
    return "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in str(value))


def _save_result(result, results_dir):
    """Write one evaluation result as JSON and return the output path.

    The directory is created if needed; the file is named
    ``result-<conversation_id>.json`` with the ID sanitised for file-name use.
    """
    results_dir = Path(results_dir)
    results_dir.mkdir(parents=True, exist_ok=True)
    out_file = results_dir / f"result-{_safe_filename_part(result['conversation_id'])}.json"
    with out_file.open("w", encoding="utf-8") as fh:
        json.dump(result, fh, indent=2)
    return out_file


def _save_summary(results, results_dir):
    """Write a timestamped summary of all results as JSON and return its path.

    The summary keeps only ``conversation_id``, ``name`` and ``summary`` of
    each result, plus the timestamp and the number of conversations.
    """
    results_dir = Path(results_dir)
    results_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    out_file = results_dir / f"summary-{timestamp}.json"
    summary_data = {
        "timestamp": timestamp,
        "conversations_evaluated": len(results),
        "results": [
            {
                "conversation_id": r["conversation_id"],
                "name": r["name"],
                "summary": r["summary"],
            }
            for r in results
        ],
    }
    with out_file.open("w", encoding="utf-8") as fh:
        json.dump(summary_data, fh, indent=2)
    return out_file


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def _load_conversations_from_dir(data_dir):
    """Load every ``*.json`` file in ``data_dir`` as one conversation each.

    Returns a dict keyed by the conversation's ``id`` field, falling back to
    the file stem. Files whose top-level JSON value is not an object are
    skipped with a warning.

    Raises:
        FileNotFoundError: if ``data_dir`` is not an existing directory.
        json.JSONDecodeError: if a file is not valid JSON (the message is
            prefixed with the offending file path).
    """
    data_path = Path(data_dir)
    if not data_path.is_dir():
        raise FileNotFoundError(f"Data directory not found: {data_path}")

    conversations = {}
    for json_file in sorted(data_path.glob("*.json")):
        with json_file.open(encoding="utf-8") as fh:
            try:
                conv = json.load(fh)
            except json.JSONDecodeError as exc:
                # Re-raise with the file name so the user knows which file is bad.
                raise json.JSONDecodeError(
                    f"{json_file}: {exc.msg}", exc.doc, exc.pos
                ) from exc
        if not isinstance(conv, dict):
            print(
                f"WARNING: Skipping {json_file}: expected a JSON object at top level.",
                file=sys.stderr,
            )
            continue
        cid = conv.get("id") or json_file.stem
        conversations[cid] = conv
    return conversations


def main():
    """Parse CLI arguments, evaluate the selected conversations, save results.

    Exits with status 1 when the input cannot be loaded, nothing is loaded,
    or the requested ``--conversation`` ID is unknown.
    """
    parser = argparse.ArgumentParser(
        description="PC Pal Evaluation Framework CLI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--precomputed",
        action="store_true",
        help="Use precomputed rubric scores from rubrics.py instead of prompting.",
    )
    parser.add_argument(
        "--conversation",
        metavar="ID",
        help="Evaluate only the conversation with this ID.",
    )
    parser.add_argument(
        "--data-dir",
        metavar="DIR",
        help="Directory to scan for *.json conversation files.",
    )
    parser.add_argument(
        "--file",
        metavar="FILE",
        help="Specific JSON file containing conversations.",
    )
    parser.add_argument(
        "--results-dir",
        metavar="DIR",
        default=str(_EVAL_DIR / "results"),
        help="Directory to write result files (default: eval/results/).",
    )
    args = parser.parse_args()

    # ---- Determine source of conversations ----
    # --data-dir takes precedence over --file; with neither, fall back to the
    # bundled sample file if present, else let load_conversations decide.
    default_sample = _EVAL_DIR / "sample_conversations.json"
    try:
        if args.data_dir:
            conversations = _load_conversations_from_dir(args.data_dir)
        else:
            if args.file:
                json_path = Path(args.file)
            elif default_sample.exists():
                json_path = default_sample
            else:
                json_path = None
            conversations = load_conversations(json_path)
    except (FileNotFoundError, json.JSONDecodeError) as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        sys.exit(1)

    if not conversations:
        print("No conversations loaded. Nothing to evaluate.", file=sys.stderr)
        sys.exit(1)

    # ---- Filter to single conversation if requested ----
    if args.conversation:
        if args.conversation not in conversations:
            print(
                f"ERROR: Conversation '{args.conversation}' not found. "
                f"Available: {list(conversations.keys())}",
                file=sys.stderr,
            )
            sys.exit(1)
        conversations = {args.conversation: conversations[args.conversation]}

    print("\nPC Pal Evaluation Framework")
    print(f"Loaded {len(conversations)} conversation(s).")

    results = []
    for conv_id, conv in conversations.items():
        print(f"\nEvaluating: {conv_id}")

        if args.precomputed:
            if conv_id not in PRECOMPUTED_SCORES:
                print(f"  WARNING: No precomputed scores for '{conv_id}' — skipping rubric scoring.")
                rubric_scores = {}
            else:
                rubric_scores = PRECOMPUTED_SCORES[conv_id]
                print("  Using precomputed rubric scores.")
        else:
            rubric_scores = _prompt_rubric_scores(conv_id, conv)

        result = evaluate_conversation(conv_id, conv, rubric_scores)
        _print_result_detail(result)

        out_file = _save_result(result, args.results_dir)
        print(f"  Saved: {out_file}")
        results.append(result)

    # ---- Summary table ----
    # The table is only useful with several rows; the summary file is always written.
    if len(results) > 1:
        print("\nOverall Summary Table")
        _print_table(results)

    summary_file = _save_summary(results, args.results_dir)
    print(f"\nSummary saved: {summary_file}")
    print("Done.")


if __name__ == "__main__":
    main()