Spaces:
Sleeping
Sleeping
| """ | |
| eval/run_eval.py | |
| CLI runner for the PC Pal evaluation framework. | |
| Usage | |
| ----- | |
| python eval/run_eval.py # Interactive mode | |
| python eval/run_eval.py --precomputed # Use precomputed rubric scores | |
| python eval/run_eval.py --conversation conv-xxx # Single conversation | |
| python eval/run_eval.py --data-dir data/conversations # Scan a directory | |
| python eval/run_eval.py --file eval/sample_conversations.json # Specific file | |
| """ | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| from datetime import datetime | |
| from pathlib import Path | |
# Ensure the eval/ directory is on the path so we can import siblings.
# _EVAL_DIR is the absolute directory containing this file; it is inserted at
# the front of sys.path so it takes precedence over the other entries.
_EVAL_DIR = Path(__file__).parent.resolve()
sys.path.insert(0, str(_EVAL_DIR))
| from evaluate import load_conversations, evaluate_conversation # noqa: E402 | |
| from rubrics import RUBRICS, PRECOMPUTED_SCORES # noqa: E402 | |
| # --------------------------------------------------------------------------- | |
| # Output helpers | |
| # --------------------------------------------------------------------------- | |
def _separator(char="-", width=70):
    """Print a horizontal rule made of ``char`` repeated ``width`` times."""
    rule = char * width
    print(rule)
def _print_table(results):
    """Print a summary table of all evaluation results.

    Args:
        results: Iterable of result dicts, each with a ``"conversation_id"``
            key and a ``"summary"`` dict holding ``"avg_rubric_score"``,
            ``"total_flags"``, ``"warnings"`` and ``"criticals"``.
    """
    _separator("=")
    print(f"{'ID':<30} {'Avg Rubric':>10} {'Flags':>6} {'Warn':>5} {'CRIT':>5}")
    _separator()
    for r in results:
        s = r["summary"]
        # ``!s`` converts each cell to text before the alignment spec is
        # applied. Without it a None value (or any object that does not
        # implement format specs) raises TypeError instead of printing.
        print(
            f"{r['conversation_id']!s:<30} "
            f"{s['avg_rubric_score']!s:>10} "
            f"{s['total_flags']!s:>6} "
            f"{s['warnings']!s:>5} "
            f"{s['criticals']!s:>5}"
        )
    _separator("=")
def _print_result_detail(result):
    """Print the full report for one evaluated conversation.

    Output order: a header with the conversation name and ID, the structural
    metrics (any flag shown in brackets), one line per rubric in ``RUBRICS``
    order, then the summary block with its flag details.

    Args:
        result: Result dict with ``"name"``, ``"conversation_id"``,
            ``"structural_metrics"``, ``"rubric_scores"`` and ``"summary"``.
    """
    _separator("=")
    print(f"Conversation: {result['name']} ({result['conversation_id']})")
    _separator()

    print("\nStructural Metrics:")
    for metric in result["structural_metrics"]:
        suffix = ""
        if metric.get("flag"):
            suffix = f" [{metric['flag']}]"
        print(f" {metric['metric']:<25} {str(metric['value']):<10}{suffix}")

    print("\nRubric Scores:")
    scores_by_rubric = result["rubric_scores"]
    for rubric in RUBRICS:
        label = rubric["name"]
        entry = scores_by_rubric.get(label)
        # A rubric with no entry is reported as N/A; dict entries carry a
        # score plus notes; anything else is treated as a bare score.
        score_text, notes_text = "N/A", ""
        if isinstance(entry, dict):
            score_text = str(entry.get("score", "?"))
            notes_text = f" — {entry.get('notes', '')}"
        elif entry is not None:
            score_text = str(entry)
        print(f" {label:<20} {score_text}{notes_text}")

    summary = result["summary"]
    print("\nSummary:")
    print(f" Avg Rubric Score : {summary['avg_rubric_score']}/5")
    print(f" Total Flags : {summary['total_flags']} (Warnings: {summary['warnings']}, Criticals: {summary['criticals']})")
    flag_details = summary["flag_details"]
    if flag_details:
        print(" Flag Details:")
        for detail in flag_details:
            print(f" - {detail}")
    _separator("=")
| # --------------------------------------------------------------------------- | |
| # Interactive rubric scoring | |
| # --------------------------------------------------------------------------- | |
def _prompt_rubric_scores(conversation_id, conversation):
    """Ask the user on stdin to score every rubric in ``RUBRICS``.

    Args:
        conversation_id: ID shown in the prompt header.
        conversation: Accepted for interface compatibility; not read here.

    Returns:
        Dict mapping rubric name to ``{"score": int, "notes": str}``.
    """
    print(f"\nScoring rubrics for: {conversation_id}")
    print("Rate each dimension 1-5 (or press Enter to skip / mark as N/A).\n")

    collected = {}
    for rubric in RUBRICS:
        name, description, scale = rubric["name"], rubric["description"], rubric["scale"]
        print(f" {name}")
        print(f" {description}")
        print(f" Scale: {scale}")

        # Keep asking until the answer is blank or an integer in 1..5.
        while True:
            answer = input(" Score (1-5): ").strip()
            if not answer:
                # NOTE(review): a skipped rubric is stored as score 0 rather
                # than omitted — confirm how evaluate_conversation treats 0.
                collected[name] = {"score": 0, "notes": "Not scored"}
                break
            try:
                value = int(answer)
                if not 1 <= value <= 5:
                    print(" Please enter a number between 1 and 5.")
                    continue
                notes = input(" Notes (optional): ").strip()
                collected[name] = {"score": value, "notes": notes}
                break
            except ValueError:
                print(" Please enter a valid integer.")
    return collected
| # --------------------------------------------------------------------------- | |
| # Save helpers | |
| # --------------------------------------------------------------------------- | |
def _save_result(result, results_dir):
    """Write one result to ``result-<conversation_id>.json`` in ``results_dir``.

    The directory (including parents) is created when missing.

    Returns:
        Path of the file that was written.
    """
    target_dir = Path(results_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = target_dir / f"result-{result['conversation_id']}.json"
    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(result, handle, indent=2)
    return destination
def _save_summary(results, results_dir):
    """Write a timestamped ``summary-<YYYYmmdd-HHMMSS>.json`` for a run.

    Only the ``conversation_id``, ``name`` and ``summary`` of each result are
    kept. The directory (including parents) is created when missing.

    Returns:
        Path of the file that was written.
    """
    target_dir = Path(results_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    destination = target_dir / f"summary-{stamp}.json"

    payload = {"timestamp": stamp, "conversations_evaluated": len(results)}
    entries = []
    for item in results:
        entries.append(
            {
                "conversation_id": item["conversation_id"],
                "name": item["name"],
                "summary": item["summary"],
            }
        )
    payload["results"] = entries

    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2)
    return destination
| # --------------------------------------------------------------------------- | |
| # Main | |
| # --------------------------------------------------------------------------- | |
def _load_conversations_from_dir(data_dir):
    """Load every ``*.json`` file in ``data_dir`` into an ID-keyed dict.

    Each file is expected to hold a single conversation object. The key is
    the object's ``"id"`` value when present and truthy, otherwise the file
    stem. Files whose top-level JSON value is not an object are skipped with
    a warning on stderr.

    Raises:
        FileNotFoundError: if ``data_dir`` is not an existing directory.
        json.JSONDecodeError: if a file is not valid JSON (the message is
            prefixed with the offending file's path).
    """
    data_path = Path(data_dir)
    if not data_path.is_dir():
        # glob() on a missing directory just yields nothing, which would
        # surface later as a misleading "No conversations loaded" message.
        raise FileNotFoundError(f"Data directory not found: {data_path}")

    conversations = {}
    for json_file in sorted(data_path.glob("*.json")):
        with json_file.open(encoding="utf-8") as fh:
            try:
                conv = json.load(fh)
            except json.JSONDecodeError as exc:
                raise json.JSONDecodeError(
                    f"{json_file}: {exc.msg}", exc.doc, exc.pos
                ) from exc
        if not isinstance(conv, dict):
            print(
                f"WARNING: Skipping {json_file}: top-level JSON value is not an object.",
                file=sys.stderr,
            )
            continue
        conversations[conv.get("id") or json_file.stem] = conv
    return conversations


def main():
    """Run the evaluation CLI.

    Parses the command line, loads conversations (from ``--data-dir``, else
    ``--file``, else the bundled sample file when it exists), optionally
    narrows them to ``--conversation``, then scores, prints and saves each
    result. A summary table is printed when more than one conversation was
    evaluated, and a summary file is always written.

    Exits with status 1 when the input cannot be loaded, nothing was loaded,
    or the requested conversation ID is unknown.
    """
    parser = argparse.ArgumentParser(
        description="PC Pal Evaluation Framework CLI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--precomputed",
        action="store_true",
        help="Use precomputed rubric scores from rubrics.py instead of prompting.",
    )
    parser.add_argument(
        "--conversation",
        metavar="ID",
        help="Evaluate only the conversation with this ID.",
    )
    parser.add_argument(
        "--data-dir",
        metavar="DIR",
        help="Directory to scan for *.json conversation files.",
    )
    parser.add_argument(
        "--file",
        metavar="FILE",
        help="Specific JSON file containing conversations.",
    )
    parser.add_argument(
        "--results-dir",
        metavar="DIR",
        default=str(_EVAL_DIR / "results"),
        help="Directory to write result files (default: eval/results/).",
    )
    args = parser.parse_args()

    # ---- Determine source of conversations ----
    # --data-dir has always taken precedence over --file; say so instead of
    # dropping --file silently.
    if args.data_dir and args.file:
        print("WARNING: --file is ignored because --data-dir was given.", file=sys.stderr)

    try:
        if args.data_dir:
            conversations = _load_conversations_from_dir(args.data_dir)
        else:
            default_sample = _EVAL_DIR / "sample_conversations.json"
            if args.file:
                json_path = Path(args.file)
            elif default_sample.exists():
                json_path = default_sample
            else:
                # No explicit source: load_conversations decides what None means.
                json_path = None
            conversations = load_conversations(json_path)
    except (FileNotFoundError, json.JSONDecodeError) as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        sys.exit(1)

    if not conversations:
        print("No conversations loaded. Nothing to evaluate.", file=sys.stderr)
        sys.exit(1)

    # ---- Filter to single conversation if requested ----
    if args.conversation:
        if args.conversation not in conversations:
            print(
                f"ERROR: Conversation '{args.conversation}' not found. "
                f"Available: {list(conversations)}",
                file=sys.stderr,
            )
            sys.exit(1)
        conversations = {args.conversation: conversations[args.conversation]}

    print("\nPC Pal Evaluation Framework")
    print(f"Loaded {len(conversations)} conversation(s).")

    results = []
    for conv_id, conv in conversations.items():
        print(f"\nEvaluating: {conv_id}")
        if args.precomputed:
            if conv_id not in PRECOMPUTED_SCORES:
                print(f" WARNING: No precomputed scores for '{conv_id}' — skipping rubric scoring.")
                rubric_scores = {}
            else:
                rubric_scores = PRECOMPUTED_SCORES[conv_id]
                print(" Using precomputed rubric scores.")
        else:
            rubric_scores = _prompt_rubric_scores(conv_id, conv)

        result = evaluate_conversation(conv_id, conv, rubric_scores)
        _print_result_detail(result)
        out_file = _save_result(result, args.results_dir)
        print(f" Saved: {out_file}")
        results.append(result)

    # ---- Summary table ----
    if len(results) > 1:
        print("\nOverall Summary Table")
        _print_table(results)

    summary_file = _save_summary(results, args.results_dir)
    print(f"\nSummary saved: {summary_file}")
    print("Done.")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()