# COS498-Group7 / eval / run_eval.py
# izzicooki's picture
# Add PC Pal evaluation framework with structural metrics and rubric scoring
# 980f6ba
"""
eval/run_eval.py
CLI runner for the PC Pal evaluation framework.
Usage
-----
python eval/run_eval.py # Interactive mode
python eval/run_eval.py --precomputed # Use precomputed rubric scores
python eval/run_eval.py --conversation conv-xxx # Single conversation
python eval/run_eval.py --data-dir data/conversations # Scan a directory
python eval/run_eval.py --file eval/sample_conversations.json # Specific file
"""
import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path
# Ensure the eval/ directory is on the path so we can import siblings
_EVAL_DIR = Path(__file__).parent.resolve()
sys.path.insert(0, str(_EVAL_DIR))
from evaluate import load_conversations, evaluate_conversation # noqa: E402
from rubrics import RUBRICS, PRECOMPUTED_SCORES # noqa: E402
# ---------------------------------------------------------------------------
# Output helpers
# ---------------------------------------------------------------------------
def _separator(char="-", width=70):
    """Print a horizontal rule made of ``char`` repeated ``width`` times."""
    rule = char * width
    print(rule)
def _print_table(results):
    """Print a summary table with one row per evaluation result.

    Each element of ``results`` is read by key: ``conversation_id`` and a
    ``summary`` mapping providing ``avg_rubric_score``, ``total_flags``,
    ``warnings`` and ``criticals``.
    """
    _separator("=")
    print(f"{'ID':<30} {'Avg Rubric':>10} {'Flags':>6} {'Warn':>5} {'CRIT':>5}")
    _separator()
    for r in results:
        s = r["summary"]
        # Convert each cell with str() before padding. Applying an alignment
        # spec directly to a value such as None raises TypeError, whereas a
        # string always accepts it; numbers render the same either way.
        print(
            f"{str(r['conversation_id']):<30} "
            f"{str(s['avg_rubric_score']):>10} "
            f"{str(s['total_flags']):>6} "
            f"{str(s['warnings']):>5} "
            f"{str(s['criticals']):>5}"
        )
    _separator("=")
def _print_result_detail(result):
    """Print the detailed report for a single evaluated conversation.

    The report has three sections, in order: structural metrics (with the
    flag, if any, in brackets), one line per rubric listed in ``RUBRICS``,
    and the summary block including any flag details.

    ``result`` is read by key: ``name``, ``conversation_id``,
    ``structural_metrics``, ``rubric_scores`` and ``summary``.
    """
    _separator("=")
    print(f"Conversation: {result['name']} ({result['conversation_id']})")
    _separator()

    print("\nStructural Metrics:")
    for m in result["structural_metrics"]:
        flag_str = f" [{m['flag']}]" if m.get("flag") else ""
        print(f" {m['metric']:<25} {str(m['value']):<10}{flag_str}")

    print("\nRubric Scores:")
    rubric_scores = result["rubric_scores"]
    for rubric in RUBRICS:
        name = rubric["name"]
        entry = rubric_scores.get(name)
        if entry is None:
            # Rubric was never scored for this conversation.
            score_str = "N/A"
            notes_str = ""
        elif isinstance(entry, dict):
            score_str = str(entry.get("score", "?"))
            notes = entry.get("notes")
            # Notes are optional: only emit the separator when there is
            # something to show, so lines never end in a dangling dash.
            notes_str = f" — {notes}" if notes else ""
        else:
            # Bare score value without a notes wrapper.
            score_str = str(entry)
            notes_str = ""
        print(f" {name:<20} {score_str}{notes_str}")

    s = result["summary"]
    print("\nSummary:")
    print(f" Avg Rubric Score : {s['avg_rubric_score']}/5")
    print(f" Total Flags : {s['total_flags']} (Warnings: {s['warnings']}, Criticals: {s['criticals']})")
    if s["flag_details"]:
        print(" Flag Details:")
        for fd in s["flag_details"]:
            print(f" - {fd}")
    _separator("=")
# ---------------------------------------------------------------------------
# Interactive rubric scoring
# ---------------------------------------------------------------------------
def _prompt_rubric_scores(conversation_id, conversation):
    """Collect a score for every rubric in ``RUBRICS`` from standard input.

    For each rubric the name, description and scale are shown, then the
    user is asked repeatedly until the answer is either empty or an integer
    from 1 to 5. An empty answer is recorded as score 0 with the note
    "Not scored"; a valid score is followed by an optional notes prompt.

    Returns a dict mapping rubric name to ``{"score": ..., "notes": ...}``.
    ``conversation`` is accepted but not read by this function.
    """
    print(f"\nScoring rubrics for: {conversation_id}")
    print("Rate each dimension 1-5 (or press Enter to skip / mark as N/A).\n")

    collected = {}
    for dimension in RUBRICS:
        label = dimension["name"]
        description = dimension["description"]
        scale_text = dimension["scale"]
        print(f" {label}")
        print(f" {description}")
        print(f" Scale: {scale_text}")

        answer = None
        while answer is None:
            text = input(" Score (1-5): ").strip()
            if not text:
                # Empty input means the rubric is skipped.
                answer = {"score": 0, "notes": "Not scored"}
                continue
            try:
                number = int(text)
                if not 1 <= number <= 5:
                    print(" Please enter a number between 1 and 5.")
                    continue
                remark = input(" Notes (optional): ").strip()
            except ValueError:
                print(" Please enter a valid integer.")
                continue
            answer = {"score": number, "notes": remark}
        collected[label] = answer
    return collected
# ---------------------------------------------------------------------------
# Save helpers
# ---------------------------------------------------------------------------
def _save_result(result, results_dir):
    """Write one evaluation result to ``result-<conversation_id>.json``.

    ``results_dir`` is created (including parents) when missing. The result
    is serialized as JSON with a two-space indent, and the path of the
    written file is returned.
    """
    target_dir = Path(results_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = target_dir.joinpath(
        "result-{}.json".format(result["conversation_id"])
    )
    with open(destination, mode="w", encoding="utf-8") as sink:
        json.dump(result, sink, indent=2)
    return destination
def _save_summary(results, results_dir):
    """Write a timestamped roll-up of all results and return its path.

    The file is named ``summary-<YYYYmmdd-HHMMSS>.json`` (local time) inside
    ``results_dir``, which is created when missing. Only the
    ``conversation_id``, ``name`` and ``summary`` of each result are kept.
    """
    target_dir = Path(results_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    destination = target_dir / f"summary-{stamp}.json"

    payload = {
        "timestamp": stamp,
        "conversations_evaluated": len(results),
        "results": [],
    }
    for item in results:
        payload["results"].append(
            {
                "conversation_id": item["conversation_id"],
                "name": item["name"],
                "summary": item["summary"],
            }
        )

    with open(destination, mode="w", encoding="utf-8") as sink:
        json.dump(payload, sink, indent=2)
    return destination
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def _load_conversations_from_dir(data_dir):
    """Load every ``*.json`` file directly inside ``data_dir``.

    Each file is treated as one conversation, keyed by its ``"id"`` field or,
    when that is missing or empty, by the file name without extension. Files
    are processed in sorted order. A directory with no matching files yields
    an empty dict.

    Raises:
        ValueError: if a file cannot be parsed as JSON, or its top-level
            value is not a JSON object.
    """
    conversations = {}
    for json_file in sorted(Path(data_dir).glob("*.json")):
        try:
            with json_file.open(encoding="utf-8") as fh:
                conv = json.load(fh)
        except ValueError as exc:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError
            # subclasses; re-raise with the offending file name attached.
            raise ValueError(f"{json_file}: could not be parsed as JSON ({exc})") from exc
        if not isinstance(conv, dict):
            raise ValueError(
                f"{json_file}: expected a JSON object, got {type(conv).__name__}"
            )
        conversations[conv.get("id") or json_file.stem] = conv
    return conversations


def main():
    """Command-line entry point for the evaluation runner.

    Parses the CLI options, loads conversations, optionally narrows them to
    a single ID, obtains rubric scores (precomputed or interactive), runs
    ``evaluate_conversation`` on each, prints the details, and writes one
    result file per conversation plus a summary file.

    Conversation source, in priority order: ``--data-dir``, then ``--file``,
    then the bundled ``sample_conversations.json`` if it exists; otherwise
    ``load_conversations`` is called with ``None``.

    Exits with status 1 when loading fails, nothing is loaded, or the
    requested conversation ID is unknown.
    """
    parser = argparse.ArgumentParser(
        description="PC Pal Evaluation Framework CLI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--precomputed",
        action="store_true",
        help="Use precomputed rubric scores from rubrics.py instead of prompting.",
    )
    parser.add_argument(
        "--conversation",
        metavar="ID",
        help="Evaluate only the conversation with this ID.",
    )
    parser.add_argument(
        "--data-dir",
        metavar="DIR",
        help="Directory to scan for *.json conversation files.",
    )
    parser.add_argument(
        "--file",
        metavar="FILE",
        help="Specific JSON file containing conversations.",
    )
    parser.add_argument(
        "--results-dir",
        metavar="DIR",
        default=str(_EVAL_DIR / "results"),
        help="Directory to write result files (default: eval/results/).",
    )
    args = parser.parse_args()

    # ---- Determine source of conversations ----
    try:
        if args.data_dir:
            # --data-dir takes precedence over --file when both are given.
            conversations = _load_conversations_from_dir(args.data_dir)
        else:
            default_sample = _EVAL_DIR / "sample_conversations.json"
            if args.file:
                json_path = Path(args.file)
            elif default_sample.exists():
                json_path = default_sample
            else:
                json_path = None
            conversations = load_conversations(json_path)
    except (FileNotFoundError, ValueError) as exc:
        # ValueError covers malformed JSON, so a bad input file produces a
        # one-line error instead of a traceback.
        print(f"ERROR: {exc}", file=sys.stderr)
        sys.exit(1)

    if not conversations:
        print("No conversations loaded. Nothing to evaluate.", file=sys.stderr)
        sys.exit(1)

    # ---- Filter to single conversation if requested ----
    if args.conversation:
        if args.conversation not in conversations:
            print(
                f"ERROR: Conversation '{args.conversation}' not found. "
                f"Available: {list(conversations.keys())}",
                file=sys.stderr,
            )
            sys.exit(1)
        conversations = {args.conversation: conversations[args.conversation]}

    print("\nPC Pal Evaluation Framework")
    print(f"Loaded {len(conversations)} conversation(s).")

    results = []
    for conv_id, conv in conversations.items():
        print(f"\nEvaluating: {conv_id}")
        if args.precomputed:
            if conv_id not in PRECOMPUTED_SCORES:
                print(f" WARNING: No precomputed scores for '{conv_id}' — skipping rubric scoring.")
                rubric_scores = {}
            else:
                rubric_scores = PRECOMPUTED_SCORES[conv_id]
                print(" Using precomputed rubric scores.")
        else:
            rubric_scores = _prompt_rubric_scores(conv_id, conv)

        result = evaluate_conversation(conv_id, conv, rubric_scores)
        _print_result_detail(result)
        out_file = _save_result(result, args.results_dir)
        print(f" Saved: {out_file}")
        results.append(result)

    # ---- Summary table ----
    # The table is only useful when comparing several conversations; the
    # summary file is written regardless.
    if len(results) > 1:
        print("\nOverall Summary Table")
        _print_table(results)
    summary_file = _save_summary(results, args.results_dir)
    print(f"\nSummary saved: {summary_file}")
    print("Done.")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()