Spaces:

umaine
/

COS498-Group7

Sleeping

App Files Files Community

COS498-Group7 / eval /run_eval.py

izzicooki

Add PC Pal evaluation framework with structural metrics and rubric scoring

980f6ba about 2 months ago

raw

history blame contribute delete

9.35 kB

	"""
	eval/run_eval.py
	CLI runner for the PC Pal evaluation framework.

	Usage
	-----
	python eval/run_eval.py # Interactive mode
	python eval/run_eval.py --precomputed # Use precomputed rubric scores
	python eval/run_eval.py --conversation conv-xxx # Single conversation
	python eval/run_eval.py --data-dir data/conversations # Scan a directory
	python eval/run_eval.py --file eval/sample_conversations.json # Specific file
	"""

	import argparse
	import json
	import os
	import sys
	from datetime import datetime
	from pathlib import Path

	# Ensure the eval/ directory is on the path so we can import siblings
	_EVAL_DIR = Path(__file__).parent.resolve()
	sys.path.insert(0, str(_EVAL_DIR))

	from evaluate import load_conversations, evaluate_conversation # noqa: E402
	from rubrics import RUBRICS, PRECOMPUTED_SCORES # noqa: E402


	# ---------------------------------------------------------------------------
	# Output helpers
	# ---------------------------------------------------------------------------

	def _separator(char="-", width=70):
	    """Print a horizontal rule made of ``char`` repeated ``width`` times."""
	    rule = char * width
	    print(rule)


	def _print_table(results):
	    """Print a summary table with one row per evaluated conversation.

	    Parameters
	    ----------
	    results : iterable of dict
	        Each item must provide ``"conversation_id"`` and a ``"summary"``
	        mapping with the keys ``"avg_rubric_score"``, ``"total_flags"``,
	        ``"warnings"`` and ``"criticals"``.

	    Every cell is converted to text before the column width is applied, so
	    a missing value (``None``) is rendered as ``N/A`` instead of raising
	    ``TypeError`` from the width/alignment format spec.
	    """
	    def _cell(value):
	        # None rejects any non-empty format spec, so stringify first.
	        return "N/A" if value is None else str(value)

	    _separator("=")
	    print(f"{'ID':<30} {'Avg Rubric':>10} {'Flags':>6} {'Warn':>5} {'CRIT':>5}")
	    _separator()
	    for r in results:
	        s = r["summary"]
	        print(
	            f"{_cell(r['conversation_id']):<30} "
	            f"{_cell(s['avg_rubric_score']):>10} "
	            f"{_cell(s['total_flags']):>6} "
	            f"{_cell(s['warnings']):>5} "
	            f"{_cell(s['criticals']):>5}"
	        )
	    _separator("=")


	def _print_result_detail(result):
	    """Print the detailed report for a single evaluated conversation.

	    Parameters
	    ----------
	    result : dict
	        Must provide ``"name"``, ``"conversation_id"``,
	        ``"structural_metrics"`` (items with ``"metric"``, ``"value"`` and an
	        optional ``"flag"``), ``"rubric_scores"`` (mapping of rubric name to
	        either a dict with ``"score"``/``"notes"`` or a bare score) and
	        ``"summary"`` (with ``"avg_rubric_score"``, ``"total_flags"``,
	        ``"warnings"``, ``"criticals"`` and ``"flag_details"``).

	    One line is printed per entry in ``RUBRICS``; rubrics without a recorded
	    score are shown as ``N/A``.
	    """
	    _separator("=")
	    print(f"Conversation: {result['name']} ({result['conversation_id']})")
	    _separator()

	    print("\nStructural Metrics:")
	    for m in result["structural_metrics"]:
	        flag_str = f" [{m['flag']}]" if m.get("flag") else ""
	        print(f" {m['metric']:<25} {str(m['value']):<10}{flag_str}")

	    print("\nRubric Scores:")
	    rubric_scores = result["rubric_scores"]
	    for rubric in RUBRICS:
	        name = rubric["name"]
	        entry = rubric_scores.get(name)
	        notes_str = ""
	        if entry is None:
	            score_str = "N/A"
	        elif isinstance(entry, dict):
	            score_str = str(entry.get("score", "?"))
	            notes = entry.get("notes")
	            # Only append the separator when there is something to show;
	            # otherwise the line would end in a dangling dash.
	            if notes:
	                notes_str = f" — {notes}"
	        else:
	            score_str = str(entry)
	        print(f" {name:<20} {score_str}{notes_str}")

	    s = result["summary"]
	    print("\nSummary:")
	    print(f" Avg Rubric Score : {s['avg_rubric_score']}/5")
	    print(f" Total Flags : {s['total_flags']} (Warnings: {s['warnings']}, Criticals: {s['criticals']})")
	    if s["flag_details"]:
	        print(" Flag Details:")
	        for fd in s["flag_details"]:
	            print(f" - {fd}")
	    _separator("=")


	# ---------------------------------------------------------------------------
	# Interactive rubric scoring
	# ---------------------------------------------------------------------------

	def _prompt_rubric_scores(conversation_id, conversation):
	    """Collect a 1-5 score (plus optional notes) for every rubric via stdin.

	    For each entry in ``RUBRICS`` the name, description and scale are shown
	    and the user is re-prompted until the answer is either empty or an
	    integer from 1 to 5. An empty answer is recorded as score ``0`` with the
	    note ``"Not scored"``.

	    Returns a dict mapping rubric name to ``{"score": int, "notes": str}``.
	    The ``conversation`` argument is accepted but not used here.
	    """
	    print(f"\nScoring rubrics for: {conversation_id}")
	    print("Rate each dimension 1-5 (or press Enter to skip / mark as N/A).\n")

	    collected = {}
	    for rubric in RUBRICS:
	        dimension = rubric["name"]
	        description = rubric["description"]
	        scale_text = rubric["scale"]
	        print(f" {dimension}")
	        print(f" {description}")
	        print(f" Scale: {scale_text}")

	        answer = None
	        while answer is None:
	            typed = input(" Score (1-5): ").strip()
	            if not typed:
	                # Enter alone means "skip this dimension".
	                answer = {"score": 0, "notes": "Not scored"}
	                continue
	            try:
	                value = int(typed)
	            except ValueError:
	                print(" Please enter a valid integer.")
	                continue
	            if value < 1 or value > 5:
	                print(" Please enter a number between 1 and 5.")
	                continue
	            remark = input(" Notes (optional): ").strip()
	            answer = {"score": value, "notes": remark}
	        collected[dimension] = answer
	    return collected


	# ---------------------------------------------------------------------------
	# Save helpers
	# ---------------------------------------------------------------------------

	def _save_result(result, results_dir):
	    """Write one evaluation result to ``result-<conversation_id>.json``.

	    ``results_dir`` is created (including parents) when it does not exist.
	    The result is serialised as UTF-8 JSON with a two-space indent, and the
	    path of the written file is returned.
	    """
	    target_dir = Path(results_dir)
	    target_dir.mkdir(parents=True, exist_ok=True)
	    destination = target_dir.joinpath(f"result-{result['conversation_id']}.json")
	    with open(destination, "w", encoding="utf-8") as handle:
	        json.dump(result, handle, indent=2)
	    return destination


	def _save_summary(results, results_dir):
	    """Write a timestamped roll-up of all results to ``summary-<stamp>.json``.

	    The file records the timestamp (local time, ``YYYYMMDD-HHMMSS``), the
	    number of conversations evaluated, and for each result its
	    ``conversation_id``, ``name`` and ``summary``. ``results_dir`` is created
	    when missing. Returns the path of the written file.
	    """
	    target_dir = Path(results_dir)
	    target_dir.mkdir(parents=True, exist_ok=True)

	    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
	    destination = target_dir.joinpath(f"summary-{stamp}.json")

	    kept_fields = ("conversation_id", "name", "summary")
	    count = len(results)
	    entries = []
	    for item in results:
	        entries.append({field: item[field] for field in kept_fields})

	    payload = {
	        "timestamp": stamp,
	        "conversations_evaluated": count,
	        "results": entries,
	    }
	    with open(destination, "w", encoding="utf-8") as handle:
	        json.dump(payload, handle, indent=2)
	    return destination


	# ---------------------------------------------------------------------------
	# Main
	# ---------------------------------------------------------------------------

	def _load_conversations_from_dir(data_dir):
	    """Load one conversation per ``*.json`` file found directly in ``data_dir``.

	    Returns a dict mapping conversation ID to the parsed conversation. The ID
	    is the file's ``"id"`` field when present and truthy, otherwise the file
	    stem. Files whose top-level JSON value is not an object are skipped with
	    a warning on stderr.

	    Raises
	    ------
	    FileNotFoundError
	        If ``data_dir`` is not an existing directory.
	    json.JSONDecodeError
	        If a file is not valid JSON; the message names the offending file.
	    """
	    data_path = Path(data_dir)
	    if not data_path.is_dir():
	        raise FileNotFoundError(f"Data directory not found: {data_path}")

	    conversations = {}
	    for json_file in sorted(data_path.glob("*.json")):
	        with json_file.open(encoding="utf-8") as fh:
	            try:
	                conv = json.load(fh)
	            except json.JSONDecodeError as exc:
	                # Re-raise with the file name so the user knows what to fix.
	                raise json.JSONDecodeError(
	                    f"{json_file}: {exc.msg}", exc.doc, exc.pos
	                ) from exc
	        if not isinstance(conv, dict):
	            print(
	                f"WARNING: {json_file} does not contain a JSON object; skipping.",
	                file=sys.stderr,
	            )
	            continue
	        cid = conv.get("id") or json_file.stem
	        conversations[cid] = conv
	    return conversations


	def main():
	    """Run the PC Pal evaluation CLI.

	    Conversations come from, in order of precedence: ``--data-dir`` (one
	    conversation per ``*.json`` file), ``--file``, the bundled
	    ``sample_conversations.json`` next to this script if it exists, or
	    whatever ``load_conversations(None)`` provides. ``--conversation``
	    narrows the run to a single ID.

	    Rubric scores are taken from ``PRECOMPUTED_SCORES`` when ``--precomputed``
	    is given, otherwise they are collected interactively. Each result is
	    printed and saved to ``--results-dir``; a summary file is always written
	    and a summary table is printed when more than one conversation was
	    evaluated.

	    Exits with status 1 when the input cannot be read or parsed, when no
	    conversations are loaded, or when the requested conversation ID is
	    unknown.
	    """
	    parser = argparse.ArgumentParser(
	        description="PC Pal Evaluation Framework CLI",
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	        epilog=__doc__,
	    )
	    parser.add_argument(
	        "--precomputed",
	        action="store_true",
	        help="Use precomputed rubric scores from rubrics.py instead of prompting.",
	    )
	    parser.add_argument(
	        "--conversation",
	        metavar="ID",
	        help="Evaluate only the conversation with this ID.",
	    )
	    parser.add_argument(
	        "--data-dir",
	        metavar="DIR",
	        help="Directory to scan for *.json conversation files.",
	    )
	    parser.add_argument(
	        "--file",
	        metavar="FILE",
	        help="Specific JSON file containing conversations.",
	    )
	    parser.add_argument(
	        "--results-dir",
	        metavar="DIR",
	        default=str(_EVAL_DIR / "results"),
	        help="Directory to write result files (default: eval/results/).",
	    )
	    args = parser.parse_args()

	    # ---- Determine source of conversations ----
	    if args.data_dir and args.file:
	        # --data-dir has always taken effect when both are supplied; say so
	        # instead of silently ignoring --file.
	        print(
	            "WARNING: both --file and --data-dir were given; using --data-dir.",
	            file=sys.stderr,
	        )

	    try:
	        if args.data_dir:
	            conversations = _load_conversations_from_dir(args.data_dir)
	        else:
	            default_sample = _EVAL_DIR / "sample_conversations.json"
	            if args.file:
	                json_path = Path(args.file)
	            elif default_sample.exists():
	                json_path = default_sample
	            else:
	                json_path = None
	            conversations = load_conversations(json_path)
	    except (OSError, json.JSONDecodeError) as exc:
	        # OSError covers FileNotFoundError as well as permission/directory
	        # problems; malformed JSON is reported the same way.
	        print(f"ERROR: {exc}", file=sys.stderr)
	        sys.exit(1)

	    if not conversations:
	        print("No conversations loaded. Nothing to evaluate.", file=sys.stderr)
	        sys.exit(1)

	    # ---- Filter to single conversation if requested ----
	    if args.conversation:
	        if args.conversation not in conversations:
	            print(
	                f"ERROR: Conversation '{args.conversation}' not found. "
	                f"Available: {list(conversations.keys())}",
	                file=sys.stderr,
	            )
	            sys.exit(1)
	        conversations = {args.conversation: conversations[args.conversation]}

	    print("\nPC Pal Evaluation Framework")
	    print(f"Loaded {len(conversations)} conversation(s).")

	    results = []

	    for conv_id, conv in conversations.items():
	        print(f"\nEvaluating: {conv_id}")

	        if not args.precomputed:
	            rubric_scores = _prompt_rubric_scores(conv_id, conv)
	        elif conv_id in PRECOMPUTED_SCORES:
	            rubric_scores = PRECOMPUTED_SCORES[conv_id]
	            print(" Using precomputed rubric scores.")
	        else:
	            print(f" WARNING: No precomputed scores for '{conv_id}' — skipping rubric scoring.")
	            rubric_scores = {}

	        result = evaluate_conversation(conv_id, conv, rubric_scores)
	        _print_result_detail(result)

	        out_file = _save_result(result, args.results_dir)
	        print(f" Saved: {out_file}")

	        results.append(result)

	    # ---- Summary table ----
	    if len(results) > 1:
	        print("\nOverall Summary Table")
	        _print_table(results)

	    summary_file = _save_summary(results, args.results_dir)
	    print(f"\nSummary saved: {summary_file}")
	    print("Done.")


	# Entry point: run the CLI only when this file is executed as a script.
	if __name__ == "__main__":
	    main()