Spaces:

umaine
/

COS498-Group7

Sleeping

App Files Files Community

COS498-Group7 / eval /evaluate.py

izzicooki

Add PC Pal evaluation framework with structural metrics and rubric scoring

980f6ba about 1 month ago

raw

history blame contribute delete

5.06 kB

	"""
	eval/evaluate.py
	Orchestrator for the PC Pal evaluation framework.
	Loads conversations, runs structural metrics, combines with rubric scores,
	and produces a structured result per conversation.
	"""

	import json
	import os
	from pathlib import Path

	from metrics import compute_all


	# ---------------------------------------------------------------------------
	# Loading
	# ---------------------------------------------------------------------------

	def _resolve_conversation_id(conv, fallback):
	    """Return the id stored on *conv* ("id", then "conversation_id"), else *fallback*."""
	    return conv.get("id") or conv.get("conversation_id") or fallback


	def load_conversations(json_path=None):
	    """
	    Load conversations from a specific JSON file or from the data/conversations/ directory.

	    Parameters
	    ----------
	    json_path : str | Path | None
	        Path to a JSON file holding one of:
	        a list of conversation dicts, a single conversation dict (recognised by
	        its "turns" key), or a mapping of conversation_id -> conversation dict.
	        If None, every *.json file under data/conversations/ (resolved relative
	        to the parent of this file's directory) is loaded as one conversation.

	    Returns
	    -------
	    dict[str, dict]
	        Mapping of conversation_id -> conversation dict. The id is taken from
	        the conversation's "id" key, then its "conversation_id" key; when
	        neither is set a fallback is used ("conv-<n>" for file input, the file
	        stem for directory input).

	    Raises
	    ------
	    FileNotFoundError
	        If json_path does not exist, or if data/conversations/ is missing or
	        contains no *.json files.
	    """
	    conversations = {}

	    if json_path is not None:
	        json_path = Path(json_path)
	        if not json_path.exists():
	            raise FileNotFoundError(f"Conversation file not found: {json_path}")

	        with json_path.open(encoding="utf-8") as fh:
	            data = json.load(fh)

	        if isinstance(data, list):
	            for conv in data:
	                # Positional fallback: number of conversations stored so far.
	                cid = _resolve_conversation_id(conv, f"conv-{len(conversations)}")
	                conversations[cid] = conv
	        elif isinstance(data, dict):
	            if "turns" in data:
	                # A single conversation; resolve its id the same way as list items.
	                conversations[_resolve_conversation_id(data, "conv-0")] = data
	            else:
	                # Already an {id: conversation} mapping.
	                conversations.update(data)
	        # Any other top-level JSON value yields an empty mapping.
	        return conversations

	    # No explicit file: scan data/conversations/ next to the eval/ directory.
	    repo_root = Path(__file__).parent.parent
	    data_dir = repo_root / "data" / "conversations"
	    if not data_dir.exists():
	        raise FileNotFoundError(f"data/conversations directory not found: {data_dir}")

	    # Sorted so that the load order (and any id collisions) is deterministic.
	    json_files = sorted(data_dir.glob("*.json"))
	    if not json_files:
	        raise FileNotFoundError(f"No JSON files found in {data_dir}")

	    for jf in json_files:
	        with jf.open(encoding="utf-8") as fh:
	            conv = json.load(fh)
	        conversations[_resolve_conversation_id(conv, jf.stem)] = conv

	    return conversations


	# ---------------------------------------------------------------------------
	# Structural metrics
	# ---------------------------------------------------------------------------

	def run_structural(conversation):
	    """
	    Compute the structural metrics for a single conversation.

	    Delegates to ``compute_all`` from the ``metrics`` module and returns its
	    result unchanged.
	    """
	    metric_results = compute_all(conversation)
	    return metric_results


	# ---------------------------------------------------------------------------
	# Per-conversation evaluation
	# ---------------------------------------------------------------------------

	def evaluate_conversation(conversation_id, conversation, rubric_scores):
	    """
	    Build the evaluation record for one conversation.

	    Runs the structural metrics, summarises them together with the rubric
	    scores, and bundles everything into one dict.

	    Parameters
	    ----------
	    conversation_id : str
	        Identifier stored in the result; also the fallback display name.
	    conversation : dict
	        Conversation to evaluate. Its optional "name" key is used as the
	        display name.
	    rubric_scores : dict[str, dict]
	        e.g. {"Clarity": {"score": 5, "notes": "..."}, ...}. Stored in the
	        result as given.

	    Returns
	    -------
	    dict
	        Keys: "conversation_id", "name", "structural_metrics",
	        "rubric_scores", "summary".
	    """
	    metric_results = run_structural(conversation)
	    overview = compute_summary(metric_results, rubric_scores)
	    display_name = conversation.get("name", conversation_id)

	    return dict(
	        conversation_id=conversation_id,
	        name=display_name,
	        structural_metrics=metric_results,
	        rubric_scores=rubric_scores,
	        summary=overview,
	    )


	# ---------------------------------------------------------------------------
	# Summary
	# ---------------------------------------------------------------------------

	def compute_summary(structural_metrics, rubric_scores):
	    """
	    Compute a high-level summary from structural metric flags and rubric scores.

	    Parameters
	    ----------
	    structural_metrics : list[dict]
	        Output of compute_all(). Only each entry's optional "flag" value is
	        read; entries with a missing or falsy flag are ignored.
	    rubric_scores : dict[str, dict | int | float] | None
	        Either {"Clarity": {"score": 5, "notes": "..."}, ...} or
	        {"Clarity": 5, ...}. Entries without a numeric score (missing "score"
	        key, None, text, ...) are left out of the average in both formats.
	        None is treated as an empty mapping.

	    Returns
	    -------
	    dict
	        "avg_rubric_score" : mean of the numeric scores rounded to one
	            decimal place, or 0 when there are none.
	        "total_flags" : number of truthy flags.
	        "warnings" / "criticals" : number of flags containing "WARNING" /
	            "CRITICAL".
	        "flag_details" : the truthy flags, in input order.
	    """
	    flags = [m["flag"] for m in structural_metrics if m.get("flag")]
	    warning_count = sum(1 for f in flags if "WARNING" in f)
	    critical_count = sum(1 for f in flags if "CRITICAL" in f)

	    # Normalise both rubric formats to a bare score, then keep only numbers.
	    # Unscored dict entries are skipped just like non-numeric plain values,
	    # instead of raising KeyError / TypeError.
	    rubric_values = []
	    for entry in (rubric_scores or {}).values():
	        score = entry.get("score") if isinstance(entry, dict) else entry
	        if isinstance(score, (int, float)):
	            rubric_values.append(score)

	    if rubric_values:
	        avg_rubric = round(sum(rubric_values) / len(rubric_values), 1)
	    else:
	        avg_rubric = 0  # nothing scored yet

	    return {
	        "avg_rubric_score": avg_rubric,
	        "total_flags": len(flags),
	        "warnings": warning_count,
	        "criticals": critical_count,
	        "flag_details": flags,
	    }