# COS498-Group7 / eval/evaluate.py
# izzicooki's picture
# Add PC Pal evaluation framework with structural metrics and rubric scoring
# 980f6ba
"""
eval/evaluate.py
Orchestrator for the PC Pal evaluation framework.
Loads conversations, runs structural metrics, combines with rubric scores,
and produces a structured result per conversation.
"""
import json
import os
from pathlib import Path
from metrics import compute_all
# ---------------------------------------------------------------------------
# Loading
# ---------------------------------------------------------------------------
def _resolve_conversation_id(conv, fallback):
    """Return the conversation's "id" or "conversation_id" value, else *fallback*."""
    return conv.get("id") or conv.get("conversation_id") or fallback


def load_conversations(json_path=None):
    """
    Load conversations from a specific JSON file or from the data/conversations/ directory.

    Parameters
    ----------
    json_path : str | Path | None
        Path to a JSON file holding one of:

        * a list of conversation dicts,
        * a single conversation dict (recognised by a top-level "turns" key),
        * a dict mapping conversation_id -> conversation dict.

        If None, every *.json file in the data/conversations/ directory located
        two levels above this module is loaded. Each of those files may hold a
        single conversation dict or a list of conversation dicts.

    Returns
    -------
    dict[str, dict]
        Mapping of conversation_id -> conversation dict. When two conversations
        resolve to the same id, the one loaded later replaces the earlier one.

    Raises
    ------
    FileNotFoundError
        If json_path does not exist, or (when json_path is None) the
        data/conversations directory is missing or contains no *.json files.

    Notes
    -----
    The id of a conversation is its "id" value, then its "conversation_id"
    value, then a positional/file-based fallback. The same lookup order is used
    on every code path so a conversation gets the same id however it is loaded.
    """
    conversations = {}

    if json_path is not None:
        json_path = Path(json_path)
        if not json_path.exists():
            raise FileNotFoundError(f"Conversation file not found: {json_path}")
        with json_path.open(encoding="utf-8") as fh:
            data = json.load(fh)

        if isinstance(data, list):
            for conv in data:
                # Fallback id is based on how many conversations were stored so far.
                cid = _resolve_conversation_id(conv, f"conv-{len(conversations)}")
                conversations[cid] = conv
        elif isinstance(data, dict):
            if "turns" in data:
                # A top-level "turns" key marks the file as one conversation.
                conversations[_resolve_conversation_id(data, "conv-0")] = data
            else:
                # Otherwise the dict is already an {id: conversation} mapping.
                conversations.update(data)
        # Any other top-level JSON type yields an empty mapping.
        return conversations

    # No explicit file given: scan data/conversations/ next to the eval package.
    repo_root = Path(__file__).parent.parent
    data_dir = repo_root / "data" / "conversations"
    if not data_dir.exists():
        raise FileNotFoundError(f"data/conversations directory not found: {data_dir}")

    # Sorted so the load order (and thus overwrite order) is deterministic.
    json_files = sorted(data_dir.glob("*.json"))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in {data_dir}")

    for jf in json_files:
        with jf.open(encoding="utf-8") as fh:
            data = json.load(fh)
        if isinstance(data, list):
            # A file may bundle several conversations; id-less ones are named
            # after the file plus their position in the list.
            for index, conv in enumerate(data):
                cid = _resolve_conversation_id(conv, f"{jf.stem}-{index}")
                conversations[cid] = conv
        else:
            # Single conversation per file; the file stem is the last-resort id.
            conversations[_resolve_conversation_id(data, jf.stem)] = data
    return conversations
# ---------------------------------------------------------------------------
# Structural metrics
# ---------------------------------------------------------------------------
def run_structural(conversation):
    """
    Compute the structural metrics for one conversation.

    Thin wrapper that hands the conversation to ``compute_all`` and passes its
    return value back unchanged.
    """
    metric_results = compute_all(conversation)
    return metric_results
# ---------------------------------------------------------------------------
# Per-conversation evaluation
# ---------------------------------------------------------------------------
def evaluate_conversation(conversation_id, conversation, rubric_scores):
    """
    Build the evaluation record for a single conversation.

    Runs the structural metrics, summarises them together with the rubric
    scores, and packages everything into one dict.

    Parameters
    ----------
    conversation_id : str
        Identifier reported in the result; also used as the display name when
        the conversation has no "name" entry.
    conversation : dict
        The conversation to evaluate.
    rubric_scores : dict[str, dict]
        e.g. {"Clarity": {"score": 5, "notes": "..."}, ...}

    Returns
    -------
    dict
        Keys: "conversation_id", "name", "structural_metrics",
        "rubric_scores" (passed through as given) and "summary".
    """
    metric_results = run_structural(conversation)
    overview = compute_summary(metric_results, rubric_scores)
    display_name = conversation.get("name", conversation_id)

    result = {
        "conversation_id": conversation_id,
        "name": display_name,
        "structural_metrics": metric_results,
        "rubric_scores": rubric_scores,
        "summary": overview,
    }
    return result
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
def compute_summary(structural_metrics, rubric_scores):
    """
    Compute a high-level summary from structural metric flags and rubric scores.

    Parameters
    ----------
    structural_metrics : list[dict]
        Output of compute_all(). Only each entry's "flag" value is read;
        entries with a missing or falsy flag are ignored. None is treated as
        an empty list.
    rubric_scores : dict[str, dict | int | float]
        Either {"Clarity": {"score": 5}, ...} or {"Clarity": 5, ...}.
        Entries whose score is missing or not numeric (for example a criterion
        that carries only notes, or a score of None) are left out of the
        average instead of raising. None is treated as an empty dict.

    Returns
    -------
    dict
        "avg_rubric_score": mean of the numeric rubric scores rounded to one
        decimal place, or 0 when there are none;
        "total_flags": number of non-empty flags;
        "warnings" / "criticals": number of flags containing the text
        "WARNING" / "CRITICAL";
        "flag_details": the list of non-empty flags in input order.
    """
    flags = [m["flag"] for m in (structural_metrics or []) if m.get("flag")]
    warnings = [f for f in flags if "WARNING" in f]
    criticals = [f for f in flags if "CRITICAL" in f]

    # Normalise both supported shapes to a bare score, then apply one numeric
    # filter so dict-style and plain-style entries are treated identically.
    rubric_values = []
    for entry in (rubric_scores or {}).values():
        score = entry.get("score") if isinstance(entry, dict) else entry
        if isinstance(score, (int, float)):
            rubric_values.append(score)

    # Guard against division by zero when nothing was scored.
    avg_rubric = round(sum(rubric_values) / len(rubric_values), 1) if rubric_values else 0

    return {
        "avg_rubric_score": avg_rubric,
        "total_flags": len(flags),
        "warnings": len(warnings),
        "criticals": len(criticals),
        "flag_details": flags,
    }