Spaces:
Sleeping
Sleeping
File size: 5,055 Bytes
"""
eval/evaluate.py
Orchestrator for the PC Pal evaluation framework.
Loads conversations, runs structural metrics, combines with rubric scores,
and produces a structured result per conversation.
"""
import json
import os
from pathlib import Path
from metrics import compute_all
# ---------------------------------------------------------------------------
# Loading
# ---------------------------------------------------------------------------
def _conversation_id(conv, fallback):
    """Return the id stored on *conv* ("id", then "conversation_id"), else *fallback*."""
    return conv.get("id") or conv.get("conversation_id") or fallback


def load_conversations(json_path=None):
    """
    Load conversations from a specific JSON file or from the data/conversations/ directory.

    The id of a conversation is resolved the same way on every path: its "id"
    field, then its "conversation_id" field, then a path-specific fallback.

    Parameters
    ----------
    json_path : str | Path | None
        Path to a JSON file holding one of:
          * a list of conversation dicts (fallback id: "conv-<n>", where n is
            the number of conversations collected so far),
          * a single conversation dict, recognised by its "turns" key
            (fallback id: "conv-0"),
          * a mapping of conversation_id -> conversation dict.
        Any other top-level JSON value yields an empty mapping.
        If None, every *.json file under data/conversations/ (resolved relative
        to the parent of this file's directory) is loaded, each file being
        treated as one conversation dict (fallback id: the file stem).

    Returns
    -------
    dict[str, dict]
        Mapping of conversation_id -> conversation dict. A later conversation
        with the same id replaces an earlier one.

    Raises
    ------
    FileNotFoundError
        If json_path does not exist, or if the data/conversations/ directory
        is missing or contains no *.json files.
    """
    conversations = {}

    if json_path is not None:
        json_path = Path(json_path)
        if not json_path.exists():
            raise FileNotFoundError(f"Conversation file not found: {json_path}")
        with json_path.open(encoding="utf-8") as fh:
            data = json.load(fh)

        if isinstance(data, list):
            for conv in data:
                cid = _conversation_id(conv, f"conv-{len(conversations)}")
                conversations[cid] = conv
        elif isinstance(data, dict):
            if "turns" in data:
                # A "turns" key marks the payload as one conversation rather
                # than an {id: conversation} mapping.
                conversations[_conversation_id(data, "conv-0")] = data
            else:
                conversations.update(data)
        return conversations

    # No explicit file: scan data/conversations/ next to the eval package.
    repo_root = Path(__file__).parent.parent
    data_dir = repo_root / "data" / "conversations"
    if not data_dir.exists():
        raise FileNotFoundError(f"data/conversations directory not found: {data_dir}")

    # Sorted so that load order (and therefore overwrite order) is deterministic.
    json_files = sorted(data_dir.glob("*.json"))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in {data_dir}")

    for jf in json_files:
        with jf.open(encoding="utf-8") as fh:
            conv = json.load(fh)
        conversations[_conversation_id(conv, jf.stem)] = conv
    return conversations
# ---------------------------------------------------------------------------
# Structural metrics
# ---------------------------------------------------------------------------
def run_structural(conversation):
    """Compute every structural metric for *conversation* by delegating to compute_all()."""
    structural_results = compute_all(conversation)
    return structural_results
# ---------------------------------------------------------------------------
# Per-conversation evaluation
# ---------------------------------------------------------------------------
def evaluate_conversation(conversation_id, conversation, rubric_scores):
    """
    Build the evaluation record for one conversation.

    Structural metrics are computed from the conversation, summarised together
    with the supplied rubric scores, and everything is bundled into one dict.

    Parameters
    ----------
    conversation_id : str
        Identifier reported in the result; also used as the display name when
        the conversation has no "name" entry.
    conversation : dict
        The conversation to evaluate.
    rubric_scores : dict[str, dict]
        e.g. {"Clarity": {"score": 5, "notes": "..."}, ...}

    Returns
    -------
    dict
        Keys: "conversation_id", "name", "structural_metrics",
        "rubric_scores" (passed through unchanged) and "summary".
    """
    metric_results = run_structural(conversation)
    overview = compute_summary(metric_results, rubric_scores)
    display_name = conversation.get("name", conversation_id)
    return dict(
        conversation_id=conversation_id,
        name=display_name,
        structural_metrics=metric_results,
        rubric_scores=rubric_scores,
        summary=overview,
    )
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
def compute_summary(structural_metrics, rubric_scores):
    """
    Compute a high-level summary from structural metric flags and rubric scores.

    Parameters
    ----------
    structural_metrics : list[dict]
        Output of compute_all(). Only entries with a truthy "flag" value
        contribute to the flag counts.
    rubric_scores : dict[str, dict | int | float] | None
        Either {"Clarity": {"score": 5}, ...} or {"Clarity": 5, ...}.
        Entries without a numeric score (for example a dict holding only
        "notes") are left out of the average instead of raising. None is
        treated as "no rubric scores".

    Returns
    -------
    dict
        "avg_rubric_score" : mean of the numeric scores rounded to one decimal,
                             or 0 when there are none
        "total_flags"      : number of raised flags
        "warnings"         : number of flags containing "WARNING"
        "criticals"        : number of flags containing "CRITICAL"
        "flag_details"     : the raised flags, in metric order
    """
    flags = [m["flag"] for m in structural_metrics if m.get("flag")]
    warning_count = sum(1 for f in flags if "WARNING" in f)
    critical_count = sum(1 for f in flags if "CRITICAL" in f)

    # Accept both {"score": n, "notes": "..."} and plain numeric entries;
    # anything that does not yield a number is skipped so that one unscored
    # criterion cannot abort the whole summary.
    rubric_values = []
    for entry in (rubric_scores or {}).values():
        score = entry.get("score") if isinstance(entry, dict) else entry
        if isinstance(score, (int, float)):
            rubric_values.append(score)

    if rubric_values:
        avg_rubric = round(sum(rubric_values) / len(rubric_values), 1)
    else:
        avg_rubric = 0

    return {
        "avg_rubric_score": avg_rubric,
        "total_flags": len(flags),
        "warnings": warning_count,
        "criticals": critical_count,
        "flag_details": flags,
    }
|