File size: 5,055 Bytes
980f6ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
eval/evaluate.py
Orchestrator for the PC Pal evaluation framework.
Loads conversations, runs structural metrics, combines with rubric scores,
and produces a structured result per conversation.
"""

import json
import os
from pathlib import Path

from metrics import compute_all


# ---------------------------------------------------------------------------
# Loading
# ---------------------------------------------------------------------------

def _conversation_key(conv, fallback):
    """Return the id stored on *conv* ("id", then "conversation_id"), else *fallback*."""
    return conv.get("id") or conv.get("conversation_id") or fallback


def load_conversations(json_path=None):
    """
    Load conversations from a specific JSON file or from the data/conversations/ directory.

    The key for each conversation is resolved the same way on every path:
    its "id" field, then its "conversation_id" field, then a path-specific
    fallback ("conv-<n>" for list entries, "conv-0" for a single conversation
    file, the file stem for files found by the directory scan).

    Parameters
    ----------
    json_path : str | Path | None
        Path to a specific JSON file.  The file may contain a list of
        conversations, a single conversation dict (recognised by its "turns"
        key), or a dict mapping conversation ids to conversations.  Any other
        top-level JSON value yields an empty mapping.
        If None, all *.json files directly under data/conversations/ are
        loaded, each file being treated as one conversation dict.

    Returns
    -------
    dict[str, dict]
        Mapping of conversation_id -> conversation dict.

    Raises
    ------
    FileNotFoundError
        If json_path does not exist, or (when json_path is None) if the
        data/conversations directory is missing or contains no *.json files.
    """
    conversations = {}

    if json_path is not None:
        json_path = Path(json_path)
        if not json_path.exists():
            raise FileNotFoundError(f"Conversation file not found: {json_path}")

        with json_path.open(encoding="utf-8") as fh:
            data = json.load(fh)

        if isinstance(data, list):
            for conv in data:
                # The fallback uses the number of conversations stored so far.
                cid = _conversation_key(conv, f"conv-{len(conversations)}")
                conversations[cid] = conv
        elif isinstance(data, dict):
            if "turns" in data:
                # A "turns" key marks the dict as one conversation rather
                # than an {id: conversation} mapping.
                conversations[_conversation_key(data, "conv-0")] = data
            else:
                conversations.update(data)
        return conversations

    # No explicit file: scan data/conversations/ relative to the repo root,
    # taken to be the parent of the directory that holds this module.
    repo_root = Path(__file__).parent.parent
    data_dir = repo_root / "data" / "conversations"
    if not data_dir.exists():
        raise FileNotFoundError(f"data/conversations directory not found: {data_dir}")

    # Sorted so that the load order (and any id collisions) are deterministic.
    json_files = sorted(data_dir.glob("*.json"))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in {data_dir}")

    for jf in json_files:
        with jf.open(encoding="utf-8") as fh:
            conv = json.load(fh)
        conversations[_conversation_key(conv, jf.stem)] = conv

    return conversations


# ---------------------------------------------------------------------------
# Structural metrics
# ---------------------------------------------------------------------------

def run_structural(conversation):
    """Hand the conversation to compute_all() and pass back its structural metric results."""
    metric_results = compute_all(conversation)
    return metric_results


# ---------------------------------------------------------------------------
# Per-conversation evaluation
# ---------------------------------------------------------------------------

def evaluate_conversation(conversation_id, conversation, rubric_scores):
    """
    Build the evaluation record for one conversation.

    Runs the structural metrics, summarises them together with the rubric
    scores, and bundles everything into one dict.

    Parameters
    ----------
    conversation_id : str
        Identifier reported in the result; also used as the display name
        when the conversation has no "name" key.
    conversation : dict
        The conversation to evaluate.
    rubric_scores : dict[str, dict]
        e.g. {"Clarity": {"score": 5, "notes": "..."}, ...}

    Returns
    -------
    dict
        Keys: "conversation_id", "name", "structural_metrics",
        "rubric_scores" and "summary".
    """
    metric_results = run_structural(conversation)
    overview = compute_summary(metric_results, rubric_scores)

    result = {"conversation_id": conversation_id}
    result["name"] = conversation.get("name", conversation_id)
    result["structural_metrics"] = metric_results
    result["rubric_scores"] = rubric_scores
    result["summary"] = overview
    return result


# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------

def compute_summary(structural_metrics, rubric_scores):
    """
    Compute a high-level summary from structural metric flags and rubric scores.

    Parameters
    ----------
    structural_metrics : list[dict]
        Output of compute_all().  Only each entry's "flag" value is read;
        entries with a missing or falsy flag are ignored.
    rubric_scores : dict[str, dict | int | float] | None
        Either {"Clarity": {"score": 5}, ...} or {"Clarity": 5, ...}.
        Entries without a numeric score (missing "score" key, a None score,
        or any other non-numeric value) are left out of the average.
        None is treated as "no rubric scores".

    Returns
    -------
    dict
        "avg_rubric_score" : mean of the numeric scores rounded to one
            decimal place, or 0 when there are none.
        "total_flags" : number of non-empty flags.
        "warnings" / "criticals" : number of flags containing "WARNING" /
            "CRITICAL".
        "flag_details" : the non-empty flags, in metric order.
    """
    flags = [m["flag"] for m in structural_metrics if m.get("flag")]
    warning_count = sum(1 for f in flags if "WARNING" in f)
    critical_count = sum(1 for f in flags if "CRITICAL" in f)

    # Support both {"score": n, "notes": "..."} and plain numeric formats.
    # Anything that does not resolve to a number is skipped so one unscored
    # rubric cannot break the whole summary.
    numeric_scores = []
    for entry in (rubric_scores or {}).values():
        score = entry.get("score") if isinstance(entry, dict) else entry
        if isinstance(score, (int, float)):
            numeric_scores.append(score)

    if numeric_scores:
        avg_rubric = round(sum(numeric_scores) / len(numeric_scores), 1)
    else:
        avg_rubric = 0

    return {
        "avg_rubric_score": avg_rubric,
        "total_flags": len(flags),
        "warnings": warning_count,
        "criticals": critical_count,
        "flag_details": flags,
    }