Spaces:

umaine
/

COS498-Group7

Sleeping

File size: 5,055 Bytes

980f6ba

"""
eval/evaluate.py
Orchestrator for the PC Pal evaluation framework.
Loads conversations, runs structural metrics, combines with rubric scores,
and produces a structured result per conversation.
"""

import json
import os
from pathlib import Path

from metrics import compute_all


# ---------------------------------------------------------------------------
# Loading
# ---------------------------------------------------------------------------

def _stored_conversation_id(conv):
    """Return the id recorded on *conv* ("id", then "conversation_id"), or None if neither is set."""
    return conv.get("id") or conv.get("conversation_id")


def load_conversations(json_path=None):
    """
    Load conversations from a specific JSON file or from the data/conversations/ directory.

    Parameters
    ----------
    json_path : str | Path | None
        Path to a JSON file.  The top-level value may be:

        * a list of conversation dicts,
        * a single conversation dict (recognised by its "turns" key), or
        * a dict that already maps conversation_id -> conversation.

        Any other top-level JSON value yields an empty mapping.
        If None, every *.json file directly under data/conversations/ (resolved
        relative to the parent of this file's directory) is loaded, each file
        being treated as one conversation.

    Returns
    -------
    dict[str, dict]
        Mapping of conversation_id -> conversation dict.  The id is taken from
        the conversation's "id" key, then its "conversation_id" key, and only
        then from a fallback: "conv-<n>" for list entries, "conv-0" for a single
        conversation file, or the file stem in directory mode.

    Raises
    ------
    FileNotFoundError
        If json_path does not exist, or (directory mode) if data/conversations/
        is missing or contains no *.json files.
    """
    conversations = {}

    if json_path is not None:
        json_path = Path(json_path)
        if not json_path.exists():
            raise FileNotFoundError(f"Conversation file not found: {json_path}")

        with json_path.open(encoding="utf-8") as fh:
            data = json.load(fh)

        if isinstance(data, list):
            for conv in data:
                cid = _stored_conversation_id(conv)
                if not cid:
                    # Positional fallback.  Advance past ids that are already
                    # taken so a generated id never overwrites a conversation
                    # loaded earlier (e.g. an explicit "conv-1" followed by an
                    # entry with no id).
                    index = len(conversations)
                    while f"conv-{index}" in conversations:
                        index += 1
                    cid = f"conv-{index}"
                conversations[cid] = conv
        elif isinstance(data, dict):
            if "turns" in data:
                # A single conversation: resolve its id the same way list
                # entries do, so "conversation_id" is honoured here as well.
                cid = _stored_conversation_id(data) or "conv-0"
                conversations[cid] = data
            else:
                # Already shaped as {conversation_id: conversation, ...}.
                conversations.update(data)
        return conversations

    # Scan data/conversations/
    repo_root = Path(__file__).parent.parent
    data_dir = repo_root / "data" / "conversations"
    if not data_dir.exists():
        raise FileNotFoundError(f"data/conversations directory not found: {data_dir}")

    # Sorted so the insertion order of the result is deterministic.
    json_files = sorted(data_dir.glob("*.json"))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in {data_dir}")

    for jf in json_files:
        with jf.open(encoding="utf-8") as fh:
            conv = json.load(fh)
        # Fall back to the file name (without extension) when the file carries no id.
        cid = _stored_conversation_id(conv) or jf.stem
        conversations[cid] = conv

    return conversations


# ---------------------------------------------------------------------------
# Structural metrics
# ---------------------------------------------------------------------------

def run_structural(conversation):
    """Compute every structural metric for *conversation* by delegating to compute_all()."""
    metric_results = compute_all(conversation)
    return metric_results


# ---------------------------------------------------------------------------
# Per-conversation evaluation
# ---------------------------------------------------------------------------

def evaluate_conversation(conversation_id, conversation, rubric_scores):
    """
    Build the evaluation record for one conversation.

    The structural metrics are computed first, then summarised together with
    the supplied rubric scores.

    Parameters
    ----------
    conversation_id : str
        Identifier reported in the result; also used as the display name when
        the conversation has no "name" key.
    conversation : dict
    rubric_scores : dict[str, dict]
        e.g. {"Clarity": {"score": 5, "notes": "..."}, ...}

    Returns
    -------
    dict
        Keys: "conversation_id", "name", "structural_metrics",
        "rubric_scores" (passed through unchanged) and "summary".
    """
    metric_results = run_structural(conversation)
    overview = compute_summary(metric_results, rubric_scores)

    result = dict(
        conversation_id=conversation_id,
        name=conversation.get("name", conversation_id),
    )
    result.update(
        structural_metrics=metric_results,
        rubric_scores=rubric_scores,
        summary=overview,
    )
    return result


# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------

def compute_summary(structural_metrics, rubric_scores):
    """
    Compute a high-level summary from structural metric flags and rubric scores.

    Parameters
    ----------
    structural_metrics : list[dict]
        Output of compute_all().  Only each entry's "flag" value is read;
        entries whose flag is missing or falsy are ignored.
    rubric_scores : dict[str, dict | int | float] | None
        Either {"Clarity": {"score": 5}, ...} or {"Clarity": 5, ...}; the two
        formats may be mixed.  Entries without a numeric score (for example a
        dict holding only "notes", or a score of None) are left out of the
        average.  None is treated as "no rubric scores".

    Returns
    -------
    dict
        "avg_rubric_score" : mean of the numeric scores rounded to one decimal,
            or 0 when there are none.
        "total_flags" : number of non-empty flags.
        "warnings" / "criticals" : number of flags containing the substring
            "WARNING" / "CRITICAL".  A flag containing both is counted in both;
            a flag containing neither is counted only in "total_flags".
        "flag_details" : the non-empty flags, in metric order.
    """
    flags = [m["flag"] for m in structural_metrics if m.get("flag")]
    warning_count = sum(1 for flag in flags if "WARNING" in flag)
    critical_count = sum(1 for flag in flags if "CRITICAL" in flag)

    # Support both {"score": n, "notes": "..."} and plain numeric formats.
    # The same numeric filter is applied to both, so an unscored dict entry is
    # skipped just like a non-numeric plain value instead of raising.
    rubric_values = []
    for entry in (rubric_scores or {}).values():
        score = entry.get("score") if isinstance(entry, dict) else entry
        if isinstance(score, (int, float)):
            rubric_values.append(score)

    avg_rubric = round(sum(rubric_values) / len(rubric_values), 1) if rubric_values else 0

    return {
        "avg_rubric_score": avg_rubric,
        "total_flags": len(flags),
        "warnings": warning_count,
        "criticals": critical_count,
        "flag_details": flags,
    }