File size: 4,044 Bytes
c8ebaee
 
 
 
 
 
ea03c8c
 
 
 
3816847
 
c8ebaee
 
 
 
 
 
 
ea03c8c
3816847
 
c8ebaee
 
 
 
 
 
 
ea03c8c
 
c8ebaee
 
 
 
 
83eb290
 
c8ebaee
 
 
ea03c8c
37bfd28
 
 
ea03c8c
83eb290
 
ea03c8c
 
 
 
 
 
 
 
c8ebaee
 
 
 
 
 
 
 
 
 
 
 
 
ea03c8c
 
 
c8ebaee
 
 
 
 
 
 
b7aa1f0
c8ebaee
 
 
ea03c8c
 
 
 
 
 
 
 
 
 
c8ebaee
ea03c8c
 
 
 
 
 
c8ebaee
 
 
 
 
 
 
 
ea03c8c
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""Deterministic grading adapters that delegate to OpenEnv Rubric subclasses.

The real scoring lives in :mod:`evaluation.rubrics`. This module keeps the
legacy call sites (``score_case`` / ``grade_episode`` / ``grade_representment_note``)
stable so the environment, tests, and audit tooling do not need to change.
"""

from __future__ import annotations

try:
    from ..core.models import CaseScoreBreakdown, GraderReport
    from ..scenarios.simulation import CaseProgress, InternalCase, TaskScenario
    from .rubrics import (
        CaseRubric,
        ChargebackOpsEpisodeRubric,
        EpisodeGradingContext,
        GradingContext,
        grade_representment_note,
    )
except ImportError:  # pragma: no cover
    from core.models import CaseScoreBreakdown, GraderReport
    from scenarios.simulation import CaseProgress, InternalCase, TaskScenario
    from evaluation.rubrics import (
        CaseRubric,
        ChargebackOpsEpisodeRubric,
        EpisodeGradingContext,
        GradingContext,
        grade_representment_note,
    )


# Public names exported by this module. ``grade_representment_note`` is not
# defined here; it is imported from the rubrics module above and re-exported.
__all__ = [
    "grade_representment_note",
    "score_case",
    "grade_episode",
]


# Rubric instances created once at import time. ``score_case`` calls
# ``_CASE_RUBRIC`` and ``grade_episode`` calls ``_EPISODE_RUBRIC``.
_CASE_RUBRIC = CaseRubric()
_EPISODE_RUBRIC = ChargebackOpsEpisodeRubric()


def _build_case_notes(
    case: InternalCase, progress: CaseProgress, step_count: int
) -> str:
    """Assemble the human-readable note string for one graded case.

    The note always starts with ``case.resolution_summary`` and is followed,
    space-separated, by up to two remarks:

    * a harmful-evidence remark when any id in
      ``progress.attached_evidence_ids`` also appears in
      ``case.harmful_evidence_ids``;
    * either a "never resolved" remark (``progress.final_resolution`` is
      falsy or equal to ``"unresolved"``) or, for resolved cases only, a
      "after the deadline" remark when ``step_count`` exceeds
      ``case.deadline_step``.
    """
    resolution = progress.final_resolution or "unresolved"
    is_unresolved = resolution == "unresolved"
    used_harmful_evidence = not set(progress.attached_evidence_ids).isdisjoint(
        case.harmful_evidence_ids
    )

    remarks = [case.resolution_summary]
    if used_harmful_evidence:
        remarks.append("Harmful evidence weakened the case.")
    if is_unresolved:
        remarks.append("Case was never resolved.")
    elif step_count > case.deadline_step:
        # Lateness is only reported for cases that were actually resolved.
        remarks.append("Resolution happened after the deadline.")
    return " ".join(remarks)


def score_case(
    case: InternalCase,
    progress: CaseProgress,
    step_count: int,
) -> CaseScoreBreakdown:
    """Score a single case through the shared case rubric.

    A ``GradingContext`` is built from the arguments and passed to the
    module-level ``_CASE_RUBRIC``. The value the rubric returns is multiplied
    by ``case.weight`` to give ``weighted_score``; the per-dimension values
    come from ``_CASE_RUBRIC.dimension_scores()``. Every numeric field in the
    returned breakdown is rounded to 4 decimal places.

    Returns:
        A ``CaseScoreBreakdown`` carrying the rounded dimension scores, the
        weighted score, the final resolution (``"unresolved"`` when
        ``progress.final_resolution`` is falsy) and the generated notes.
    """
    grading_ctx = GradingContext(
        case=case, progress=progress, step_count=step_count
    )
    # The rubric is invoked before dimension_scores() is read; presumably the
    # latter reflects the most recent call -- confirm against the rubric class.
    rubric_score = _CASE_RUBRIC(grading_ctx, None)
    raw_dimensions = _CASE_RUBRIC.dimension_scores()

    dimension_names = (
        "strategy_correctness",
        "evidence_quality",
        "packet_validity",
        "deadline_compliance",
        "efficiency",
        "outcome_quality",
        "note_quality",
        "escalation_roi",
    )
    rounded_dimensions = {
        name: round(raw_dimensions[name], 4) for name in dimension_names
    }

    return CaseScoreBreakdown(
        case_id=case.case_id,
        **rounded_dimensions,
        weighted_score=round(rubric_score * case.weight, 4),
        final_resolution=progress.final_resolution or "unresolved",
        notes=_build_case_notes(case, progress, step_count),
    )


def grade_episode(
    task: TaskScenario,
    progress_by_case: dict[str, CaseProgress],
    step_count: int,
    episode_id: str,
    completed: bool,
) -> GraderReport:
    """Grade every case in ``task`` and wrap the results in a report.

    Each case in ``task.cases`` is scored with ``score_case`` using the
    progress entry stored under its ``case_id``; a missing entry raises
    ``KeyError``. ``total_score`` is the sum of the per-case weighted scores,
    while ``normalized_score`` is whatever the module-level
    ``_EPISODE_RUBRIC`` returns for the episode context, converted to
    ``float``. Both are rounded to 4 decimal places in the report.

    Returns:
        A ``GraderReport`` with the per-case breakdowns, both scores, the
        ``completed`` flag passed in, and a one-line textual summary.
    """
    case_reports = []
    for case in task.cases:
        case_progress = progress_by_case[case.case_id]
        case_reports.append(score_case(case, case_progress, step_count))

    total_score = sum(entry.weighted_score for entry in case_reports)

    episode_ctx = EpisodeGradingContext(
        task=task,
        progress_by_case=progress_by_case,
        step_count=step_count,
    )
    normalized = float(_EPISODE_RUBRIC(episode_ctx, None))

    resolved_count = sum(
        1 for entry in case_reports if entry.final_resolution != "unresolved"
    )
    case_count = len(case_reports)
    summary = (
        f"Resolved {resolved_count}/"
        f"{case_count} cases with normalized score {normalized:.3f}."
    )

    return GraderReport(
        episode_id=episode_id,
        task_id=task.task_id,
        total_score=round(total_score, 4),
        normalized_score=round(normalized, 4),
        completed=completed,
        case_reports=case_reports,
        summary=summary,
    )