File size: 2,575 Bytes
b522b5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from __future__ import annotations

import json
import re
import sys
from pathlib import Path
from typing import Any

# Lower and upper bounds that clamp_score() applies to every reported score.
MIN_SCORE = 0.01
MAX_SCORE = 0.99

# Captures the first numeric value following ``score=`` after an ``[END]``
# marker on the same line ('.' does not cross newlines). The optional exponent
# group keeps values printed in scientific notation, such as ``1e-05``, from
# being truncated to their mantissa (``1``).
END_SCORE_RE = re.compile(
    r"\[END\].*?\bscore=([0-9]+(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?)"
)

# Captures the whitespace-delimited token following ``task=`` after a
# ``[START]`` marker.
START_TASK_RE = re.compile(r"\[START\]\s+task=([^\s]+)")


def clamp_score(score: float) -> float:
    """Limit *score* to the range MIN_SCORE..MAX_SCORE, rounded to 4 decimals."""
    # Raise to the lower bound first, then cap at the upper bound.
    floored = max(MIN_SCORE, score)
    bounded = min(MAX_SCORE, floored)
    return round(bounded, 4)


def read_payload_text() -> str:
    """Return the payload text to grade.

    When a first command-line argument is present and names an existing
    regular file, that file's contents are returned. In every other case
    (no argument, missing path, or a path that is not a regular file) the
    whole of standard input is read and returned instead.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    locale encoding, and undecodable bytes are replaced with U+FFFD rather
    than raising, so the same file reads the same way on every machine.
    """
    if len(sys.argv) > 1:
        candidate = Path(sys.argv[1])
        # is_file() rather than exists(): a directory "exists" but cannot be
        # read as text, so it should take the stdin fallback instead.
        if candidate.is_file():
            return candidate.read_text(encoding="utf-8", errors="replace")
    return sys.stdin.read()


def _numeric_or_none(value: Any) -> float | None:
    """Return *value* as a float when it is an int or float, else ``None``."""
    # bool is a subclass of int, so it must be rejected explicitly: a JSON
    # true/false is a flag, not a score of 1.0/0.0.
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return float(value)
    return None


def _lookup_score(value: Any) -> float | None:
    """Search a decoded JSON value for a numeric score.

    The search order is:

    1. *value* itself, when it is a number.
    2. For a dict, the first of the keys ``score``, ``benchmark_score``,
       ``final_score``, ``task_score`` that holds a number.
    3. For a dict, a recursive search inside the first of the keys
       ``success_metrics``, ``observation``, ``final_observation``,
       ``result``, ``metrics`` that yields a score.
    4. For a list, a recursive search of each item in order.

    Booleans are never treated as numbers.

    Returns:
        The score as a float, or ``None`` when nothing numeric was found.
    """
    direct = _numeric_or_none(value)
    if direct is not None:
        return direct

    if isinstance(value, dict):
        for key in (
            "score",
            "benchmark_score",
            "final_score",
            "task_score",
        ):
            direct = _numeric_or_none(value.get(key))
            if direct is not None:
                return direct

        for key in (
            "success_metrics",
            "observation",
            "final_observation",
            "result",
            "metrics",
        ):
            candidate = value.get(key)
            if candidate is not None:
                nested = _lookup_score(candidate)
                if nested is not None:
                    return nested

    if isinstance(value, list):
        for item in value:
            nested = _lookup_score(item)
            if nested is not None:
                return nested

    return None


def extract_score(text: str) -> float:
    """Derive a clamped score from raw payload text.

    An ``[END] ... score=<number>`` marker matched by END_SCORE_RE takes
    priority. Without one, the stripped text is parsed as JSON and searched
    with _lookup_score(). Blank text, invalid JSON, and JSON without a score
    all yield MIN_SCORE.
    """
    body = text.strip()
    if body == "":
        return MIN_SCORE

    end_marker = END_SCORE_RE.search(body)
    if end_marker is not None:
        return clamp_score(float(end_marker.group(1)))

    # No log marker: fall back to treating the whole payload as JSON.
    try:
        decoded = json.loads(body)
    except json.JSONDecodeError:
        return MIN_SCORE

    found = _lookup_score(decoded)
    return MIN_SCORE if found is None else clamp_score(found)


def extract_started_task(text: str) -> str | None:
    """Return the task name from the first ``[START] task=<name>`` marker.

    Returns ``None`` when START_TASK_RE finds no such marker in *text*.
    """
    start_marker = START_TASK_RE.search(text)
    return None if start_marker is None else start_marker.group(1)


def emit_grade(expected_task: str) -> int:
    """Print the grade for *expected_task* as one compact JSON line.

    The payload comes from read_payload_text(). Its score is taken from
    extract_score(), but is replaced by MIN_SCORE when the payload names a
    started task that differs from *expected_task*. A payload that names no
    task keeps its extracted score.

    Returns:
        Always ``0``.
    """
    payload_text = read_payload_text()
    started = extract_started_task(payload_text)
    extracted = extract_score(payload_text)

    # A payload produced for a different task must not earn credit here.
    wrong_task = started is not None and started != expected_task
    grade = {
        "task_id": expected_task,
        "score": MIN_SCORE if wrong_task else extracted,
    }
    print(json.dumps(grade, separators=(",", ":")))
    return 0