Spaces:
Running
Running
File size: 2,575 Bytes
from __future__ import annotations
import json
import re
import sys
from pathlib import Path
from typing import Any
# Bounds that clamp_score() clips scores to. MIN_SCORE is also the value
# extract_score() returns when no score can be found in the payload.
MIN_SCORE = 0.01
MAX_SCORE = 0.99
# Captures the number after "score=" that follows an "[END]" marker. No
# re.DOTALL flag is set, so the marker and the score must be on the same
# line; only unsigned decimals such as "1" or "0.75" are matched.
END_SCORE_RE = re.compile(r"\[END\].*?\bscore=([0-9]+(?:\.[0-9]+)?)")
# Captures the whitespace-free task identifier in "[START] task=<id>".
START_TASK_RE = re.compile(r"\[START\]\s+task=([^\s]+)")
def clamp_score(score: float) -> float:
    """Clip *score* into [MIN_SCORE, MAX_SCORE] and round it to 4 decimals."""
    # Raise to the floor first, then cap at the ceiling.
    floored = max(MIN_SCORE, score)
    capped = min(MAX_SCORE, floored)
    return round(capped, 4)
def read_payload_text() -> str:
    """Return the text to grade, from a file argument or standard input.

    When the first command-line argument names an existing regular file, that
    file is read as UTF-8 and returned. In every other case (no argument, a
    missing path, or a path that is not a regular file, such as a directory)
    the text is read from standard input instead.

    Returns:
        The full contents of the chosen input as a string.
    """
    if len(sys.argv) > 1:
        path = Path(sys.argv[1])
        # is_file() rather than exists(): a directory exists but cannot be
        # read as text, so it must fall through to stdin instead of raising.
        if path.is_file():
            # Explicit encoding so the result does not depend on the locale.
            return path.read_text(encoding="utf-8")
    return sys.stdin.read()
def _as_score(value: Any) -> float | None:
    """Return *value* as a float if it is an int or float, else None."""
    # bool is a subclass of int, so without this check a JSON true/false
    # would be read as the score 1.0/0.0.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    return float(value)


def _lookup_score(value: Any) -> float | None:
    """Search a decoded JSON value for a numeric score, depth first.

    Lookup rules, in order:

    * A bare int or float is the score itself (booleans are not numbers here).
    * In a dict, the first of "score", "benchmark_score", "final_score",
      "task_score" holding a number wins. Otherwise the values under
      "success_metrics", "observation", "final_observation", "result" and
      "metrics" are searched recursively, in that order.
    * In a list, items are searched in order and the first hit wins.

    Args:
        value: Any value produced by decoding JSON.

    Returns:
        The first score found, as a float, or None when there is none.
    """
    direct = _as_score(value)
    if direct is not None:
        return direct

    if isinstance(value, dict):
        # Direct score fields take priority over anything nested.
        for key in ("score", "benchmark_score", "final_score", "task_score"):
            found = _as_score(value.get(key))
            if found is not None:
                return found
        # Only these known container keys are descended into.
        for key in (
            "success_metrics",
            "observation",
            "final_observation",
            "result",
            "metrics",
        ):
            candidate = value.get(key)
            if candidate is not None:
                nested = _lookup_score(candidate)
                if nested is not None:
                    return nested
    elif isinstance(value, list):
        for item in value:
            nested = _lookup_score(item)
            if nested is not None:
                return nested
    return None
def extract_score(text: str) -> float:
    """Derive a clamped score from raw payload text.

    The text is stripped of surrounding whitespace first. If it contains an
    "[END] ... score=<number>" marker, that number is used. Otherwise the text
    is parsed as JSON and searched with _lookup_score(). Empty text, invalid
    JSON and JSON without a score all yield MIN_SCORE.
    """
    body = text.strip()
    if body == "":
        return MIN_SCORE

    # A log-style end marker takes precedence over any JSON interpretation.
    end_marker = END_SCORE_RE.search(body)
    if end_marker is not None:
        return clamp_score(float(end_marker.group(1)))

    try:
        decoded = json.loads(body)
    except json.JSONDecodeError:
        return MIN_SCORE

    found = _lookup_score(decoded)
    return MIN_SCORE if found is None else clamp_score(found)
def extract_started_task(text: str) -> str | None:
    """Return the task id from the first "[START] task=<id>" marker, if any."""
    start_marker = START_TASK_RE.search(text)
    return None if start_marker is None else start_marker.group(1)
def emit_grade(expected_task: str) -> int:
    """Grade the payload for *expected_task* and print the result as JSON.

    The payload text comes from read_payload_text(). If the payload announces
    a started task that differs from *expected_task*, the score is forced to
    MIN_SCORE; a payload with no start marker keeps its extracted score.
    A compact JSON object with "task_id" and "score" is written to stdout.

    Returns:
        Always 0.
    """
    payload_text = read_payload_text()
    started_task = extract_started_task(payload_text)
    graded = extract_score(payload_text)

    wrong_task = started_task is not None and started_task != expected_task
    result = {
        "task_id": expected_task,
        "score": MIN_SCORE if wrong_task else graded,
    }
    print(json.dumps(result, separators=(",", ":")))
    return 0
|