File size: 2,378 Bytes
344c3c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env bash
# pre_validation.sh - runs three sanity checks against a repository:
#   1/3 openenv.yaml declares enough tasks with a grader,
#   2/3 inference.py emits structured [START]/[END] log lines,
#   3/3 every reported task score lies in the accepted range.
#
# Usage: pre_validation.sh [REPO_DIR]
#   REPO_DIR  directory to validate; defaults to the current directory.
#
# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

REPO_DIR="${1:-.}"
# `--` ends option parsing so a directory whose name starts with "-" is
# treated as a path rather than as a flag to cd.
cd -- "$REPO_DIR"

# Step 1/3: openenv.yaml (in the current directory) must list at least three
# task mappings that have a truthy `grader` entry.
echo "==> 1/3 openenv task grader check"
python3 - <<'PY'
import sys
from pathlib import Path

import yaml

doc = yaml.safe_load(Path('openenv.yaml').read_text())
# An empty file parses to None, and a bare `tasks:` key parses to None as well.
# Treat anything that is not a list of tasks as "no tasks" so the problem is
# reported by the check below instead of surfacing as a traceback.
tasks = doc.get('tasks') if isinstance(doc, dict) else None
if not isinstance(tasks, list):
    tasks = []

with_grader = [t.get('id') for t in tasks if isinstance(t, dict) and t.get('grader')]
print(f"tasks_total={len(tasks)} tasks_with_grader={len(with_grader)}")
if len(with_grader) < 3:
    # Say why the step failed; a bare exit code gives the caller nothing to act on.
    print("expected >=3 tasks with a grader in openenv.yaml", file=sys.stderr)
    raise SystemExit(1)
PY

# Step 2/3: run inference.py for a single step and verify that its stdout
# contains "[START] ... task=<id>" and "[END] ... task=<id>" lines for at least
# three tasks, in the same order, all of them declared in openenv.yaml.
echo "==> 2/3 inference structured logs check"
TMP_OUT="/tmp/pre_validation_results.json"
TMP_LOG="/tmp/pre_validation_stdout.log"
TMP_ERR="/tmp/pre_validation_stderr.log"

# Remove the results file left by an earlier run: if inference.py writes
# nothing this time, later checks must not pass against stale data.
# NOTE(review): assumes inference.py creates --output from scratch - confirm.
rm -f "$TMP_OUT"

# Capture the exit status ourselves so that, instead of `set -e` aborting
# silently, the captured stderr is shown before exiting with the same status.
INFER_STATUS=0
python3 inference.py --max-steps 1 --output "$TMP_OUT" > "$TMP_LOG" 2> "$TMP_ERR" || INFER_STATUS=$?
if [ "$INFER_STATUS" -ne 0 ]; then
    echo "inference.py exited with status $INFER_STATUS; last lines of $TMP_ERR:" >&2
    tail -n 20 "$TMP_ERR" >&2 || true
    exit "$INFER_STATUS"
fi

# The log path is passed as an argument (sys.argv[1]) so it is defined in
# exactly one place; the heredoc is quoted and cannot expand shell variables.
python3 - "$TMP_LOG" <<'PY'
import re
import sys
from pathlib import Path

import yaml

log = Path(sys.argv[1]).read_text().splitlines()
doc = yaml.safe_load(Path('openenv.yaml').read_text())
# Tolerate an empty document or a null `tasks` value.
tasks = (doc.get('tasks') if isinstance(doc, dict) else None) or []
# Ids parsed from the log are always strings, so normalise YAML ids
# (which may load as int, e.g. `id: 1`) to str before comparing.
valid_ids = {str(t['id']) for t in tasks if isinstance(t, dict) and 'id' in t}

task_re = re.compile(r'\btask=([^\s]+)')
start_tasks = []
end_tasks = []
for line in log:
    if line.startswith('[START] '):
        bucket = start_tasks
    elif line.startswith('[END] '):
        bucket = end_tasks
    else:
        continue
    # Marker lines without a task=<id> token are ignored.
    m = task_re.search(line)
    if m:
        bucket.append(m.group(1))

if len(start_tasks) < 3 or len(end_tasks) < 3:
    print(f"expected >=3 START/END lines, got start={len(start_tasks)} end={len(end_tasks)}")
    raise SystemExit(1)

# Every started task must end, and in the same order.
if start_tasks != end_tasks:
    print(f"START/END task mismatch: start={start_tasks} end={end_tasks}")
    raise SystemExit(1)

if any(t not in valid_ids for t in start_tasks):
    print(f"unknown task id in logs: {start_tasks}")
    raise SystemExit(1)

print(f"structured_log_tasks={start_tasks}")
PY

# Step 3/3: the results file written by inference.py must hold a `results`
# list with at least three rows, each with a `task_score` in [0.01, 0.99].
echo "==> 3/3 score range check"
# Reuse the path set for the inference run rather than repeating the literal;
# it reaches the quoted heredoc as sys.argv[1].
python3 - "$TMP_OUT" <<'PY'
import json
import sys
from pathlib import Path

payload = json.loads(Path(sys.argv[1]).read_text())
results = payload.get('results') if isinstance(payload, dict) else None
if not isinstance(results, list) or len(results) < 3:
    found = len(results) if isinstance(results, list) else 'no list'
    print(f"expected >=3 entries under 'results', got {found}")
    raise SystemExit(1)

for row in results:
    if not isinstance(row, dict):
        print(f"malformed result row: {row!r}")
        raise SystemExit(1)
    # A missing score defaults to 0.0, which is out of range and therefore fails.
    raw_score = row.get('task_score', 0.0)
    try:
        score = float(raw_score)
    except (TypeError, ValueError):
        # e.g. null or a non-numeric string: report it instead of a traceback.
        print(f"non_numeric_score task={row.get('task_id')} score={raw_score!r}")
        raise SystemExit(1) from None
    # NaN compares false on both sides, so it is rejected here as well.
    if not (0.01 <= score <= 0.99):
        print(f"out_of_range task={row.get('task_id')} score={score}")
        raise SystemExit(1)

print('all_scores_in_[0.01,0.99]=true')
PY

echo "PASS: pre_validation.sh (3/3)"