#!/usr/bin/env bash
#
# pre_validation.sh - smoke-check a repository before submission.
#
# Usage: pre_validation.sh [REPO_DIR]
#   REPO_DIR  directory containing openenv.yaml and inference.py (default: .)
#
# Checks, in order:
#   1/3  openenv.yaml lists at least three tasks that have a grader.
#   2/3  `inference.py --max-steps 1` prints at least three [START] / [END]
#        lines whose task= ids agree with each other and are all declared
#        in openenv.yaml.
#   3/3  every row of the results JSON written by inference.py has a
#        task_score within [0.01, 0.99].
#
# Requires python3 with the `yaml` module. Exits non-zero on the first
# failing check; on failure the scratch directory holding the captured
# stdout/stderr/results is kept and its path is printed to stderr.

set -euo pipefail

REPO_DIR="${1:-.}"
cd -- "$REPO_DIR"

# Private scratch directory instead of fixed names under /tmp: avoids
# collisions between concurrent runs and clobbering through planted symlinks.
WORK_DIR="$(mktemp -d "${TMPDIR:-/tmp}/pre_validation.XXXXXX")"
readonly WORK_DIR
readonly TMP_OUT="${WORK_DIR}/results.json"
readonly TMP_LOG="${WORK_DIR}/stdout.log"
readonly TMP_ERR="${WORK_DIR}/stderr.log"

# Remove the scratch directory on success; keep it for inspection on failure.
# The script's exit status is left untouched by this handler.
cleanup() {
  local status=$?
  if ((status == 0)); then
    rm -rf -- "${WORK_DIR:?}"
  else
    printf 'FAIL: pre_validation.sh (artifacts kept in %s)\n' "$WORK_DIR" >&2
  fi
}
trap cleanup EXIT

echo "==> 1/3 openenv task grader check"
python3 - <<'PY'
from pathlib import Path

import yaml

doc = yaml.safe_load(Path('openenv.yaml').read_text())
# An empty file or a bare `tasks:` key parses to None; treat both as "no tasks".
tasks = doc.get('tasks') if isinstance(doc, dict) else None
if not isinstance(tasks, list):
    tasks = []
with_grader = [t.get('id') for t in tasks if isinstance(t, dict) and t.get('grader')]
print(f"tasks_total={len(tasks)} tasks_with_grader={len(with_grader)}")
if len(with_grader) < 3:
    raise SystemExit(1)
PY

echo "==> 2/3 inference structured logs check"
# Capture the status explicitly so a failing run is reported with its stderr
# instead of aborting silently under `set -e`.
inference_status=0
python3 inference.py --max-steps 1 --output "$TMP_OUT" \
  >"$TMP_LOG" 2>"$TMP_ERR" || inference_status=$?
if ((inference_status != 0)); then
  printf 'inference.py exited with status %d; last lines of stderr:\n' \
    "$inference_status" >&2
  tail -n 20 "$TMP_ERR" >&2 || true
  exit "$inference_status"
fi

# The log path is passed as argv[1] so the Python snippet does not need to
# know where the shell put its scratch files.
python3 - "$TMP_LOG" <<'PY'
import re
import sys
from pathlib import Path

import yaml

log_lines = Path(sys.argv[1]).read_text().splitlines()

doc = yaml.safe_load(Path('openenv.yaml').read_text())
tasks = doc.get('tasks') if isinstance(doc, dict) else None
if not isinstance(tasks, list):
    tasks = []
valid_ids = {t['id'] for t in tasks if isinstance(t, dict) and 'id' in t}

# Collect the task= value of every [START] and [END] line, in log order.
task_re = re.compile(r'\btask=([^\s]+)')
start_tasks = []
end_tasks = []
for line in log_lines:
    for prefix, bucket in (('[START] ', start_tasks), ('[END] ', end_tasks)):
        if line.startswith(prefix):
            m = task_re.search(line)
            if m:
                bucket.append(m.group(1))

if len(start_tasks) < 3 or len(end_tasks) < 3:
    print(f"expected >=3 START/END lines, got start={len(start_tasks)} end={len(end_tasks)}")
    raise SystemExit(1)
if start_tasks != end_tasks:
    print(f"START/END task mismatch: start={start_tasks} end={end_tasks}")
    raise SystemExit(1)
if any(t not in valid_ids for t in start_tasks):
    print(f"unknown task id in logs: {start_tasks}")
    raise SystemExit(1)
print(f"structured_log_tasks={start_tasks}")
PY

echo "==> 3/3 score range check"
python3 - "$TMP_OUT" <<'PY'
import json
import sys
from pathlib import Path

payload = json.loads(Path(sys.argv[1]).read_text())
results = payload.get('results') if isinstance(payload, dict) else None
if not isinstance(results, list) or len(results) < 3:
    print("expected a 'results' list with >=3 entries in the inference output")
    raise SystemExit(1)

for row in results:
    if not isinstance(row, dict):
        print(f"malformed_result_row row={row!r}")
        raise SystemExit(1)
    try:
        # A missing task_score counts as 0.0 and therefore fails the range check.
        score = float(row.get('task_score', 0.0))
    except (TypeError, ValueError):
        print(f"non_numeric_score task={row.get('task_id')} score={row.get('task_score')!r}")
        raise SystemExit(1)
    if not (0.01 <= score <= 0.99):
        print(f"out_of_range task={row.get('task_id')} score={score}")
        raise SystemExit(1)
print('all_scores_in_[0.01,0.99]=true')
PY

echo "PASS: pre_validation.sh (3/3)"