code-review-environment / scripts /pre_validation.sh
ashishbaberwal's picture
Final Changes
344c3c9
#!/usr/bin/env bash
#
# pre_validation.sh - sanity checks to run against a repository before
# submitting it for validation.
#
# Usage: pre_validation.sh [REPO_DIR]
#   REPO_DIR  directory to validate (default: current directory). Every later
#             step uses paths relative to it (openenv.yaml, inference.py).
#
# Exit status: 0 when every check passes, non-zero on the first failure.
set -euo pipefail

REPO_DIR="${1:-.}"
# '--' stops option parsing so a directory name that starts with '-' is
# treated as a path; the explicit handler names the offending argument.
cd -- "$REPO_DIR" || {
  printf 'error: cannot enter repository directory: %s\n' "$REPO_DIR" >&2
  exit 1
}
# ---------------------------------------------------------------------------
# Check 1/3: openenv.yaml must list at least three tasks that have a grader.
# Prints "tasks_total=N tasks_with_grader=M"; exits 1 with a reason otherwise.
# ---------------------------------------------------------------------------
echo "==> 1/3 openenv task grader check"
# The quoted delimiter ('PY') keeps the shell from expanding anything inside
# the embedded Python source.
python3 - <<'PY'
from pathlib import Path

import yaml

MIN_GRADED_TASKS = 3

manifest = Path('openenv.yaml')
if not manifest.is_file():
    print(f"openenv.yaml not found in {Path.cwd()}")
    raise SystemExit(1)

doc = yaml.safe_load(manifest.read_text())
# An empty file parses to None, and a bare `tasks:` key is None as well;
# treat anything that is not a list as "no tasks" instead of crashing.
tasks = doc.get('tasks') if isinstance(doc, dict) else None
if not isinstance(tasks, list):
    tasks = []

with_grader = [t.get('id') for t in tasks if isinstance(t, dict) and t.get('grader')]
print(f"tasks_total={len(tasks)} tasks_with_grader={len(with_grader)}")
if len(with_grader) < MIN_GRADED_TASKS:
    print(f"expected >={MIN_GRADED_TASKS} tasks with a grader, got {len(with_grader)}")
    raise SystemExit(1)
PY
# ---------------------------------------------------------------------------
# Check 2/3: run inference.py once and verify its structured stdout log.
# Check 3/3: verify the scores in the JSON file inference.py was asked to write.
# Both checks consume files produced by the single inference.py run below.
# ---------------------------------------------------------------------------
echo "==> 2/3 inference structured logs check"

# Private scratch directory: fixed names under /tmp are predictable (symlink
# clobbering) and collide when two validations run at once. The EXIT trap
# removes it on every exit path, success or failure.
TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/pre_validation.XXXXXX")" || {
  echo "error: could not create a temporary directory" >&2
  exit 1
}
trap 'rm -rf -- "${TMP_DIR:?}"' EXIT
TMP_OUT="$TMP_DIR/results.json"
TMP_LOG="$TMP_DIR/stdout.log"
TMP_ERR="$TMP_DIR/stderr.log"

# Capture the status by hand so the captured stderr can be shown before
# exiting; under plain `set -e` the script would stop here with no diagnostics.
inference_rc=0
python3 inference.py --max-steps 1 --output "$TMP_OUT" >"$TMP_LOG" 2>"$TMP_ERR" ||
  inference_rc=$?
if ((inference_rc != 0)); then
  echo "error: inference.py exited with status ${inference_rc}; stderr follows:" >&2
  cat -- "$TMP_ERR" >&2
  exit "$inference_rc"
fi

# The log path is passed as argv[1] because the quoted heredoc delimiter
# prevents shell expansion inside the Python source.
python3 - "$TMP_LOG" <<'PY'
import re
import sys
from pathlib import Path

import yaml

MIN_TASKS = 3
TASK_RE = re.compile(r'\btask=([^\s]+)')

log = Path(sys.argv[1]).read_text().splitlines()
doc = yaml.safe_load(Path('openenv.yaml').read_text())
tasks = doc.get('tasks') if isinstance(doc, dict) else None
if not isinstance(tasks, list):
    tasks = []
valid_ids = {t['id'] for t in tasks if isinstance(t, dict) and 'id' in t}

# Collect the task= value of every [START] and [END] line, in log order.
start_tasks = []
end_tasks = []
for line in log:
    if line.startswith('[START] '):
        bucket = start_tasks
    elif line.startswith('[END] '):
        bucket = end_tasks
    else:
        continue
    m = TASK_RE.search(line)
    if m:
        bucket.append(m.group(1))

if len(start_tasks) < MIN_TASKS or len(end_tasks) < MIN_TASKS:
    print(f"expected >=3 START/END lines, got start={len(start_tasks)} end={len(end_tasks)}")
    raise SystemExit(1)
# Every started task must be ended, in the same order.
if start_tasks != end_tasks:
    print(f"START/END task mismatch: start={start_tasks} end={end_tasks}")
    raise SystemExit(1)
# Report only the offending ids rather than the whole task list.
unknown = [t for t in start_tasks if t not in valid_ids]
if unknown:
    print(f"unknown task id in logs: {unknown}")
    raise SystemExit(1)
print(f"structured_log_tasks={start_tasks}")
PY

echo "==> 3/3 score range check"
# inference.py was given --output "$TMP_OUT"; fail with a clear message if
# nothing was written there instead of letting Python raise a traceback.
if [[ ! -s "$TMP_OUT" ]]; then
  echo "error: inference.py produced no results file at its --output path" >&2
  exit 1
fi

python3 - "$TMP_OUT" <<'PY'
import json
import sys
from pathlib import Path

MIN_RESULTS = 3
SCORE_MIN, SCORE_MAX = 0.01, 0.99

payload = json.loads(Path(sys.argv[1]).read_text())
results = payload.get('results') if isinstance(payload, dict) else None
if not isinstance(results, list) or len(results) < MIN_RESULTS:
    found = len(results) if isinstance(results, list) else 0
    print(f"expected a 'results' list with >={MIN_RESULTS} entries, got {found}")
    raise SystemExit(1)

for row in results:
    if not isinstance(row, dict):
        print(f"malformed_result entry={row!r}")
        raise SystemExit(1)
    try:
        # A missing task_score counts as 0.0, which is out of range below.
        score = float(row.get('task_score', 0.0))
    except (TypeError, ValueError):
        print(f"non_numeric_score task={row.get('task_id')} score={row.get('task_score')!r}")
        raise SystemExit(1)
    # NaN fails both comparisons, so it is reported as out of range too.
    if not (SCORE_MIN <= score <= SCORE_MAX):
        print(f"out_of_range task={row.get('task_id')} score={score}")
        raise SystemExit(1)
print('all_scores_in_[0.01,0.99]=true')
PY

echo "PASS: pre_validation.sh (3/3)"