Spaces:
Runtime error
Runtime error
| """ | |
| PRobe submission inference entrypoint. | |
| Reads ``API_BASE_URL``, ``MODEL_NAME``, and ``HF_TOKEN`` from the environment, | |
| calls the configured OpenAI-compatible API via the official ``openai`` client, | |
| runs the PRobe ``ProbeEnvironment`` for each requested task, and prints | |
| structured lines for automated evaluation: | |
| [START] {...} | |
| [STEP] {...} | |
| [END] {...} | |
| Each JSON object uses **sorted keys** and **compact separators** (stable ordering). | |
| Smoke test (no network, no API keys):: | |
| python inference.py --smoke | |
| Full run (requires env vars):: | |
| export API_BASE_URL=https://api.openai.com/v1 | |
| export MODEL_NAME=gpt-4o-mini | |
| export HF_TOKEN=sk-... | |
| python inference.py --tasks 0 1 2 --episodes-per-task 1 | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| import time | |
| import uuid | |
| from pathlib import Path | |
| from typing import Any | |
| from urllib.parse import urlparse | |
| # Repository root on sys.path (this file lives at project root) | |
| _ROOT = Path(__file__).resolve().parent | |
| if str(_ROOT) not in sys.path: | |
| sys.path.insert(0, str(_ROOT)) | |
| from training.baseline import TASKS, run_episode # noqa: E402 | |
| from environment.probe_environment import ProbeEnvironment # noqa: E402 | |
| _SCHEMA = "probe-inference-1" | |
| _DEFAULT_WALL_S = 1140 # stay under 20 min with margin | |
| def _log(tag: str, payload: dict[str, Any]) -> None: | |
| """Emit one evaluation line with sorted keys.""" | |
| body = json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str) | |
| print(f"[{tag}] {body}", flush=True) | |
| def _host_only(api_base_url: str) -> str: | |
| p = urlparse(api_base_url.strip()) | |
| return p.netloc or p.path.split("/")[0] or "invalid-url" | |
| def _reward_01(raw: float) -> float: | |
| r = float(raw) | |
| r = min(1.0, max(-1.0, r)) | |
| return round((r + 1.0) / 2.0, 6) | |
| def main() -> int: | |
| parser = argparse.ArgumentParser(description="PRobe inference.py (submission entrypoint)") | |
| parser.add_argument( | |
| "--tasks", | |
| type=int, | |
| nargs="+", | |
| default=[0, 1, 2], | |
| help="Task ids to run (default: first three graders)", | |
| ) | |
| parser.add_argument( | |
| "--episodes-per-task", | |
| type=int, | |
| default=1, | |
| help="Episodes per task (default 1 for fast validation)", | |
| ) | |
| parser.add_argument( | |
| "--max-wall-seconds", | |
| type=int, | |
| default=_DEFAULT_WALL_S, | |
| help="Stop early after this many wall seconds (default 1140)", | |
| ) | |
| parser.add_argument( | |
| "--smoke", | |
| action="store_true", | |
| help="Run without LLM or API keys; one episode on task 0 (deterministic submit path)", | |
| ) | |
| args = parser.parse_args() | |
| run_id = str(uuid.uuid4()) | |
| t0 = time.monotonic() | |
| if args.smoke: | |
| task_ids = [0] | |
| episodes_per_task = 1 | |
| client = None | |
| model_name = None | |
| api_base_host = "smoke" | |
| _log( | |
| "START", | |
| { | |
| "api_base_host": api_base_host, | |
| "model_name": "none", | |
| "run_id": run_id, | |
| "schema_version": _SCHEMA, | |
| "smoke": True, | |
| "task_ids": task_ids, | |
| }, | |
| ) | |
| else: | |
| api_base = os.environ.get("API_BASE_URL", "").strip() | |
| model_name = os.environ.get("MODEL_NAME", "").strip() | |
| token = os.environ.get("HF_TOKEN", "").strip() | |
| missing = [k for k, v in [("API_BASE_URL", api_base), ("MODEL_NAME", model_name), ("HF_TOKEN", token)] if not v] | |
| if missing: | |
| _log( | |
| "END", | |
| { | |
| "episodes_completed": 0, | |
| "mean_cumulative_reward": None, | |
| "mean_cumulative_reward_01": None, | |
| "message": f"Missing required environment variables: {', '.join(missing)}", | |
| "status": "error_config", | |
| }, | |
| ) | |
| return 1 | |
| try: | |
| from openai import OpenAI | |
| except ImportError: | |
| _log( | |
| "END", | |
| { | |
| "episodes_completed": 0, | |
| "mean_cumulative_reward": None, | |
| "mean_cumulative_reward_01": None, | |
| "message": "openai package not installed", | |
| "status": "error_import", | |
| }, | |
| ) | |
| return 1 | |
| base_url = api_base.rstrip("/") | |
| if not base_url.endswith("v1"): | |
| base_url = f"{base_url}/v1" | |
| client = OpenAI(base_url=base_url, api_key=token) | |
| api_base_host = _host_only(api_base) | |
| task_ids = args.tasks | |
| episodes_per_task = args.episodes_per_task | |
| _log( | |
| "START", | |
| { | |
| "api_base_host": api_base_host, | |
| "model_name": model_name, | |
| "run_id": run_id, | |
| "schema_version": _SCHEMA, | |
| "smoke": False, | |
| "task_ids": task_ids, | |
| }, | |
| ) | |
| env = ProbeEnvironment() | |
| results: list[dict[str, Any]] = [] | |
| def on_step(row: dict[str, Any]) -> None: | |
| _log("STEP", row) | |
| for task_id in task_ids: | |
| if task_id < 0 or task_id >= len(TASKS): | |
| _log( | |
| "END", | |
| { | |
| "episodes_completed": len(results), | |
| "mean_cumulative_reward": None, | |
| "mean_cumulative_reward_01": None, | |
| "message": f"Invalid task_id {task_id}", | |
| "status": "error_task", | |
| }, | |
| ) | |
| return 1 | |
| for ep in range(episodes_per_task): | |
| if time.monotonic() - t0 > args.max_wall_seconds: | |
| rewards = [float(r["cumulative_reward"]) for r in results] | |
| mean_r = sum(rewards) / len(rewards) if rewards else 0.0 | |
| _log( | |
| "END", | |
| { | |
| "episodes_completed": len(results), | |
| "mean_cumulative_reward": round(mean_r, 6), | |
| "mean_cumulative_reward_01": round(_reward_01(mean_r), 6), | |
| "message": "max_wall_seconds exceeded", | |
| "status": "timeout_partial", | |
| }, | |
| ) | |
| return 0 | |
| env._reset_count = task_id # align with training/baseline.py episode selection | |
| result = run_episode( | |
| env, | |
| client, | |
| task_id, | |
| model_name=model_name, | |
| on_step=on_step, | |
| ) | |
| results.append(result) | |
| rewards = [float(r["cumulative_reward"]) for r in results] | |
| mean_r = sum(rewards) / len(rewards) if rewards else 0.0 | |
| _log( | |
| "END", | |
| { | |
| "episodes_completed": len(results), | |
| "mean_cumulative_reward": round(mean_r, 6), | |
| "mean_cumulative_reward_01": round(_reward_01(mean_r), 6), | |
| "status": "ok_smoke" if args.smoke else "ok", | |
| }, | |
| ) | |
| return 0 | |
| if __name__ == "__main__": | |
| raise SystemExit(main()) | |