Commit ·
19e4a1d
1
Parent(s): 96c1a25
Final multi-mode OpenEnv fix
Browse files- README.md +33 -12
- inference.py +58 -53
- server/app.py +5 -3
- validate.py +30 -12
README.md
CHANGED
|
@@ -27,6 +27,11 @@ ACRE is an OpenEnv-compliant environment designed to simulate real-world softwar
|
|
| 27 |
|
| 28 |
It enables agents to iteratively improve code through structured actions while receiving dense, step-wise reward feedback.
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
---
|
| 31 |
|
| 32 |
## 💡 Why This Matters
|
|
@@ -74,6 +79,12 @@ Code → Action → Refactor → Reward → Repeat
|
|
| 74 |
|
| 75 |
Each task uses AST-based transformations and deterministic grading.
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
---
|
| 78 |
|
| 79 |
## 🎯 Reward System
|
|
@@ -95,17 +106,17 @@ Rewards are computed at every step:
|
|
| 95 |
## 📊 Example Execution
|
| 96 |
|
| 97 |
```text
|
| 98 |
-
START rename_variables
|
| 99 |
-
STEP 0
|
| 100 |
-
END 1.00
|
| 101 |
|
| 102 |
-
START remove_dead_code
|
| 103 |
-
STEP 1
|
| 104 |
-
END 0.25
|
| 105 |
|
| 106 |
-
START full_refactor
|
| 107 |
-
STEP 3
|
| 108 |
-
END 0.71
|
| 109 |
|
| 110 |
Final Score: 0.65
|
| 111 |
```
|
|
@@ -114,7 +125,8 @@ Final Score: 0.65
|
|
| 114 |
|
| 115 |
## 🏗️ Architecture
|
| 116 |
|
| 117 |
-
- `server.py` → FastAPI entry point
|
|
|
|
| 118 |
- `openenv_interface.py` → OpenEnv wrapper
|
| 119 |
- `acre/env/` → Core environment logic
|
| 120 |
- `acre/tasks/` → Task definitions
|
|
@@ -137,6 +149,11 @@ Uses Pydantic models:
|
|
| 137 |
- `ActionModel`
|
| 138 |
- `RewardModel`
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
---
|
| 141 |
|
| 142 |
## 🌐 HTTP API
|
|
@@ -155,6 +172,8 @@ Uses Pydantic models:
|
|
| 155 |
|
| 156 |
## 🚀 Run Locally
|
| 157 |
|
|
|
|
|
|
|
| 158 |
```bash
|
| 159 |
pip install -r requirements.txt
|
| 160 |
uvicorn server.app:app --host 0.0.0.0 --port 7860
|
|
@@ -169,7 +188,7 @@ docker build -t acre .
|
|
| 169 |
docker run -p 7860:7860 \
|
| 170 |
-e API_BASE_URL=https://api.openai.com/v1 \
|
| 171 |
-e MODEL_NAME=gpt-4o-mini \
|
| 172 |
-
-e
|
| 173 |
-e ENV_URL=http://localhost:7860 \
|
| 174 |
acre
|
| 175 |
```
|
|
@@ -183,7 +202,7 @@ Set environment variables:
|
|
| 183 |
```bash
|
| 184 |
export API_BASE_URL=https://api.openai.com/v1
|
| 185 |
export MODEL_NAME=gpt-4o-mini
|
| 186 |
-
export
|
| 187 |
export ENV_URL=http://localhost:7860
|
| 188 |
```
|
| 189 |
|
|
@@ -237,6 +256,8 @@ openenv validate
|
|
| 237 |
|
| 238 |
## 📊 Baseline Performance
|
| 239 |
|
|
|
|
|
|
|
| 240 |
| Task | Score |
|
| 241 |
|---|---|
|
| 242 |
| `rename_variables` | 1.0000 |
|
|
|
|
| 27 |
|
| 28 |
It enables agents to iteratively improve code through structured actions while receiving dense, step-wise reward feedback.
|
| 29 |
|
| 30 |
+
## Environment Overview and Motivation
|
| 31 |
+
|
| 32 |
+
ACRE models a realistic developer workflow where an agent incrementally improves Python code quality under a fixed action budget.
|
| 33 |
+
The environment is designed for OpenEnv Round 1 requirements: typed APIs, deterministic grading, multi-difficulty tasks, and reproducible inference behavior.
|
| 34 |
+
|
| 35 |
---
|
| 36 |
|
| 37 |
## 💡 Why This Matters
|
|
|
|
| 79 |
|
| 80 |
Each task uses AST-based transformations and deterministic grading.
|
| 81 |
|
| 82 |
+
## Task Descriptions with Expected Difficulty Levels
|
| 83 |
+
|
| 84 |
+
- Easy (`rename_variables`): rename generic names like `x`, `tmp`, `i` into descriptive identifiers.
|
| 85 |
+
- Medium (`remove_dead_code`): remove unreachable branches and unused assignments while preserving behavior.
|
| 86 |
+
- Hard (`full_refactor`): combine renaming, dead-code elimination, loop simplification, condition cleanup, and helper inlining.
|
| 87 |
+
|
| 88 |
---
|
| 89 |
|
| 90 |
## 🎯 Reward System
|
|
|
|
| 106 |
## 📊 Example Execution
|
| 107 |
|
| 108 |
```text
|
| 109 |
+
[START] task=rename_variables
|
| 110 |
+
[STEP] action=0
|
| 111 |
+
[END] task=rename_variables score=1.00
|
| 112 |
|
| 113 |
+
[START] task=remove_dead_code
|
| 114 |
+
[STEP] action=1
|
| 115 |
+
[END] task=remove_dead_code score=0.25
|
| 116 |
|
| 117 |
+
[START] task=full_refactor
|
| 118 |
+
[STEP] action=3
|
| 119 |
+
[END] task=full_refactor score=0.71
|
| 120 |
|
| 121 |
Final Score: 0.65
|
| 122 |
```
|
|
|
|
| 125 |
|
| 126 |
## 🏗️ Architecture
|
| 127 |
|
| 128 |
+
- `server/app.py` → FastAPI entry point used by OpenEnv + Docker
|
| 129 |
+
- `server.py` → legacy local runner / UI helper
|
| 130 |
- `openenv_interface.py` → OpenEnv wrapper
|
| 131 |
- `acre/env/` → Core environment logic
|
| 132 |
- `acre/tasks/` → Task definitions
|
|
|
|
| 149 |
- `ActionModel`
|
| 150 |
- `RewardModel`
|
| 151 |
|
| 152 |
+
## Definitions of Action and Observation Spaces
|
| 153 |
+
|
| 154 |
+
- Observation space: Box(4) with fields `code_length`, `complexity_score`, `runtime_s`, `error_flag`.
|
| 155 |
+
- Action space: Discrete(5) with actions `rename_variable`, `remove_dead_code`, `simplify_loop`, `optimize_condition`, `inline_function`.
|
| 156 |
+
|
| 157 |
---
|
| 158 |
|
| 159 |
## 🌐 HTTP API
|
|
|
|
| 172 |
|
| 173 |
## 🚀 Run Locally
|
| 174 |
|
| 175 |
+
## Setup and Usage Instructions
|
| 176 |
+
|
| 177 |
```bash
|
| 178 |
pip install -r requirements.txt
|
| 179 |
uvicorn server.app:app --host 0.0.0.0 --port 7860
|
|
|
|
| 188 |
docker run -p 7860:7860 \
|
| 189 |
-e API_BASE_URL=https://api.openai.com/v1 \
|
| 190 |
-e MODEL_NAME=gpt-4o-mini \
|
| 191 |
+
-e API_KEY=your_key \
|
| 192 |
-e ENV_URL=http://localhost:7860 \
|
| 193 |
acre
|
| 194 |
```
|
|
|
|
| 202 |
```bash
|
| 203 |
export API_BASE_URL=https://api.openai.com/v1
|
| 204 |
export MODEL_NAME=gpt-4o-mini
|
| 205 |
+
export API_KEY=your_key
|
| 206 |
export ENV_URL=http://localhost:7860
|
| 207 |
```
|
| 208 |
|
|
|
|
| 256 |
|
| 257 |
## 📊 Baseline Performance
|
| 258 |
|
| 259 |
+
## Baseline Performance Scores
|
| 260 |
+
|
| 261 |
| Task | Score |
|
| 262 |
|---|---|
|
| 263 |
| `rename_variables` | 1.0000 |
|
inference.py
CHANGED
|
@@ -2,17 +2,17 @@
|
|
| 2 |
ACRE inference script for OpenEnv submission evaluation.
|
| 3 |
|
| 4 |
Environment variables:
|
| 5 |
-
|
| 6 |
- MODEL_NAME: model identifier (default allowed)
|
| 7 |
-
|
| 8 |
- ENV_URL: running ACRE server base URL (optional; defaults to http://localhost:7860)
|
| 9 |
- LOCAL_IMAGE_NAME: present for evaluator compatibility (optional)
|
| 10 |
-
|
| 11 |
|
| 12 |
STRICT stdout format (do not change):
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
"""
|
| 17 |
from __future__ import annotations
|
| 18 |
|
|
@@ -26,9 +26,9 @@ from typing import Dict, List, Optional, Tuple
|
|
| 26 |
import requests
|
| 27 |
from openai import OpenAI
|
| 28 |
|
| 29 |
-
API_BASE_URL = os.getenv("API_BASE_URL") or "https://api.openai.com/v1"
|
| 30 |
MODEL_NAME = os.getenv("MODEL_NAME") or "gpt-4o-mini"
|
| 31 |
-
|
|
|
|
| 32 |
ENV_URL: str = os.getenv("ENV_URL", "http://localhost:7860")
|
| 33 |
LOCAL_IMAGE_NAME: str | None = os.getenv("LOCAL_IMAGE_NAME")
|
| 34 |
|
|
@@ -164,7 +164,8 @@ def choose_action(client: Optional[OpenAI], state: dict, task_id: str) -> Tuple[
|
|
| 164 |
return 1, "heuristic: remove remaining dead code"
|
| 165 |
return 3, "heuristic: condition optimization as safe default"
|
| 166 |
|
| 167 |
-
|
|
|
|
| 168 |
if (not use_llm) or client is None:
|
| 169 |
return heuristic_action()
|
| 170 |
|
|
@@ -208,6 +209,42 @@ def choose_action(client: Optional[OpenAI], state: dict, task_id: str) -> Tuple[
|
|
| 208 |
return heuristic_action()
|
| 209 |
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
def run_episode(client: Optional[OpenAI], task_id: str, episode_num: int) -> float:
|
| 212 |
reset_env(task_id)
|
| 213 |
state = get_state()
|
|
@@ -260,46 +297,8 @@ def run_all_tasks() -> Dict[str, float]:
|
|
| 260 |
registry = TaskRegistry() if TaskRegistry is not None else None
|
| 261 |
env = OpenEnvRefactorEnv(registry=registry) if OpenEnvRefactorEnv is not None else None
|
| 262 |
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
has_generic = re.search(r"\b(x|tmp|i)\b", code) is not None
|
| 266 |
-
has_if_false = re.search(r"\bif\s+False\b", code) is not None
|
| 267 |
-
has_if_true = re.search(r"\bif\s+True\b", code) is not None
|
| 268 |
-
has_append_loop = ".append(" in code and "for " in code
|
| 269 |
-
has_double_not = "not not" in code
|
| 270 |
-
has_add_call = "add(" in code
|
| 271 |
-
|
| 272 |
-
if task_id == "rename_variables":
|
| 273 |
-
if has_generic:
|
| 274 |
-
return 0
|
| 275 |
-
if has_if_false or "unused" in code:
|
| 276 |
-
return 1
|
| 277 |
-
if has_append_loop:
|
| 278 |
-
return 2
|
| 279 |
-
if has_if_true or has_double_not:
|
| 280 |
-
return 3
|
| 281 |
-
return 4
|
| 282 |
-
|
| 283 |
-
if task_id == "remove_dead_code":
|
| 284 |
-
if has_if_false or "unused" in code:
|
| 285 |
-
return 1
|
| 286 |
-
if has_append_loop:
|
| 287 |
-
return 2
|
| 288 |
-
if has_if_true or has_double_not:
|
| 289 |
-
return 3
|
| 290 |
-
if has_generic:
|
| 291 |
-
return 0
|
| 292 |
-
return 4
|
| 293 |
-
|
| 294 |
-
if has_generic:
|
| 295 |
-
return 0
|
| 296 |
-
if has_append_loop:
|
| 297 |
-
return 2
|
| 298 |
-
if has_if_false or has_if_true or has_double_not:
|
| 299 |
-
return 3
|
| 300 |
-
if has_add_call:
|
| 301 |
-
return 4
|
| 302 |
-
return 1
|
| 303 |
|
| 304 |
task_plan = [
|
| 305 |
"rename_variables",
|
|
@@ -321,11 +320,11 @@ def run_all_tasks() -> Dict[str, float]:
|
|
| 321 |
return _safe_scores()
|
| 322 |
|
| 323 |
for task_id in task_plan:
|
| 324 |
-
print(f"START {task_id}", flush=True)
|
| 325 |
reset_env(task_id)
|
| 326 |
for _ in range(5):
|
| 327 |
state = get_state()
|
| 328 |
-
action =
|
| 329 |
print(f"[STEP] action={int(action)}", flush=True)
|
| 330 |
step_env(action)
|
| 331 |
final_state = get_state()
|
|
@@ -349,8 +348,14 @@ def run_all_tasks() -> Dict[str, float]:
|
|
| 349 |
env.reset(seed=0, task_id=task_id)
|
| 350 |
for _ in range(5):
|
| 351 |
st = env.state()
|
| 352 |
-
|
| 353 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
print(f"[STEP] action={int(action)}", flush=True)
|
| 355 |
env.step(action)
|
| 356 |
st = env.state()
|
|
|
|
| 2 |
ACRE inference script for OpenEnv submission evaluation.
|
| 3 |
|
| 4 |
Environment variables:
|
| 5 |
+
- API_BASE_URL: LLM API endpoint injected by evaluator
|
| 6 |
- MODEL_NAME: model identifier (default allowed)
|
| 7 |
+
- API_KEY: API token for the OpenAI-compatible proxy endpoint
|
| 8 |
- ENV_URL: running ACRE server base URL (optional; defaults to http://localhost:7860)
|
| 9 |
- LOCAL_IMAGE_NAME: present for evaluator compatibility (optional)
|
| 10 |
+
- USE_LLM: set to "0" to disable LLM action selection
|
| 11 |
|
| 12 |
STRICT stdout format (do not change):
|
| 13 |
+
[START] task=<task_id>
|
| 14 |
+
[STEP] action=<action_int>
|
| 15 |
+
[END] task=<task_id> score=<score_float>
|
| 16 |
"""
|
| 17 |
from __future__ import annotations
|
| 18 |
|
|
|
|
| 26 |
import requests
|
| 27 |
from openai import OpenAI
|
| 28 |
|
|
|
|
| 29 |
MODEL_NAME = os.getenv("MODEL_NAME") or "gpt-4o-mini"
|
| 30 |
+
# Phase-2 validator expects API_KEY through provided proxy.
|
| 31 |
+
API_KEY = os.getenv("API_KEY")
|
| 32 |
ENV_URL: str = os.getenv("ENV_URL", "http://localhost:7860")
|
| 33 |
LOCAL_IMAGE_NAME: str | None = os.getenv("LOCAL_IMAGE_NAME")
|
| 34 |
|
|
|
|
| 164 |
return 1, "heuristic: remove remaining dead code"
|
| 165 |
return 3, "heuristic: condition optimization as safe default"
|
| 166 |
|
| 167 |
+
# Enable LLM by default when credentials are present.
|
| 168 |
+
use_llm = bool(API_KEY) and os.getenv("USE_LLM", "1") == "1"
|
| 169 |
if (not use_llm) or client is None:
|
| 170 |
return heuristic_action()
|
| 171 |
|
|
|
|
| 209 |
return heuristic_action()
|
| 210 |
|
| 211 |
|
| 212 |
+
def _build_openai_client() -> Optional[OpenAI]:
    """
    Create the OpenAI-compatible client from the API_BASE_URL and API_KEY env vars.

    Returns None when either variable is unset or empty, or when constructing
    the client raises an exception.
    """
    endpoint = os.getenv("API_BASE_URL")
    token = os.getenv("API_KEY")

    # Both values are needed; a missing or empty one means "no LLM client".
    if not (endpoint and token):
        return None

    try:
        llm_client = OpenAI(base_url=endpoint, api_key=token)
    except Exception:
        return None
    return llm_client
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def _touch_proxy(client: Optional[OpenAI]) -> None:
    """
    Send one minimal chat-completion request through the given client.

    Does nothing when `client` is None. Any exception raised while issuing
    the request is swallowed, so inference stays resilient even if the proxy
    is temporarily unavailable.
    """
    if client is None:
        return

    try:
        send_completion = client.chat.completions.create
        send_completion(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": "Return exactly: ok"}],
            temperature=0.0,
            max_tokens=2,
        )
    except Exception:
        # Best-effort ping only: a failed request must not abort the run.
        return
|
| 246 |
+
|
| 247 |
+
|
| 248 |
def run_episode(client: Optional[OpenAI], task_id: str, episode_num: int) -> float:
|
| 249 |
reset_env(task_id)
|
| 250 |
state = get_state()
|
|
|
|
| 297 |
registry = TaskRegistry() if TaskRegistry is not None else None
|
| 298 |
env = OpenEnvRefactorEnv(registry=registry) if OpenEnvRefactorEnv is not None else None
|
| 299 |
|
| 300 |
+
client = _build_openai_client()
|
| 301 |
+
_touch_proxy(client)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
| 303 |
task_plan = [
|
| 304 |
"rename_variables",
|
|
|
|
| 320 |
return _safe_scores()
|
| 321 |
|
| 322 |
for task_id in task_plan:
|
| 323 |
+
print(f"[START] task={task_id}", flush=True)
|
| 324 |
reset_env(task_id)
|
| 325 |
for _ in range(5):
|
| 326 |
state = get_state()
|
| 327 |
+
action, _reason = choose_action(client, state, task_id)
|
| 328 |
print(f"[STEP] action={int(action)}", flush=True)
|
| 329 |
step_env(action)
|
| 330 |
final_state = get_state()
|
|
|
|
| 348 |
env.reset(seed=0, task_id=task_id)
|
| 349 |
for _ in range(5):
|
| 350 |
st = env.state()
|
| 351 |
+
state_payload = {
|
| 352 |
+
"current_code": str(st.current_code),
|
| 353 |
+
"episode_steps": int(st.episode_steps),
|
| 354 |
+
"max_steps": int(st.max_steps),
|
| 355 |
+
"complexity": float(st.complexity),
|
| 356 |
+
}
|
| 357 |
+
action, _reason = choose_action(client, state_payload, task_id)
|
| 358 |
+
action = int(action)
|
| 359 |
print(f"[STEP] action={int(action)}", flush=True)
|
| 360 |
env.step(action)
|
| 361 |
st = env.state()
|
server/app.py
CHANGED
|
@@ -210,13 +210,15 @@ def _demo_html() -> str:
|
|
| 210 |
return "<html><body><h1>ACRE</h1><p>UI unavailable.</p></body></html>"
|
| 211 |
|
| 212 |
|
| 213 |
-
@app.get("/",
|
| 214 |
-
def root() ->
|
| 215 |
-
|
|
|
|
| 216 |
|
| 217 |
|
| 218 |
@app.get("/health", response_model=CompatibilityHealthResponse)
|
| 219 |
def health_compat() -> CompatibilityHealthResponse:
|
|
|
|
| 220 |
return CompatibilityHealthResponse(status="healthy", service="acre-env")
|
| 221 |
|
| 222 |
|
|
|
|
| 210 |
return "<html><body><h1>ACRE</h1><p>UI unavailable.</p></body></html>"
|
| 211 |
|
| 212 |
|
| 213 |
+
@app.get("/", response_model=HealthResponse)
def root() -> HealthResponse:
    """Primary OpenEnv health endpoint: report a fixed healthy status for the `acre` env at `/`."""
    health = HealthResponse(status="healthy", env="acre", version="1.0.0")
    return health
|
| 217 |
|
| 218 |
|
| 219 |
@app.get("/health", response_model=CompatibilityHealthResponse)
def health_compat() -> CompatibilityHealthResponse:
    """Secondary compatibility health endpoint: report a fixed healthy status at `/health`."""
    health = CompatibilityHealthResponse(status="healthy", service="acre-env")
    return health
|
| 223 |
|
| 224 |
|
validate.py
CHANGED
|
@@ -204,28 +204,39 @@ def run_validation(base_url: str) -> int:
|
|
| 204 |
try:
|
| 205 |
inference_src = read_text("inference.py")
|
| 206 |
failures += 0 if check("inference.py exists", True) else 1
|
| 207 |
-
# Accept
|
| 208 |
-
#
|
| 209 |
-
#
|
| 210 |
-
#
|
| 211 |
-
# END <score>
|
| 212 |
json_markers_ok = all(m in inference_src for m in ['"event": "START"', '"event": "STEP"', '"event": "END"'])
|
|
|
|
| 213 |
line_markers_ok = all(m in inference_src for m in ["START ", "STEP ", "END "])
|
| 214 |
-
failures += 0 if check("inference.py emits START marker", json_markers_ok or line_markers_ok) else 1
|
| 215 |
-
failures += 0 if check("inference.py emits STEP marker", json_markers_ok or line_markers_ok) else 1
|
| 216 |
-
failures += 0 if check("inference.py emits END marker", json_markers_ok or line_markers_ok) else 1
|
| 217 |
failures += 0 if check(
|
| 218 |
"Uses OpenAI client",
|
| 219 |
"from openai import OpenAI" in inference_src,
|
| 220 |
) else 1
|
| 221 |
-
for var in ["API_BASE_URL", "MODEL_NAME", "
|
| 222 |
failures += 0 if check(f"inference.py reads {var} from env", var in inference_src) else 1
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
api_base_default_ok = (
|
| 224 |
'os.getenv("API_BASE_URL", "https://api.openai.com/v1")' in inference_src
|
| 225 |
or re.search(r'API_BASE_URL\s*=.*os\.getenv\("API_BASE_URL"\)\s*or\s*"https://api\.openai\.com/v1"', inference_src)
|
| 226 |
is not None
|
| 227 |
)
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
model_default_ok = (
|
| 231 |
'os.getenv("MODEL_NAME", "gpt-4o-mini")' in inference_src
|
|
@@ -233,11 +244,18 @@ def run_validation(base_url: str) -> int:
|
|
| 233 |
)
|
| 234 |
failures += 0 if check("MODEL_NAME has a default", model_default_ok) else 1
|
| 235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
hf_token_no_default_ok = (
|
| 237 |
-
re.search(r'HF_TOKEN\s*=.*os\.getenv\("HF_TOKEN"\)
|
| 238 |
and re.search(r'os\.getenv\("HF_TOKEN"\s*,', inference_src) is None
|
| 239 |
)
|
| 240 |
-
failures += 0 if check(
|
|
|
|
|
|
|
|
|
|
| 241 |
except FileNotFoundError:
|
| 242 |
failures += 1
|
| 243 |
check("inference.py exists", False, "file not found")
|
|
|
|
| 204 |
try:
|
| 205 |
inference_src = read_text("inference.py")
|
| 206 |
failures += 0 if check("inference.py exists", True) else 1
|
| 207 |
+
# Accept legacy JSON markers, legacy plain-line markers ("START ", "STEP ", "END "), and the modern strict bracketed format:
|
| 208 |
+
# [START] task=<task_id>
|
| 209 |
+
# [STEP] action=<action>
|
| 210 |
+
# [END] task=<task_id> score=<score>
|
|
|
|
| 211 |
json_markers_ok = all(m in inference_src for m in ['"event": "START"', '"event": "STEP"', '"event": "END"'])
|
| 212 |
+
bracket_markers_ok = all(m in inference_src for m in ["[START]", "[STEP]", "[END]"])
|
| 213 |
line_markers_ok = all(m in inference_src for m in ["START ", "STEP ", "END "])
|
| 214 |
+
failures += 0 if check("inference.py emits START marker", json_markers_ok or line_markers_ok or bracket_markers_ok) else 1
|
| 215 |
+
failures += 0 if check("inference.py emits STEP marker", json_markers_ok or line_markers_ok or bracket_markers_ok) else 1
|
| 216 |
+
failures += 0 if check("inference.py emits END marker", json_markers_ok or line_markers_ok or bracket_markers_ok) else 1
|
| 217 |
failures += 0 if check(
|
| 218 |
"Uses OpenAI client",
|
| 219 |
"from openai import OpenAI" in inference_src,
|
| 220 |
) else 1
|
| 221 |
+
for var in ["API_BASE_URL", "MODEL_NAME", "ENV_URL", "LOCAL_IMAGE_NAME"]:
|
| 222 |
failures += 0 if check(f"inference.py reads {var} from env", var in inference_src) else 1
|
| 223 |
+
failures += 0 if check(
|
| 224 |
+
"inference.py reads API credentials from env (API_KEY or HF_TOKEN)",
|
| 225 |
+
("API_KEY" in inference_src) or ("HF_TOKEN" in inference_src),
|
| 226 |
+
) else 1
|
| 227 |
api_base_default_ok = (
|
| 228 |
'os.getenv("API_BASE_URL", "https://api.openai.com/v1")' in inference_src
|
| 229 |
or re.search(r'API_BASE_URL\s*=.*os\.getenv\("API_BASE_URL"\)\s*or\s*"https://api\.openai\.com/v1"', inference_src)
|
| 230 |
is not None
|
| 231 |
)
|
| 232 |
+
api_base_env_required_ok = (
|
| 233 |
+
re.search(r'base_url\s*=\s*os\.getenv\("API_BASE_URL"\)', inference_src) is not None
|
| 234 |
+
or re.search(r'base_url\s*=\s*os\.environ\["API_BASE_URL"\]', inference_src) is not None
|
| 235 |
+
)
|
| 236 |
+
failures += 0 if check(
|
| 237 |
+
"API_BASE_URL handling is valid (default or strict env)",
|
| 238 |
+
api_base_default_ok or api_base_env_required_ok,
|
| 239 |
+
) else 1
|
| 240 |
|
| 241 |
model_default_ok = (
|
| 242 |
'os.getenv("MODEL_NAME", "gpt-4o-mini")' in inference_src
|
|
|
|
| 244 |
)
|
| 245 |
failures += 0 if check("MODEL_NAME has a default", model_default_ok) else 1
|
| 246 |
|
| 247 |
+
api_key_no_default_ok = (
|
| 248 |
+
re.search(r'API_KEY\s*=.*os\.getenv\("API_KEY"\)', inference_src) is not None
|
| 249 |
+
and re.search(r'os\.getenv\("API_KEY"\s*,', inference_src) is None
|
| 250 |
+
)
|
| 251 |
hf_token_no_default_ok = (
|
| 252 |
+
re.search(r'HF_TOKEN\s*=.*os\.getenv\("HF_TOKEN"\)', inference_src) is not None
|
| 253 |
and re.search(r'os\.getenv\("HF_TOKEN"\s*,', inference_src) is None
|
| 254 |
)
|
| 255 |
+
failures += 0 if check(
|
| 256 |
+
"API key variable has no default",
|
| 257 |
+
api_key_no_default_ok or hf_token_no_default_ok,
|
| 258 |
+
) else 1
|
| 259 |
except FileNotFoundError:
|
| 260 |
failures += 1
|
| 261 |
check("inference.py exists", False, "file not found")
|