File size: 17,559 Bytes
a4f74f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5936836
 
 
 
 
 
 
 
 
a4f74f3
 
 
5936836
a4f74f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
#!/usr/bin/env python3
"""
inference.py β€” OpenEnv API Testing Environment baseline inference script.

Runs an LLM agent against the API Testing Environment for all 3 tasks
(basic_validation -> edge_cases -> security_workflows) and emits the
mandatory [START]/[STEP]/[END] stdout format used by the OpenEnv judging
pipeline.

Required env vars (per OpenEnv submission spec):
    API_BASE_URL   The OpenAI-compatible LLM endpoint
    MODEL_NAME     The model identifier to use for inference
    HF_TOKEN       Bearer token for the LLM endpoint (or API_KEY)

Optional env vars:
    IMAGE_NAME            Docker image to spin up the env via from_docker_image()
    LOCAL_IMAGE_NAME      Alias for IMAGE_NAME
    ENV_BASE_URL          URL of an already-running env server (e.g. http://localhost:8000)
    INFERENCE_TASKS       Comma-separated subset of tasks to run (default: all 3)
    INFERENCE_MAX_STEPS   Override max steps per task
    INFERENCE_TEMPERATURE Default 0.4
    INFERENCE_MAX_TOKENS  Default 4096 (plan completions need room for ~25 actions)

The script uses PLAN MODE: one LLM call per task produces a complete JSON
test plan, then the env executes each action sequentially. This matches the
GRPO training distribution and keeps total LLM cost to 3 calls per run, so
the script comfortably runs under 20 min on 2 vCPU / 8 GB RAM.

Usage:
    # Local in-process (no Docker, fastest)
    python inference.py

    # Against a built docker image
    IMAGE_NAME=api-testing-env:latest python inference.py

    # Against an already running server
    ENV_BASE_URL=http://localhost:8000 python inference.py

    # Against a deployed HF Space
    ENV_BASE_URL=https://your-user-api-testing-env.hf.space python inference.py
"""

import json
import os
import sys
import time
import traceback
from typing import Any, Optional

# Make sibling modules importable when run from the repo root
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
if _THIS_DIR not in sys.path:
    sys.path.insert(0, _THIS_DIR)

# Auto-load .env file if present (for local development)
# Judges set env vars directly so this is harmless in production
try:
    from dotenv import load_dotenv
    _env_path = os.path.join(_THIS_DIR, ".env")
    if os.path.exists(_env_path):
        load_dotenv(_env_path)
except ImportError:
    pass  # python-dotenv is optional

from openai import OpenAI

from models import APITestAction, HTTPMethod  # noqa: E402
from training.prompts import (  # noqa: E402
    PLAN_SYSTEM_PROMPT,
    format_plan_prompt,
    parse_test_plan,
)


# ---------------------------------------------------------------------------
# Config (env vars per OpenEnv spec)
# ---------------------------------------------------------------------------

API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
# Default model: must be available on the HuggingFace Inference Router.
# Llama-3.3-70B-Instruct is reliable, follows JSON instructions well, and free.
# Override via: MODEL_NAME=other/model python inference.py
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct")
# HF_TOKEN takes precedence; API_KEY is accepted as an alias for non-HF endpoints.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")

# Fail fast with actionable guidance — every LLM call below requires a token.
if not API_KEY:
    print(
        "[ERROR] No HF_TOKEN or API_KEY found in environment.\n"
        "  Set one of:\n"
        "    export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx\n"
        "  Or create a .env file in this directory with:\n"
        "    HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx\n"
        "  Get a token from: https://huggingface.co/settings/tokens\n"
        "  Make sure it has 'Make calls to Inference Providers' permission.",
        file=sys.stderr,
    )
    sys.exit(1)

# Environment connection selectors — which one is set decides the mode
# picked by _EnvHandle.open() (docker image / remote URL / in-process).
IMAGE_NAME = os.getenv("IMAGE_NAME") or os.getenv("LOCAL_IMAGE_NAME")
ENV_BASE_URL = os.getenv("ENV_BASE_URL")

BENCHMARK = "api_testing_env"
DEFAULT_TASKS = ["basic_validation", "edge_cases", "security_workflows"]
# INFERENCE_TASKS may select a subset, e.g. "basic_validation,edge_cases";
# empty segments are discarded so trailing commas are harmless.
TASKS = [t.strip() for t in os.getenv("INFERENCE_TASKS", ",".join(DEFAULT_TASKS)).split(",") if t.strip()]

TEMPERATURE = float(os.getenv("INFERENCE_TEMPERATURE", "0.4"))
MAX_TOKENS = int(os.getenv("INFERENCE_MAX_TOKENS", "4096"))
_MAX_STEPS_OVERRIDE = os.getenv("INFERENCE_MAX_STEPS")
# None means "use the env's own max_steps from the reset observation".
MAX_STEPS_OVERRIDE: Optional[int] = int(_MAX_STEPS_OVERRIDE) if _MAX_STEPS_OVERRIDE else None


# ---------------------------------------------------------------------------
# Strict stdout logging β€” these line formats are checked by the judge
# ---------------------------------------------------------------------------

def log_start(task: str, env: str, model: str) -> None:
    """Emit the judge-required [START] marker line for one task episode."""
    line = f"[START] task={task} env={env} model={model}"
    print(line, flush=True)


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit one judge-required [STEP] line for a single env.step() call.

    Rewards print with 2 decimals, done as lowercase true/false, and a
    missing error renders as the literal string "null".
    """
    err_field = error if error else "null"
    fields = (
        f"[STEP] step={step} action={action} reward={reward:.2f}",
        f"done={str(done).lower()} error={err_field}",
    )
    print(" ".join(fields), flush=True)


def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
    """Emit the [END] line in the EXACT format expected by the OpenEnv judge.

    Spec format (from problem statement):
        [END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
    Spec example:
        [END] success=true steps=3 score=1.00 rewards=0.00,0.00,1.00

    All numeric fields use 2-decimal format to match the spec example.
    """
    joined_rewards = ",".join(format(r, ".2f") for r in rewards)
    parts = [
        f"[END] success={str(success).lower()}",
        f"steps={steps}",
        f"score={score:.2f}",
        f"rewards={joined_rewards}",
    ]
    print(" ".join(parts), flush=True)


def _action_str(action: APITestAction) -> str:
    """Compact human-readable action label for the [STEP] line."""
    raw_method = action.method
    # Enum members expose .value; fall back to str() for plain strings.
    if hasattr(raw_method, "value"):
        method = raw_method.value
    else:
        method = str(raw_method)
    return f"{method}_{action.endpoint}"


# ---------------------------------------------------------------------------
# LLM call β€” plan mode (one completion per task)
# ---------------------------------------------------------------------------

def get_plan_from_llm(client: OpenAI, observation) -> str:
    """Ask the LLM for a complete JSON test plan for this task.

    Wraps the array in {"actions": [...]} so we can use OpenAI structured
    output mode (`response_format={"type": "json_object"}`), which forces
    the LLM to produce valid JSON. This is much more reliable than asking
    for a raw JSON array.

    Args:
        client: OpenAI-compatible client pointed at API_BASE_URL.
        observation: env reset/step observation, formatted into the user
            prompt by ``format_plan_prompt``.

    Returns:
        The raw completion text (may be empty on total failure); parsing
        into actions is left to the caller.
    """
    user_prompt = format_plan_prompt(observation)

    # Stronger system prompt for structured output mode
    system_prompt = (
        PLAN_SYSTEM_PROMPT
        + "\n\nIMPORTANT: Output a JSON object with a single key 'actions' "
        + "containing the array of actions:\n"
        + '{"actions": [{"method": "GET", "endpoint": "/tasks", "headers": {}, '
        + '"query_params": {}, "body": null, "expected_status": 200}, ...]}'
    )

    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            response_format={"type": "json_object"},  # forces valid JSON
            stream=False,
        )
        text = (completion.choices[0].message.content or "").strip()
        print(f"[DEBUG] LLM response length: {len(text)} chars", flush=True)
        if len(text) > 0:
            preview = text[:300].replace("\n", " ")
            print(f"[DEBUG] LLM response preview: {preview}...", flush=True)
        else:
            # Empty content usually means truncation or a provider refusal;
            # surface finish_reason to make that diagnosable from logs.
            print("[DEBUG] LLM returned EMPTY string", flush=True)
            if hasattr(completion, "choices") and completion.choices:
                finish_reason = getattr(completion.choices[0], "finish_reason", None)
                print(f"[DEBUG] finish_reason: {finish_reason}", flush=True)
        return text
    except Exception as exc:  # noqa: BLE001
        print(f"[DEBUG] structured-output call failed ({type(exc).__name__}: {exc}), retrying without response_format...", flush=True)
        # Some providers don't support response_format — fall back to plain text
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": PLAN_SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=TEMPERATURE,
                max_tokens=MAX_TOKENS,
                stream=False,
            )
            text = (completion.choices[0].message.content or "").strip()
            print(f"[DEBUG] fallback LLM response length: {len(text)} chars", flush=True)
            return text
        except Exception as exc2:  # noqa: BLE001
            # Both attempts failed — return "" and let the caller record
            # "no_plan_parsed" rather than crashing the whole run.
            print(f"[DEBUG] fallback LLM call failed: {type(exc2).__name__}: {exc2}", flush=True)
            return ""


# ---------------------------------------------------------------------------
# Per-task scoring helper β€” keeps the score in [0, 1]
# ---------------------------------------------------------------------------

def compute_task_score(state, total_step_reward: float) -> float:
    """Combine grader signals into a single normalized score in [0, 1].

    The server already runs `TaskGrader.grade(...)` at episode end and adds
    that score (already in [0, 1]) on top of the last step reward. We do
    NOT trust the raw step rewards — those are sums of partial signals and
    can exceed 1.0. Instead we derive the score from the published state:
        score = 0.7 * (bugs_found / total_bugs) + 0.3 * (coverage_pct / 100)
    which is bounded in [0, 1] and rewards both finding bugs and coverage.
    """
    # `or 0` also normalizes None values published by the state object.
    found = getattr(state, "bugs_found", 0) or 0
    total = getattr(state, "total_bugs", 0) or 0
    pct = getattr(state, "coverage_pct", 0.0) or 0.0

    if total > 0:
        bug_part = found / total
    else:
        bug_part = 0.0
    coverage_part = min(1.0, max(0.0, pct / 100.0))

    combined = 0.70 * bug_part + 0.30 * coverage_part
    return min(1.0, max(0.0, combined))


# ---------------------------------------------------------------------------
# Environment connector β€” supports docker / remote / in-process
# ---------------------------------------------------------------------------

class _EnvHandle:
    """Thin wrapper that exposes a uniform reset/step/state/close API.

    Three modes, picked automatically:
        1. IMAGE_NAME set         -> APITestEnv.from_docker_image(IMAGE_NAME)
        2. ENV_BASE_URL set       -> APITestEnv(base_url=ENV_BASE_URL)
        3. neither set (default)  -> APITestEnvironment() in-process
    """

    def __init__(self):
        self._mode: str = ""
        # Exactly one of these is populated by open():
        self._client = None  # remote/docker client
        self._env = None     # in-process env

    def open(self):
        """Select a backend from IMAGE_NAME / ENV_BASE_URL and connect; returns self."""
        if IMAGE_NAME:
            from client import APITestEnv
            self._mode = "docker"
            self._client = APITestEnv.from_docker_image(IMAGE_NAME)
            return self
        if ENV_BASE_URL:
            from client import APITestEnv
            self._mode = "remote"
            self._client = APITestEnv(base_url=ENV_BASE_URL)
            # Some client versions expose an explicit connect() step.
            if hasattr(self._client, "connect"):
                self._client.connect()
            return self
        from server.environment import APITestEnvironment
        self._mode = "local"
        self._env = APITestEnvironment()
        return self

    @property
    def mode(self) -> str:
        return self._mode

    def reset(self, task_id: str, seed: int = 42):
        """Reset an episode; returns (observation, raw_result_or_None)."""
        if self._mode in {"docker", "remote"}:
            outcome = self._client.reset(task_id=task_id, seed=seed)
            return outcome.observation, outcome
        observation = self._env.reset(seed=seed, task_id=task_id)
        return observation, None

    def step(self, action: APITestAction):
        """Execute one action; returns (observation, reward, done)."""
        if self._mode in {"docker", "remote"}:
            outcome = self._client.step(action)
            return outcome.observation, outcome.reward or 0.0, outcome.done
        observation = self._env.step(action)
        return observation, (observation.reward or 0.0), observation.done

    def state(self):
        """Return the backend's published state object."""
        if self._mode in {"docker", "remote"}:
            return self._client.state()
        return self._env.state

    def close(self):
        """Best-effort close of the remote/docker client; never raises."""
        try:
            if self._client is not None and hasattr(self._client, "close"):
                self._client.close()
        except Exception as exc:  # noqa: BLE001
            print(f"[DEBUG] env close error: {exc}", flush=True)


# ---------------------------------------------------------------------------
# One full episode (one task) -> emits [START] / [STEP]* / [END]
# ---------------------------------------------------------------------------

def run_task(env: _EnvHandle, client: OpenAI, task_id: str, seed: int = 42) -> dict:
    """Run one full episode for *task_id* and emit [START]/[STEP]*/[END].

    Flow: reset the env, request one complete plan from the LLM, parse it
    into actions (with a lenient fallback parser), execute each action as
    one env.step() with a matching [STEP] line, then derive the score from
    the final env state via compute_task_score.

    Returns a summary dict with keys: task_id, success, steps, score,
    rewards, error — consumed by main() for the aggregate report.
    """
    rewards: list[float] = []
    steps_taken = 0
    last_error: Optional[str] = None
    score = 0.0

    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    try:
        obs, _ = env.reset(task_id=task_id, seed=seed)
        max_steps = MAX_STEPS_OVERRIDE or getattr(obs, "max_steps", 25)

        # 1) Ask the LLM for a full plan
        plan_text = get_plan_from_llm(client, obs)
        actions = parse_test_plan(plan_text) if plan_text else []

        # Fallback: if parser failed but we have text, try a more lenient parse
        if not actions and plan_text:
            print(f"[DEBUG] {task_id}: parse_test_plan returned 0, trying lenient parse...", flush=True)
            try:
                import json as _json, re as _re
                # Try to find any JSON array of objects in the text
                cleaned = plan_text
                # Drop any reasoning prefix from "thinking" models.
                if "</think>" in cleaned:
                    cleaned = cleaned.split("</think>", 1)[-1]
                # Find first [ and last ]
                start = cleaned.find("[")
                end = cleaned.rfind("]")
                if start >= 0 and end > start:
                    arr_str = cleaned[start:end+1]
                    raw = _json.loads(arr_str)
                    if isinstance(raw, list):
                        # Reuse the training-side converter so action
                        # validation matches parse_test_plan's behavior.
                        from training.prompts import _dict_to_action
                        for item in raw:
                            if isinstance(item, dict) and "method" in item:
                                a = _dict_to_action(item)
                                if a:
                                    actions.append(a)
                        print(f"[DEBUG] {task_id}: lenient parse recovered {len(actions)} actions", flush=True)
            except Exception as exc:
                print(f"[DEBUG] {task_id}: lenient parse failed: {exc}", flush=True)
        if not actions:
            last_error = "no_plan_parsed"
            print(f"[DEBUG] {task_id}: model produced 0 valid actions", flush=True)

        # Never exceed the episode's step budget.
        actions = actions[:max_steps]

        # 2) Execute each action and emit one [STEP] line per env.step()
        done = False
        for i, action in enumerate(actions, start=1):
            if done:
                break
            try:
                obs, reward, done = env.step(action)
                rewards.append(float(reward))
                steps_taken = i
                log_step(step=i, action=_action_str(action), reward=reward, done=done, error=None)
            except Exception as exc:  # noqa: BLE001
                # A failed step logs reward 0.0 and keeps executing the plan.
                last_error = f"{type(exc).__name__}: {exc}"
                rewards.append(0.0)
                steps_taken = i
                log_step(step=i, action=_action_str(action), reward=0.0, done=False, error=last_error)

        # 3) Score from final state
        try:
            final_state = env.state()
            score = compute_task_score(final_state, sum(rewards))
        except Exception as exc:  # noqa: BLE001
            last_error = last_error or f"state_error: {exc}"
            score = 0.0

    except Exception as exc:  # noqa: BLE001
        # Reset/plan failures still produce a well-formed [END] line below.
        last_error = f"{type(exc).__name__}: {exc}"
        traceback.print_exc()

    success = score >= 0.20  # any meaningful progress counts as a successful episode
    log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return {
        "task_id": task_id,
        "success": success,
        "steps": steps_taken,
        "score": score,
        "rewards": rewards,
        "error": last_error,
    }


# ---------------------------------------------------------------------------
# Main β€” runs all 3 tasks sequentially against ONE env handle
# ---------------------------------------------------------------------------

def main() -> None:
    """Run every configured task against one env handle and print a summary."""
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    print(
        f"[DEBUG] inference.py starting | model={MODEL_NAME} | "
        f"base_url={API_BASE_URL} | tasks={TASKS}",
        flush=True,
    )

    env = _EnvHandle().open()
    print(f"[DEBUG] env mode={env.mode}", flush=True)

    started = time.time()
    results: list[dict] = []
    try:
        for task in TASKS:
            results.append(run_task(env, client, task_id=task, seed=42))
    finally:
        # Always release the env (docker container / remote connection).
        env.close()

    duration = time.time() - started
    mean_score = sum(item["score"] for item in results) / max(len(results), 1)
    print(
        f"[DEBUG] inference.py finished in {duration:.1f}s | "
        f"avg_score={mean_score:.3f}",
        flush=True,
    )
    per_task = {item["task_id"]: round(item["score"], 3) for item in results}
    print("[DEBUG] per-task scores: " + json.dumps(per_task), flush=True)


if __name__ == "__main__":
    main()