Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files
- Dockerfile +1 -1
- README.md +42 -18
- client.py +7 -1
- inference.py +10 -1
Dockerfile
CHANGED
|
@@ -54,7 +54,7 @@ ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
|
| 54 |
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
| 55 |
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
|
| 56 |
|
| 57 |
-
# Enable web interface
|
| 58 |
ENV ENABLE_WEB_INTERFACE=true
|
| 59 |
|
| 60 |
# Run the server
|
|
|
|
| 54 |
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
| 55 |
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
|
| 56 |
|
| 57 |
+
# Enable web interface (default OpenEnv UI at /web; custom Gradio at /ui)
|
| 58 |
ENV ENABLE_WEB_INTERFACE=true
|
| 59 |
|
| 60 |
# Run the server
|
README.md
CHANGED
|
@@ -162,38 +162,62 @@ docker run -p 8000:8000 api-testing-env
|
|
| 162 |
curl -X POST http://localhost:8000/reset -H 'Content-Type: application/json' -d '{}'
|
| 163 |
```
|
| 164 |
|
| 165 |
-
### Inference (`inference.py`)
|
| 166 |
|
| 167 |
-
The
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
```bash
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
|
|
|
| 176 |
|
| 177 |
-
#
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
python inference.py
|
| 180 |
|
| 181 |
-
#
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
-
#
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
```
|
| 187 |
|
| 188 |
-
|
| 189 |
-
JSON action plan against the env, and emits exactly:
|
| 190 |
|
| 191 |
```
|
| 192 |
-
[START] task=basic_validation env=api_testing_env model=
|
| 193 |
[STEP] step=1 action=GET_/tasks reward=0.33 done=false error=null
|
| 194 |
[STEP] step=2 action=POST_/tasks reward=0.28 done=false error=null
|
| 195 |
...
|
| 196 |
-
[END] success=true steps=
|
| 197 |
```
|
| 198 |
|
| 199 |
Each per-task `score` is normalized to **[0, 1]** as
|
|
|
|
| 162 |
curl -X POST http://localhost:8000/reset -H 'Content-Type: application/json' -d '{}'
|
| 163 |
```
|
| 164 |
|
| 165 |
+
### Inference (`inference.py`) — SUBMISSION ENTRY POINT
|
| 166 |
|
| 167 |
+
This is the script that judges run to evaluate this environment. It uses an OpenAI-compatible
|
| 168 |
+
client, makes **one LLM call per task** in plan mode, executes the returned JSON
|
| 169 |
+
action plan against the env, and emits the mandatory `[START] / [STEP] / [END]`
|
| 170 |
+
log lines.
|
| 171 |
+
|
| 172 |
+
#### Required Environment Variables
|
| 173 |
+
|
| 174 |
+
| Variable | Purpose |
|
| 175 |
+
|----------|---------|
|
| 176 |
+
| `API_BASE_URL` | OpenAI-compatible LLM endpoint (default: HuggingFace router) |
|
| 177 |
+
| `MODEL_NAME` | Model identifier to use for inference |
|
| 178 |
+
| `HF_TOKEN` | HuggingFace token (used as API key) |
|
| 179 |
+
|
| 180 |
+
#### Run Command (the format judges use)
|
| 181 |
|
| 182 |
```bash
|
| 183 |
+
API_BASE_URL=https://router.huggingface.co/v1 \
|
| 184 |
+
MODEL_NAME=meta-llama/Llama-3.3-70B-Instruct \
|
| 185 |
+
HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx \
|
| 186 |
+
python inference.py
|
| 187 |
+
```
|
| 188 |
|
| 189 |
+
#### Optional — Choose How to Attach to the Environment
|
| 190 |
+
|
| 191 |
+
```bash
|
| 192 |
+
# (a) In-process — default, fastest, no Docker
|
| 193 |
+
API_BASE_URL=https://router.huggingface.co/v1 \
|
| 194 |
+
MODEL_NAME=meta-llama/Llama-3.3-70B-Instruct \
|
| 195 |
+
HF_TOKEN=hf_xxx \
|
| 196 |
python inference.py
|
| 197 |
|
| 198 |
+
# (b) Against a built Docker image
|
| 199 |
+
API_BASE_URL=https://router.huggingface.co/v1 \
|
| 200 |
+
MODEL_NAME=meta-llama/Llama-3.3-70B-Instruct \
|
| 201 |
+
HF_TOKEN=hf_xxx \
|
| 202 |
+
IMAGE_NAME=api-testing-env:latest \
|
| 203 |
+
python inference.py
|
| 204 |
|
| 205 |
+
# (c) Against a deployed HuggingFace Space
|
| 206 |
+
API_BASE_URL=https://router.huggingface.co/v1 \
|
| 207 |
+
MODEL_NAME=meta-llama/Llama-3.3-70B-Instruct \
|
| 208 |
+
HF_TOKEN=hf_xxx \
|
| 209 |
+
ENV_BASE_URL=https://Mayank022-api-testing-env.hf.space \
|
| 210 |
+
python inference.py
|
| 211 |
```
|
| 212 |
|
| 213 |
+
#### Mandatory Output Format (parsed by the OpenEnv judge)
|
|
|
|
| 214 |
|
| 215 |
```
|
| 216 |
+
[START] task=basic_validation env=api_testing_env model=meta-llama/Llama-3.3-70B-Instruct
|
| 217 |
[STEP] step=1 action=GET_/tasks reward=0.33 done=false error=null
|
| 218 |
[STEP] step=2 action=POST_/tasks reward=0.28 done=false error=null
|
| 219 |
...
|
| 220 |
+
[END] success=true steps=21 score=0.82 rewards=0.33,0.28,...
|
| 221 |
```
|
| 222 |
|
| 223 |
Each per-task `score` is normalized to **[0, 1]** as
|
client.py
CHANGED
|
@@ -5,7 +5,13 @@ from typing import Dict
|
|
| 5 |
from openenv.core.client_types import StepResult
|
| 6 |
from openenv.core import EnvClient
|
| 7 |
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
class APITestEnv(
|
|
|
|
| 5 |
from openenv.core.client_types import StepResult
|
| 6 |
from openenv.core import EnvClient
|
| 7 |
|
| 8 |
+
# Support both package import (`from api_testing_env.client import ...`)
|
| 9 |
+
# and flat-module import (`from client import ...` from inference.py).
|
| 10 |
+
# `inference.py` injects its own directory into sys.path so the fallback works.
|
| 11 |
+
try:
|
| 12 |
+
from .models import APITestAction, APITestObservation, APITestState
|
| 13 |
+
except ImportError: # pragma: no cover - flat-module fallback for inference.py
|
| 14 |
+
from models import APITestAction, APITestObservation, APITestState # type: ignore[no-redef,import-not-found]
|
| 15 |
|
| 16 |
|
| 17 |
class APITestEnv(
|
inference.py
CHANGED
|
@@ -126,10 +126,19 @@ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[
|
|
| 126 |
|
| 127 |
|
| 128 |
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 130 |
print(
|
| 131 |
f"[END] success={str(success).lower()} steps={steps} "
|
| 132 |
-
f"score={score:.
|
| 133 |
flush=True,
|
| 134 |
)
|
| 135 |
|
|
|
|
| 126 |
|
| 127 |
|
| 128 |
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
|
| 129 |
+
"""Emit the [END] line in the EXACT format expected by the OpenEnv judge.
|
| 130 |
+
|
| 131 |
+
Spec format (from problem statement):
|
| 132 |
+
[END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
|
| 133 |
+
Spec example:
|
| 134 |
+
[END] success=true steps=3 score=1.00 rewards=0.00,0.00,1.00
|
| 135 |
+
|
| 136 |
+
`score` and every entry in `rewards` use 2-decimal format to match the spec example; `steps` is printed as a plain integer.
|
| 137 |
+
"""
|
| 138 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 139 |
print(
|
| 140 |
f"[END] success={str(success).lower()} steps={steps} "
|
| 141 |
+
f"score={score:.2f} rewards={rewards_str}",
|
| 142 |
flush=True,
|
| 143 |
)
|
| 144 |
|