Spaces:
Sleeping
Sleeping
Commit ·
fe45227
1
Parent(s): 379f291
docs: professionalize README and trim repo utilities
Browse files- .env.example +9 -9
- README.md +254 -209
- baseline_runner.py +3 -3
- scripts/live_provider_audit.py +0 -34
- scripts/problem_statement_audit.py +0 -268
- scripts/run_baseline.py +0 -16
.env.example
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# Baseline provider selection
|
| 2 |
-
BASELINE_PROVIDER=
|
| 3 |
-
BASELINE_MODEL=
|
| 4 |
BASELINE_REQUEST_TIMEOUT_SECONDS=4
|
| 5 |
PROVIDER_RATE_LIMIT_RETRIES=0
|
| 6 |
PROVIDER_RETRY_BACKOFF_SECONDS=0.5
|
|
@@ -9,16 +9,16 @@ STRICT_LLM_MODE=0
|
|
| 9 |
# Challenge-compatible inference contract
|
| 10 |
# `HF_TOKEN` is the generic API key passed to the OpenAI client for the selected base URL.
|
| 11 |
# For OpenRouter, set it to your OpenRouter key. For Groq, set it to your Groq key.
|
| 12 |
-
API_BASE_URL=https://
|
| 13 |
-
MODEL_NAME=
|
| 14 |
HF_TOKEN=
|
| 15 |
INFERENCE_TIMEOUT_SECONDS=4
|
| 16 |
|
| 17 |
-
# Alternative
|
| 18 |
-
# BASELINE_PROVIDER=
|
| 19 |
-
# BASELINE_MODEL=
|
| 20 |
-
# API_BASE_URL=https://
|
| 21 |
-
# MODEL_NAME=
|
| 22 |
|
| 23 |
# OpenRouter setup
|
| 24 |
OPENROUTER_API_KEY=
|
|
|
|
| 1 |
# Baseline provider selection
|
| 2 |
+
BASELINE_PROVIDER=openrouter
|
| 3 |
+
BASELINE_MODEL=openai/gpt-oss-120b
|
| 4 |
BASELINE_REQUEST_TIMEOUT_SECONDS=4
|
| 5 |
PROVIDER_RATE_LIMIT_RETRIES=0
|
| 6 |
PROVIDER_RETRY_BACKOFF_SECONDS=0.5
|
|
|
|
| 9 |
# Challenge-compatible inference contract
|
| 10 |
# `HF_TOKEN` is the generic API key passed to the OpenAI client for the selected base URL.
|
| 11 |
# For OpenRouter, set it to your OpenRouter key. For Groq, set it to your Groq key.
|
| 12 |
+
API_BASE_URL=https://openrouter.ai/api/v1
|
| 13 |
+
MODEL_NAME=openai/gpt-oss-120b
|
| 14 |
HF_TOKEN=
|
| 15 |
INFERENCE_TIMEOUT_SECONDS=4
|
| 16 |
|
| 17 |
+
# Alternative Groq path:
|
| 18 |
+
# BASELINE_PROVIDER=groq
|
| 19 |
+
# BASELINE_MODEL=llama-3.3-70b-versatile
|
| 20 |
+
# API_BASE_URL=https://api.groq.com/openai/v1
|
| 21 |
+
# MODEL_NAME=llama-3.3-70b-versatile
|
| 22 |
|
| 23 |
# OpenRouter setup
|
| 24 |
OPENROUTER_API_KEY=
|
README.md
CHANGED
|
@@ -1,8 +1,5 @@
|
|
| 1 |
---
|
| 2 |
title: ChargebackOps
|
| 3 |
-
emoji: 💳
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: gray
|
| 6 |
sdk: docker
|
| 7 |
app_port: 8000
|
| 8 |
tags:
|
|
@@ -11,328 +8,376 @@ tags:
|
|
| 11 |
|
| 12 |
# ChargebackOps
|
| 13 |
|
| 14 |
-
ChargebackOps is a real-world OpenEnv environment for merchant-side
|
| 15 |
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
-
|
| 25 |
-
-
|
|
|
|
| 26 |
- choose whether to contest, accept, or refund
|
| 27 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
## Environment
|
| 32 |
|
| 33 |
-
|
| 34 |
|
| 35 |
-
|
| 36 |
-
- payment gateway ledger
|
| 37 |
-
- shipping and delivery records
|
| 38 |
-
- customer support transcripts
|
| 39 |
-
- refund ledger
|
| 40 |
-
- fraud and device-risk summaries
|
| 41 |
-
- dispute policy guidance by reason code
|
| 42 |
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
|
| 46 |
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
|
| 50 |
-
- `inspect_case`: reveal merchant-side notes for the selected case
|
| 51 |
-
- `query_system`: inspect one of `orders`, `payment`, `shipping`, `support`, `refunds`, or `risk`
|
| 52 |
-
- `retrieve_policy`: load reason-code guidance and required evidence hints
|
| 53 |
-
- `add_evidence`: attach one or more revealed evidence items
|
| 54 |
-
- `remove_evidence`: remove attached evidence
|
| 55 |
-
- `set_strategy`: set `contest`, `accept_chargeback`, or `issue_refund`
|
| 56 |
-
- `submit_representment`: submit the contest package
|
| 57 |
-
- `resolve_case`: resolve a case via `accept_chargeback` or `issue_refund`
|
| 58 |
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
|
| 62 |
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
-
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
|
| 78 |
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
-
|
| 82 |
|
| 83 |
-
|
| 84 |
|
| 85 |
-
-
|
| 86 |
-
-
|
| 87 |
-
-
|
|
|
|
|
|
|
| 88 |
|
| 89 |
-
##
|
| 90 |
|
| 91 |
-
|
| 92 |
-
-
|
| 93 |
-
|
|
|
|
|
|
|
| 94 |
|
| 95 |
-
|
| 96 |
|
| 97 |
-
-
|
| 98 |
-
-
|
| 99 |
-
-
|
| 100 |
|
| 101 |
-
## Reward
|
| 102 |
|
| 103 |
-
ChargebackOps
|
| 104 |
|
| 105 |
-
Positive signals:
|
| 106 |
|
| 107 |
-
- selecting
|
| 108 |
-
-
|
| 109 |
-
-
|
| 110 |
-
-
|
| 111 |
-
-
|
| 112 |
-
-
|
|
|
|
| 113 |
|
| 114 |
-
Negative signals:
|
| 115 |
|
| 116 |
-
- duplicate or redundant queries
|
| 117 |
- invalid actions
|
|
|
|
| 118 |
- attaching harmful evidence
|
| 119 |
-
-
|
| 120 |
-
-
|
| 121 |
-
-
|
|
|
|
| 122 |
|
| 123 |
-
|
| 124 |
|
| 125 |
-
|
| 126 |
|
| 127 |
-
|
| 128 |
-
- evidence quality
|
| 129 |
-
- packet validity
|
| 130 |
-
- deadline compliance
|
| 131 |
-
- efficiency
|
| 132 |
-
- outcome quality
|
| 133 |
|
| 134 |
-
|
| 135 |
|
| 136 |
-
|
| 137 |
-
-
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
| 140 |
-
|
| 141 |
|
| 142 |
-
|
| 143 |
|
| 144 |
-
|
| 145 |
-
- default model: `llama-3.3-70b-versatile`
|
| 146 |
|
| 147 |
-
|
|
|
|
|
|
|
| 148 |
|
| 149 |
-
|
| 150 |
-
- Anthropic
|
| 151 |
-
- Groq
|
| 152 |
-
- OpenRouter (`nvidia/nemotron-3-super-120b-a12b:free`)
|
| 153 |
|
| 154 |
-
|
| 155 |
-
|
| 156 |
|
| 157 |
-
|
| 158 |
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
-
|
| 162 |
|
| 163 |
-
- `BASELINE_PROVIDER`
|
| 164 |
-
- `BASELINE_MODEL`
|
| 165 |
-
- `BASELINE_REQUEST_TIMEOUT_SECONDS`
|
| 166 |
-
- `PROVIDER_RATE_LIMIT_RETRIES`
|
| 167 |
-
- `PROVIDER_RETRY_BACKOFF_SECONDS`
|
| 168 |
-
- `STRICT_LLM_MODE`
|
| 169 |
-
- `API_BASE_URL`
|
| 170 |
-
- `MODEL_NAME`
|
| 171 |
-
- `HF_TOKEN`
|
| 172 |
-
- `INFERENCE_TIMEOUT_SECONDS`
|
| 173 |
-
- `OPENROUTER_API_KEY`
|
| 174 |
-
- `OPENROUTER_HTTP_REFERER`
|
| 175 |
-
- `OPENROUTER_APP_TITLE`
|
| 176 |
- `OPENAI_API_KEY`
|
| 177 |
- `ANTHROPIC_API_KEY`
|
| 178 |
- `GROQ_API_KEY`
|
|
|
|
| 179 |
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
`HF_TOKEN` is the generic API key passed to the OpenAI client for the selected `API_BASE_URL`. For OpenRouter, put your OpenRouter key there. For Groq, point `API_BASE_URL` to `https://api.groq.com/openai/v1`, set `MODEL_NAME=llama-3.3-70b-versatile`, and put your Groq key in `HF_TOKEN`.
|
| 183 |
-
`PROVIDER_RATE_LIMIT_RETRIES` and `PROVIDER_RETRY_BACKOFF_SECONDS` control bounded retry behavior for transient provider rate limits and timeouts. The default `.env.example` keeps these low on purpose so `inference.py` stays within hackathon runtime expectations.
|
| 184 |
-
Set `STRICT_LLM_MODE=1` when you want evaluation to fail immediately on any provider fallback instead of silently dropping to the heuristic policy.
|
| 185 |
|
| 186 |
-
|
| 187 |
|
| 188 |
-
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
-
|
| 191 |
-
- `fraud_signal_ambiguity`: `0.7075`
|
| 192 |
-
- `queue_optimization_hard`: `0.7271`
|
| 193 |
-
- average: `0.7140`
|
| 194 |
|
| 195 |
-
|
| 196 |
-
The payload includes `provider_calls_attempted`, `provider_calls_succeeded`, and `provider_errors` so rate-limited free-model runs do not masquerade as successful live inference. If every provider request falls back locally, `mode` is reported as `heuristic_fallback`.
|
| 197 |
|
| 198 |
-
|
|
|
|
| 199 |
|
| 200 |
-
|
| 201 |
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
-
|
| 205 |
-
- `GET /grader`: latest grade report, or `?episode_id=<id>` for a specific episode
|
| 206 |
-
- `GET /baseline`: run the baseline with optional `provider` and `model_name`
|
| 207 |
|
| 208 |
-
##
|
| 209 |
|
| 210 |
-
|
| 211 |
|
| 212 |
-
``
|
| 213 |
-
|
| 214 |
-
``
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
-
##
|
| 217 |
|
| 218 |
-
|
| 219 |
-
uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 220 |
-
```
|
| 221 |
|
| 222 |
-
|
| 223 |
|
| 224 |
```bash
|
| 225 |
-
|
| 226 |
```
|
| 227 |
|
| 228 |
-
|
| 229 |
|
| 230 |
```bash
|
| 231 |
-
python
|
| 232 |
```
|
| 233 |
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
- easy / medium / hard task coverage
|
| 237 |
-
- deterministic grader behavior
|
| 238 |
-
- partial-progress reward shaping
|
| 239 |
-
- separation between a competent policy and a bad control policy
|
| 240 |
-
- `inference.py` contract
|
| 241 |
-
- `openenv validate`
|
| 242 |
-
- baseline and inference execution
|
| 243 |
-
|
| 244 |
-
This audit disables live provider keys on purpose so it stays deterministic and fast.
|
| 245 |
-
|
| 246 |
-
### 3b. Run the live-provider audit
|
| 247 |
|
| 248 |
```bash
|
| 249 |
-
|
| 250 |
```
|
| 251 |
|
| 252 |
-
|
| 253 |
-
The output also includes `provider_errors` so you can distinguish rate limits from connectivity or response-format failures.
|
| 254 |
-
|
| 255 |
-
### 4. Run the baseline
|
| 256 |
|
| 257 |
```bash
|
| 258 |
-
|
|
|
|
|
|
|
| 259 |
```
|
| 260 |
|
| 261 |
-
###
|
| 262 |
|
| 263 |
```bash
|
|
|
|
|
|
|
| 264 |
python inference.py
|
| 265 |
```
|
| 266 |
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
- `API_BASE_URL`
|
| 270 |
-
- `MODEL_NAME`
|
| 271 |
-
- `HF_TOKEN`
|
| 272 |
-
|
| 273 |
-
To use a provider-backed baseline:
|
| 274 |
|
| 275 |
```bash
|
| 276 |
-
|
| 277 |
```
|
| 278 |
|
| 279 |
-
|
|
|
|
|
|
|
| 280 |
|
| 281 |
```bash
|
| 282 |
-
|
|
|
|
| 283 |
```
|
| 284 |
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
Build from the project root:
|
| 288 |
|
| 289 |
```bash
|
| 290 |
-
|
| 291 |
-
|
|
|
|
| 292 |
```
|
| 293 |
|
| 294 |
-
|
| 295 |
|
| 296 |
-
|
| 297 |
|
| 298 |
-
|
| 299 |
|
| 300 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
-
|
| 303 |
-
openenv validate .
|
| 304 |
-
openenv push
|
| 305 |
-
```
|
| 306 |
|
| 307 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
|
| 309 |
```text
|
| 310 |
-
|
| 311 |
-
├── .env.example
|
| 312 |
-
├── README.md
|
| 313 |
├── baseline_runner.py
|
| 314 |
├── client.py
|
| 315 |
-
├── episode_store.py
|
| 316 |
├── grading.py
|
|
|
|
| 317 |
├── models.py
|
| 318 |
├── openenv.yaml
|
| 319 |
-
├── pyproject.toml
|
| 320 |
-
├── simulation.py
|
| 321 |
-
├── scripts/
|
| 322 |
-
│ └── run_baseline.py
|
| 323 |
├── server/
|
| 324 |
│ ├── app.py
|
| 325 |
-
│
|
| 326 |
-
|
| 327 |
└── tests/
|
| 328 |
-
├── conftest.py
|
| 329 |
-
├── test_api.py
|
| 330 |
-
├── test_env.py
|
| 331 |
-
└── test_grader.py
|
| 332 |
```
|
| 333 |
|
| 334 |
## Notes
|
| 335 |
|
| 336 |
-
-
|
| 337 |
-
- The
|
| 338 |
-
-
|
|
|
|
| 1 |
---
|
| 2 |
title: ChargebackOps
|
|
|
|
|
|
|
|
|
|
| 3 |
sdk: docker
|
| 4 |
app_port: 8000
|
| 5 |
tags:
|
|
|
|
| 8 |
|
| 9 |
# ChargebackOps
|
| 10 |
|
| 11 |
+
ChargebackOps is a real-world OpenEnv environment for merchant-side chargeback operations. An agent acts as a dispute analyst, works a queue of payment disputes, investigates evidence across synthetic internal systems, chooses whether to contest or concede, and is graded on recovery quality, deadline handling, and operational discipline.
|
| 12 |
|
| 13 |
+
The environment is designed for the Round 1 OpenEnv problem statement:
|
| 14 |
|
| 15 |
+
- Real-world task, not a game or toy
|
| 16 |
+
- Typed OpenEnv models and `reset()` / `step()` / `state()` support
|
| 17 |
+
- Three graded tasks with easy, medium, and hard difficulty
|
| 18 |
+
- Dense reward shaping with partial progress and negative signals
|
| 19 |
+
- Root-level `inference.py` that uses the OpenAI client contract
|
| 20 |
+
- Docker and Hugging Face Spaces deployment path
|
| 21 |
|
| 22 |
+
## Why This Environment Matters
|
| 23 |
|
| 24 |
+
Merchant dispute handling is a real operations workflow. Analysts do not just classify a ticket or answer a question. They must:
|
| 25 |
+
|
| 26 |
+
- inspect the dispute reason code and the response deadline
|
| 27 |
+
- gather evidence from the right internal systems
|
| 28 |
+
- avoid attaching evidence that weakens the case
|
| 29 |
- choose whether to contest, accept, or refund
|
| 30 |
+
- maximize recovery across a queue under limited time
|
| 31 |
+
|
| 32 |
+
That makes ChargebackOps a strong benchmark for tool-using agents. It tests retrieval, decision-making, prioritization, and operational restraint in a controlled environment with deterministic scoring.
|
| 33 |
+
|
| 34 |
+
## System Architecture
|
| 35 |
+
|
| 36 |
+
```mermaid
|
| 37 |
+
flowchart LR
|
| 38 |
+
A["Agent or inference.py"] --> B["OpenAI-compatible client<br/>API_BASE_URL + MODEL_NAME + HF_TOKEN"]
|
| 39 |
+
A --> C["ChargebackOps HTTP API"]
|
| 40 |
+
C --> D["OpenEnv server<br/>server.app"]
|
| 41 |
+
D --> E["ChargebackOpsEnvironment<br/>step / reset / state"]
|
| 42 |
+
E --> F["Task simulator<br/>simulation.py"]
|
| 43 |
+
E --> G["Dense reward shaping<br/>server/chargeback_ops_environment.py"]
|
| 44 |
+
E --> H["Deterministic grader<br/>grading.py"]
|
| 45 |
+
H --> I["Episode report store<br/>episode_store.py"]
|
| 46 |
+
D --> J["Utility routes<br/>/tasks /grader /baseline /health"]
|
| 47 |
+
```
|
| 48 |
|
| 49 |
+
## Episode Workflow
|
| 50 |
+
|
| 51 |
+
```mermaid
|
| 52 |
+
flowchart TD
|
| 53 |
+
A["reset(task_id)"] --> B["Select the next case from the queue"]
|
| 54 |
+
B --> C["Inspect case metadata"]
|
| 55 |
+
C --> D["Retrieve policy guidance"]
|
| 56 |
+
D --> E["Query merchant systems<br/>orders, payment, shipping, support, refunds, risk"]
|
| 57 |
+
E --> F["Attach or remove evidence"]
|
| 58 |
+
F --> G["Set strategy"]
|
| 59 |
+
G --> H{"contest?"}
|
| 60 |
+
H -->|yes| I["submit_representment"]
|
| 61 |
+
H -->|no| J["resolve_case<br/>accept_chargeback or issue_refund"]
|
| 62 |
+
I --> K{"all cases resolved or max steps reached?"}
|
| 63 |
+
J --> K
|
| 64 |
+
K -->|no| B
|
| 65 |
+
K -->|yes| L["grader computes final score 0.0 to 1.0"]
|
| 66 |
+
```
|
| 67 |
|
| 68 |
+
## Environment Design
|
| 69 |
|
| 70 |
+
### Internal systems
|
| 71 |
|
| 72 |
+
The environment exposes evidence gradually from six synthetic merchant systems:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
+
- `orders`
|
| 75 |
+
- `payment`
|
| 76 |
+
- `shipping`
|
| 77 |
+
- `support`
|
| 78 |
+
- `refunds`
|
| 79 |
+
- `risk`
|
| 80 |
|
| 81 |
+
Each task contains hidden ground truth about:
|
| 82 |
|
| 83 |
+
- optimal strategy per case
|
| 84 |
+
- acceptable fallback strategies
|
| 85 |
+
- required evidence
|
| 86 |
+
- helpful evidence
|
| 87 |
+
- harmful evidence
|
| 88 |
+
- deadline pressure
|
| 89 |
+
- case weight in the final score
|
| 90 |
|
| 91 |
+
### OpenEnv contract
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
+
| Method | Behavior |
|
| 94 |
+
| --- | --- |
|
| 95 |
+
| `reset(task_id=...)` | starts a fresh episode and returns the initial typed observation |
|
| 96 |
+
| `step(action)` | applies one typed action and returns the next observation with reward and done |
|
| 97 |
+
| `state()` | returns the current typed internal state |
|
| 98 |
|
| 99 |
+
Core runtime files:
|
| 100 |
|
| 101 |
+
- [`models.py`](models.py)
|
| 102 |
+
- [`server/chargeback_ops_environment.py`](server/chargeback_ops_environment.py)
|
| 103 |
+
- [`server/app.py`](server/app.py)
|
| 104 |
+
- [`openenv.yaml`](openenv.yaml)
|
| 105 |
|
| 106 |
+
## Typed Spaces
|
| 107 |
+
|
| 108 |
+
### Action space
|
| 109 |
+
|
| 110 |
+
| Action | Purpose |
|
| 111 |
+
| --- | --- |
|
| 112 |
+
| `select_case` | focus a case from the queue |
|
| 113 |
+
| `inspect_case` | reveal analyst notes for the selected case |
|
| 114 |
+
| `query_system` | pull evidence from one merchant system |
|
| 115 |
+
| `retrieve_policy` | reveal reason-code guidance and required evidence |
|
| 116 |
+
| `add_evidence` | attach retrieved evidence to the current package |
|
| 117 |
+
| `remove_evidence` | remove evidence, including harmful attachments |
|
| 118 |
+
| `set_strategy` | choose `contest`, `accept_chargeback`, or `issue_refund` |
|
| 119 |
+
| `submit_representment` | submit a contest package for a contested case |
|
| 120 |
+
| `resolve_case` | close a non-contest case with acceptance or refund |
|
| 121 |
+
|
| 122 |
+
### Observation space
|
| 123 |
|
| 124 |
+
Each observation includes:
|
| 125 |
|
| 126 |
+
- task metadata: id, title, difficulty, objective
|
| 127 |
+
- current queue with deadlines and case summaries
|
| 128 |
+
- currently selected case
|
| 129 |
+
- visible evidence and policy data
|
| 130 |
+
- available actions
|
| 131 |
+
- `steps_remaining`
|
| 132 |
+
- `progress_score`
|
| 133 |
+
- `last_action_result`
|
| 134 |
+
- optional terminal `grader_report`
|
| 135 |
|
| 136 |
+
### State space
|
| 137 |
|
| 138 |
+
The environment state exposes:
|
| 139 |
|
| 140 |
+
- current episode id and step count
|
| 141 |
+
- public queue resolution state
|
| 142 |
+
- action history
|
| 143 |
+
- latest grade estimate
|
| 144 |
+
- final grader report once complete
|
| 145 |
|
| 146 |
+
## Task Suite
|
| 147 |
|
| 148 |
+
| Task ID | Title | Difficulty | Objective |
|
| 149 |
+
| --- | --- | --- | --- |
|
| 150 |
+
| `goods_not_received_easy` | Delivered But Disputed | easy | contest a straightforward goods-not-received case with delivery proof |
|
| 151 |
+
| `fraud_signal_ambiguity` | Fraud Signal Ambiguity | medium | handle a card-not-present fraud dispute with mixed evidence and harmful artifacts |
|
| 152 |
+
| `queue_optimization_hard` | Dispute Queue Optimization | hard | maximize recovery across a multi-case queue under tight step and deadline pressure |
|
| 153 |
|
| 154 |
+
Difficulty progression is deliberate:
|
| 155 |
|
| 156 |
+
- Easy teaches the standard representment loop.
|
| 157 |
+
- Medium introduces ambiguity and evidence curation.
|
| 158 |
+
- Hard adds queue prioritization, step-budget pressure, and opportunity cost.
|
| 159 |
|
| 160 |
+
## Reward Design
|
| 161 |
|
| 162 |
+
ChargebackOps provides dense per-step feedback and a terminal bonus. The environment rewards progress and penalizes clearly poor operational behavior.
|
| 163 |
|
| 164 |
+
Positive signals include:
|
| 165 |
|
| 166 |
+
- selecting and inspecting the right case
|
| 167 |
+
- retrieving policy guidance
|
| 168 |
+
- querying systems that expose useful evidence
|
| 169 |
+
- attaching helpful or required evidence
|
| 170 |
+
- setting the optimal strategy
|
| 171 |
+
- submitting a complete representment on time
|
| 172 |
+
- resolving a case with the optimal non-contest strategy
|
| 173 |
|
| 174 |
+
Negative signals include:
|
| 175 |
|
|
|
|
| 176 |
- invalid actions
|
| 177 |
+
- duplicate system queries
|
| 178 |
- attaching harmful evidence
|
| 179 |
+
- removing helpful evidence
|
| 180 |
+
- weak strategy choices
|
| 181 |
+
- submitting incomplete or late representments
|
| 182 |
+
- missing deadlines on still-open cases
|
| 183 |
|
| 184 |
+
At episode end, the environment adds a terminal bonus proportional to the deterministic grader score.
|
| 185 |
|
| 186 |
+
## Grading
|
| 187 |
|
| 188 |
+
Each finished episode is scored in `[0.0, 1.0]` by the deterministic grader in [`grading.py`](grading.py).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
+
Per-case weighting:
|
| 191 |
|
| 192 |
+
| Component | Weight |
|
| 193 |
+
| --- | --- |
|
| 194 |
+
| strategy correctness | 0.25 |
|
| 195 |
+
| evidence quality | 0.25 |
|
| 196 |
+
| packet validity | 0.15 |
|
| 197 |
+
| deadline compliance | 0.15 |
|
| 198 |
+
| efficiency | 0.10 |
|
| 199 |
+
| outcome quality | 0.10 |
|
| 200 |
|
| 201 |
+
The hard task aggregates multiple case scores by case weight and normalizes the final result to `0.0` to `1.0`.
|
| 202 |
|
| 203 |
+
## Inference and Model Providers
|
| 204 |
|
| 205 |
+
The required root inference entry point is [`inference.py`](inference.py). It uses the OpenAI Python client with the challenge-compatible environment variables:
|
|
|
|
| 206 |
|
| 207 |
+
- `API_BASE_URL`
|
| 208 |
+
- `MODEL_NAME`
|
| 209 |
+
- `HF_TOKEN`
|
| 210 |
|
| 211 |
+
Default configuration:
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
+
- provider path: OpenRouter
|
| 214 |
+
- model: `openai/gpt-oss-120b`
|
| 215 |
|
| 216 |
+
Also supported through the same OpenAI-compatible client pattern:
|
| 217 |
|
| 218 |
+
- OpenAI
|
| 219 |
+
- Anthropic-compatible gateways
|
| 220 |
+
- Groq
|
| 221 |
+
- OpenRouter
|
| 222 |
|
| 223 |
+
The repository also keeps optional direct keys for convenience in [`.env.example`](.env.example):
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
- `OPENAI_API_KEY`
|
| 226 |
- `ANTHROPIC_API_KEY`
|
| 227 |
- `GROQ_API_KEY`
|
| 228 |
+
- `OPENROUTER_API_KEY`
|
| 229 |
|
| 230 |
+
### OpenRouter referer
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
+
Leave `OPENROUTER_HTTP_REFERER` empty during local development. Once the app is deployed, set it to the public app URL, for example:
|
| 233 |
|
| 234 |
+
```bash
|
| 235 |
+
OPENROUTER_HTTP_REFERER=https://your-space-name.hf.space
|
| 236 |
+
OPENROUTER_APP_TITLE=ChargebackOps
|
| 237 |
+
```
|
| 238 |
|
| 239 |
+
## Baseline Results
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
+
The repository includes two baseline entry points:
|
|
|
|
| 242 |
|
| 243 |
+
- [`inference.py`](inference.py) for the challenge contract
|
| 244 |
+
- [`baseline_runner.py`](baseline_runner.py) for direct local runs and the `/baseline` endpoint
|
| 245 |
|
| 246 |
+
Heuristic-fallback baseline scores, verified locally during the latest validation pass:
|
| 247 |
|
| 248 |
+
| Task | Score |
|
| 249 |
+
| --- | --- |
|
| 250 |
+
| Delivered But Disputed | `0.7075` |
|
| 251 |
+
| Fraud Signal Ambiguity | `0.7075` |
|
| 252 |
+
| Dispute Queue Optimization | `0.7271` |
|
| 253 |
+
| Average | `0.7140` |
|
| 254 |
|
| 255 |
+
These values are replaced after each validation run so the README reflects real, reproducible output from the current codebase.
|
|
|
|
|
|
|
| 256 |
|
| 257 |
+
## API Surface
|
| 258 |
|
| 259 |
+
The FastAPI app exposes:
|
| 260 |
|
| 261 |
+
- `GET /` basic service ping
|
| 262 |
+
- `GET /health` health check
|
| 263 |
+
- `GET /docs` interactive OpenAPI docs
|
| 264 |
+
- `POST /reset` start a new episode
|
| 265 |
+
- `POST /step` advance the environment
|
| 266 |
+
- `GET /state` inspect the current state
|
| 267 |
+
- `GET /tasks` enumerate tasks and the action schema
|
| 268 |
+
- `GET /grader` or `POST /grader` fetch the last completed episode grade
|
| 269 |
+
- `GET /baseline` or `POST /baseline` run the bundled baseline
|
| 270 |
|
| 271 |
+
## Local Setup
|
| 272 |
|
| 273 |
+
### 1. Install dependencies
|
|
|
|
|
|
|
| 274 |
|
| 275 |
+
Using `uv`:
|
| 276 |
|
| 277 |
```bash
|
| 278 |
+
uv sync --extra dev
|
| 279 |
```
|
| 280 |
|
| 281 |
+
Using `pip`:
|
| 282 |
|
| 283 |
```bash
|
| 284 |
+
python -m pip install -e ".[dev]"
|
| 285 |
```
|
| 286 |
|
| 287 |
+
### 2. Configure environment variables
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
|
| 289 |
```bash
|
| 290 |
+
cp .env.example .env
|
| 291 |
```
|
| 292 |
|
| 293 |
+
At minimum, configure:
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
```bash
|
| 296 |
+
API_BASE_URL=https://openrouter.ai/api/v1
|
| 297 |
+
MODEL_NAME=openai/gpt-oss-120b
|
| 298 |
+
HF_TOKEN=your_provider_key
|
| 299 |
```
|
| 300 |
|
| 301 |
+
### 3. Run the test and validation suite
|
| 302 |
|
| 303 |
```bash
|
| 304 |
+
pytest -q tests
|
| 305 |
+
openenv validate .
|
| 306 |
python inference.py
|
| 307 |
```
|
| 308 |
|
| 309 |
+
### 4. Start the server locally
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
|
| 311 |
```bash
|
| 312 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 313 |
```
|
| 314 |
|
| 315 |
+
## Docker
|
| 316 |
+
|
| 317 |
+
Build and run the root Docker image:
|
| 318 |
|
| 319 |
```bash
|
| 320 |
+
docker build -t chargebackops .
|
| 321 |
+
docker run --rm -p 8000:8000 --env-file .env chargebackops
|
| 322 |
```
|
| 323 |
|
| 324 |
+
Once the container is running:
|
|
|
|
|
|
|
| 325 |
|
| 326 |
```bash
|
| 327 |
+
curl http://localhost:8000/
|
| 328 |
+
curl http://localhost:8000/tasks
|
| 329 |
+
curl http://localhost:8000/health
|
| 330 |
```
|
| 331 |
|
| 332 |
+
## Hugging Face Spaces Deployment
|
| 333 |
|
| 334 |
+
ChargebackOps is configured as a Docker Space through the YAML frontmatter in this README.
|
| 335 |
|
| 336 |
+
Recommended deployment steps:
|
| 337 |
|
| 338 |
+
1. Create a new Hugging Face Space with `Docker` as the SDK.
|
| 339 |
+
2. Push this repository to the Space.
|
| 340 |
+
3. Add the runtime variables in Space Settings:
|
| 341 |
+
- `API_BASE_URL`
|
| 342 |
+
- `MODEL_NAME`
|
| 343 |
+
- `HF_TOKEN`
|
| 344 |
+
4. If using OpenRouter, add:
|
| 345 |
+
- `OPENROUTER_HTTP_REFERER=https://your-space-name.hf.space`
|
| 346 |
+
- `OPENROUTER_APP_TITLE=ChargebackOps`
|
| 347 |
+
5. Verify:
|
| 348 |
+
- `/`
|
| 349 |
+
- `/health`
|
| 350 |
+
- `/tasks`
|
| 351 |
+
- `/docs`
|
| 352 |
+
- `/baseline`
|
| 353 |
|
| 354 |
+
## Validation Checklist
|
|
|
|
|
|
|
|
|
|
| 355 |
|
| 356 |
+
- `pytest -q tests`
|
| 357 |
+
- `openenv validate .`
|
| 358 |
+
- `python inference.py`
|
| 359 |
+
- `docker build -t chargebackops .`
|
| 360 |
+
- `docker run --rm -p 8000:8000 --env-file .env chargebackops`
|
| 361 |
+
|
| 362 |
+
## Project Layout
|
| 363 |
|
| 364 |
```text
|
| 365 |
+
.
|
|
|
|
|
|
|
| 366 |
├── baseline_runner.py
|
| 367 |
├── client.py
|
|
|
|
| 368 |
├── grading.py
|
| 369 |
+
├── inference.py
|
| 370 |
├── models.py
|
| 371 |
├── openenv.yaml
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
├── server/
|
| 373 |
│ ├── app.py
|
| 374 |
+
│ └── chargeback_ops_environment.py
|
| 375 |
+
├── simulation.py
|
| 376 |
└── tests/
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
```
|
| 378 |
|
| 379 |
## Notes
|
| 380 |
|
| 381 |
+
- This is a synthetic benchmark environment, not a live payments integration.
|
| 382 |
+
- The world state is deterministic by design so graders remain reproducible.
|
| 383 |
+
- Live model quality still depends on the quota and reliability of the configured provider.
|
baseline_runner.py
CHANGED
|
@@ -30,11 +30,11 @@ except ImportError: # pragma: no cover
|
|
| 30 |
if load_dotenv is not None: # pragma: no cover
|
| 31 |
load_dotenv()
|
| 32 |
|
| 33 |
-
DEFAULT_PROVIDER = "
|
| 34 |
MAX_LLM_CANDIDATES = 4
|
| 35 |
MAX_PROVIDER_RESPONSE_TOKENS = 80
|
| 36 |
DEFAULT_MODELS = {
|
| 37 |
-
"openrouter": "
|
| 38 |
"groq": "llama-3.3-70b-versatile",
|
| 39 |
"openai": "gpt-5-mini",
|
| 40 |
"anthropic": "claude-3-5-haiku-latest",
|
|
@@ -567,7 +567,7 @@ def _resolve_provider(
|
|
| 567 |
chosen_provider = (provider or os.getenv("BASELINE_PROVIDER") or DEFAULT_PROVIDER).lower()
|
| 568 |
chosen_model = model_name or os.getenv("BASELINE_MODEL") or DEFAULT_MODELS.get(
|
| 569 |
chosen_provider,
|
| 570 |
-
"
|
| 571 |
)
|
| 572 |
return ProviderConfig(provider=chosen_provider, model_name=chosen_model)
|
| 573 |
|
|
|
|
| 30 |
if load_dotenv is not None: # pragma: no cover
|
| 31 |
load_dotenv()
|
| 32 |
|
| 33 |
+
DEFAULT_PROVIDER = "openrouter"
|
| 34 |
MAX_LLM_CANDIDATES = 4
|
| 35 |
MAX_PROVIDER_RESPONSE_TOKENS = 80
|
| 36 |
DEFAULT_MODELS = {
|
| 37 |
+
"openrouter": "openai/gpt-oss-120b",
|
| 38 |
"groq": "llama-3.3-70b-versatile",
|
| 39 |
"openai": "gpt-5-mini",
|
| 40 |
"anthropic": "claude-3-5-haiku-latest",
|
|
|
|
| 567 |
chosen_provider = (provider or os.getenv("BASELINE_PROVIDER") or DEFAULT_PROVIDER).lower()
|
| 568 |
chosen_model = model_name or os.getenv("BASELINE_MODEL") or DEFAULT_MODELS.get(
|
| 569 |
chosen_provider,
|
| 570 |
+
"openai/gpt-oss-120b",
|
| 571 |
)
|
| 572 |
return ProviderConfig(provider=chosen_provider, model_name=chosen_model)
|
| 573 |
|
scripts/live_provider_audit.py
DELETED
|
@@ -1,34 +0,0 @@
|
|
| 1 |
-
"""Live-provider audit for ChargebackOps."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import json
|
| 6 |
-
import os
|
| 7 |
-
import sys
|
| 8 |
-
from pathlib import Path
|
| 9 |
-
|
| 10 |
-
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 11 |
-
if str(PROJECT_ROOT) not in sys.path:
|
| 12 |
-
sys.path.insert(0, str(PROJECT_ROOT))
|
| 13 |
-
|
| 14 |
-
from baseline_runner import run_baseline
|
| 15 |
-
from inference import run_inference
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def main() -> None:
|
| 19 |
-
report = {
|
| 20 |
-
"config": {
|
| 21 |
-
"baseline_provider": os.getenv("BASELINE_PROVIDER"),
|
| 22 |
-
"baseline_model": os.getenv("BASELINE_MODEL"),
|
| 23 |
-
"api_base_url": os.getenv("API_BASE_URL"),
|
| 24 |
-
"model_name": os.getenv("MODEL_NAME"),
|
| 25 |
-
"strict_llm_mode": os.getenv("STRICT_LLM_MODE", ""),
|
| 26 |
-
},
|
| 27 |
-
"baseline": run_baseline().model_dump(),
|
| 28 |
-
"inference": run_inference().model_dump(),
|
| 29 |
-
}
|
| 30 |
-
print(json.dumps(report, indent=2))
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
if __name__ == "__main__":
|
| 34 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/problem_statement_audit.py
DELETED
|
@@ -1,268 +0,0 @@
|
|
| 1 |
-
"""Requirement-focused audit for the ChargebackOps submission."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import json
|
| 6 |
-
import os
|
| 7 |
-
import shutil
|
| 8 |
-
import subprocess
|
| 9 |
-
import sys
|
| 10 |
-
from contextlib import contextmanager
|
| 11 |
-
from pathlib import Path
|
| 12 |
-
|
| 13 |
-
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 14 |
-
if str(PROJECT_ROOT) not in sys.path:
|
| 15 |
-
sys.path.insert(0, str(PROJECT_ROOT))
|
| 16 |
-
|
| 17 |
-
from baseline_runner import _heuristic_pick, candidate_actions
|
| 18 |
-
from grading import grade_episode
|
| 19 |
-
from inference import run_inference
|
| 20 |
-
from models import ChargebackOpsAction
|
| 21 |
-
from server.app import baseline, tasks
|
| 22 |
-
from server.chargeback_ops_environment import ChargebackOpsEnvironment
|
| 23 |
-
from simulation import get_task, list_tasks
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def _run_heuristic_episode(task_id: str) -> dict[str, float]:
|
| 27 |
-
env = ChargebackOpsEnvironment()
|
| 28 |
-
observation = env.reset(task_id=task_id)
|
| 29 |
-
total_reward = 0.0
|
| 30 |
-
while not observation.done:
|
| 31 |
-
candidates = candidate_actions(observation.model_dump())
|
| 32 |
-
observation = env.step(_heuristic_pick(candidates).action)
|
| 33 |
-
total_reward += observation.reward or 0.0
|
| 34 |
-
assert observation.grader_report is not None
|
| 35 |
-
return {
|
| 36 |
-
"reward": round(total_reward, 4),
|
| 37 |
-
"score": observation.grader_report.normalized_score,
|
| 38 |
-
}
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
def _run_bad_episode(task_id: str) -> dict[str, float]:
    """Play one full episode of ``task_id`` with a deliberately bad policy.

    Policy, evaluated on every step:

    * no case selected -> select the first case in the queue whose status
      is ``"open"``;
    * selected case visible with no strategy -> set ``accept_chargeback``;
    * selected case visible with ``accept_chargeback`` -> resolve it with
      that same strategy;
    * anything else -> query the ``payment`` system.

    Returns:
        A dict with ``"reward"`` (sum of per-step rewards, rounded to four
        decimal places) and ``"score"`` (the grader's normalized score).
    """
    conceded = "accept_chargeback"
    environment = ChargebackOpsEnvironment()
    obs = environment.reset(task_id=task_id)
    reward_sum = 0.0

    while not obs.done:
        active_case_id = obs.selected_case_id
        if active_case_id is None:
            first_open = next(entry for entry in obs.queue if entry.status == "open")
            fields = {"action_type": "select_case", "case_id": first_open.case_id}
        else:
            case_view = obs.visible_case
            fields = {"case_id": active_case_id}
            if not case_view:
                fields.update(action_type="query_system", system_name="payment")
            elif case_view.current_strategy is None:
                fields.update(action_type="set_strategy", strategy=conceded)
            elif case_view.current_strategy == conceded:
                fields.update(action_type="resolve_case", strategy=conceded)
            else:
                fields.update(action_type="query_system", system_name="payment")

        obs = environment.step(ChargebackOpsAction(**fields))
        # A missing (None) step reward counts as zero.
        reward_sum += obs.reward or 0.0

    final_report = obs.grader_report
    assert final_report is not None
    summary = {
        "reward": round(reward_sum, 4),
        "score": final_report.normalized_score,
    }
    return summary
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def _check(condition: bool, message: str, details: object | None = None) -> dict[str, object]:
|
| 80 |
-
return {
|
| 81 |
-
"pass": condition,
|
| 82 |
-
"message": message,
|
| 83 |
-
"details": details,
|
| 84 |
-
}
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
@contextmanager
|
| 88 |
-
def _deterministic_provider_disabled():
|
| 89 |
-
keys = [
|
| 90 |
-
"HF_TOKEN",
|
| 91 |
-
"API_BASE_URL",
|
| 92 |
-
"MODEL_NAME",
|
| 93 |
-
"OPENROUTER_API_KEY",
|
| 94 |
-
"OPENAI_API_KEY",
|
| 95 |
-
"ANTHROPIC_API_KEY",
|
| 96 |
-
"GROQ_API_KEY",
|
| 97 |
-
"STRICT_LLM_MODE",
|
| 98 |
-
]
|
| 99 |
-
previous = {key: os.environ.get(key) for key in keys}
|
| 100 |
-
try:
|
| 101 |
-
for key in keys:
|
| 102 |
-
os.environ.pop(key, None)
|
| 103 |
-
yield
|
| 104 |
-
finally:
|
| 105 |
-
for key, value in previous.items():
|
| 106 |
-
if value is None:
|
| 107 |
-
os.environ.pop(key, None)
|
| 108 |
-
else:
|
| 109 |
-
os.environ[key] = value
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
def main() -> None:
    """Run every submission-requirement check and print a JSON report.

    The report maps a check name to the dict produced by ``_check`` and
    gains a final ``"all_passed"`` boolean. It is written to stdout with
    two-space indentation.

    A missing ``openenv`` executable is reported as a failed
    ``openenv_validate`` check instead of aborting the audit with
    ``FileNotFoundError``.
    """
    tasks_payload = tasks()
    task_list = list_tasks()

    # Run `openenv validate .` from the project root. shutil.which returns
    # None when the CLI is not installed; record that as a failed check
    # rather than letting subprocess.run raise.
    openenv_cli = shutil.which("openenv")
    if openenv_cli is None:
        openenv_passed = False
        openenv_output = "openenv CLI not found on PATH."
    else:
        openenv_validate = subprocess.run(
            [openenv_cli, "validate", "."],
            cwd=PROJECT_ROOT,
            capture_output=True,
            text=True,
        )
        openenv_passed = openenv_validate.returncode == 0
        openenv_output = openenv_validate.stdout.strip() or openenv_validate.stderr.strip()

    files = {
        "Dockerfile": (PROJECT_ROOT / "Dockerfile").exists(),
        "README.md": (PROJECT_ROOT / "README.md").exists(),
        "openenv.yaml": (PROJECT_ROOT / "openenv.yaml").exists(),
        "inference.py": (PROJECT_ROOT / "inference.py").exists(),
    }

    # Heuristic policy versus the bad control policy on the same task.
    heuristic_hard = _run_heuristic_episode("queue_optimization_hard")
    bad_hard = _run_bad_episode("queue_optimization_hard")

    # Reset check: start one task, take a step, then reset into another task.
    env = ChargebackOpsEnvironment()
    reset_obs = env.reset(task_id="goods_not_received_easy")
    initial_episode = env.state.episode_id
    env.step(ChargebackOpsAction(action_type="select_case", case_id="CB-E1"))
    reset_obs_2 = env.reset(task_id="fraud_signal_ambiguity")

    # Reward-signal check: a first query, the same query repeated, and an
    # add_evidence action; the report expects the rewards to be positive,
    # negative and negative respectively.
    env_reward = ChargebackOpsEnvironment()
    env_reward.reset(task_id="fraud_signal_ambiguity")
    env_reward.step(ChargebackOpsAction(action_type="select_case", case_id="CB-M1"))
    helpful = env_reward.step(
        ChargebackOpsAction(action_type="query_system", case_id="CB-M1", system_name="orders")
    )
    duplicate = env_reward.step(
        ChargebackOpsAction(action_type="query_system", case_id="CB-M1", system_name="orders")
    )
    harmful = env_reward.step(
        ChargebackOpsAction(
            action_type="add_evidence",
            case_id="CB-M1",
            evidence_ids=["M1-AVS-MISMATCH"],
        )
    )

    # Determinism check: grade the same freshly reset state twice.
    task = get_task("queue_optimization_hard")
    env_grader = ChargebackOpsEnvironment()
    env_grader.reset(task_id="queue_optimization_hard")
    grader_a = grade_episode(
        task,
        env_grader._progress_by_case,  # type: ignore[attr-defined]
        env_grader.state.step_count,
        env_grader.state.episode_id or "",
        completed=False,
    )
    grader_b = grade_episode(
        task,
        env_grader._progress_by_case,  # type: ignore[attr-defined]
        env_grader.state.step_count,
        env_grader.state.episode_id or "",
        completed=False,
    )

    # Run both entry points with the provider environment variables removed.
    with _deterministic_provider_disabled():
        baseline_payload = baseline()
        inference_payload = run_inference()
    # Explicit encoding so the substring checks do not depend on the locale.
    source = (PROJECT_ROOT / "inference.py").read_text(encoding="utf-8")

    report = {
        "task_catalog": _check(
            len(task_list) >= 3 and {task.difficulty for task in task_list} == {"easy", "medium", "hard"},
            "Environment exposes easy, medium, and hard tasks.",
            [task.task_id for task in task_list],
        ),
        "grader_range": _check(
            all(0.0 <= result.score <= 1.0 for result in baseline_payload.task_results),
            "Grader returns scores in [0.0, 1.0] for all baseline tasks.",
            [result.score for result in baseline_payload.task_results],
        ),
        "grader_determinism": _check(
            grader_a.model_dump() == grader_b.model_dump(),
            "Grader is deterministic on identical state.",
            {"score": grader_a.normalized_score},
        ),
        "reward_signal": _check(
            (helpful.reward or 0.0) > 0 and (duplicate.reward or 0.0) < 0 and (harmful.reward or 0.0) < 0,
            "Reward provides partial progress and penalty signals.",
            {
                "helpful_reward": helpful.reward,
                "duplicate_reward": duplicate.reward,
                "harmful_reward": harmful.reward,
            },
        ),
        "agent_separation": _check(
            heuristic_hard["score"] > bad_hard["score"] and heuristic_hard["reward"] > bad_hard["reward"],
            "A competent policy scores better than a bad control policy on the hard task.",
            {"heuristic": heuristic_hard, "bad": bad_hard},
        ),
        "reset_state": _check(
            reset_obs.done is False
            and reset_obs_2.task_id == "fraud_signal_ambiguity"
            and env.state.step_count == 0
            and env.state.action_history == []
            and env.state.episode_id != initial_episode,
            "reset() produces a clean episode state.",
            {
                "first_task": reset_obs.task_id,
                "second_task": reset_obs_2.task_id,
                "step_count": env.state.step_count,
            },
        ),
        "tasks_endpoint": _check(
            len(tasks_payload.tasks) >= 3 and "properties" in tasks_payload.action_schema,
            "/tasks exposes task metadata and a typed action schema.",
            {"task_count": len(tasks_payload.tasks)},
        ),
        "inference_contract": _check(
            all(token in source for token in ["from openai import OpenAI", "API_BASE_URL", "MODEL_NAME", "HF_TOKEN"]),
            "inference.py uses the OpenAI client with the required environment variables.",
            None,
        ),
        "openenv_validate": _check(
            openenv_passed,
            "openenv validate passes.",
            openenv_output,
        ),
        "baseline_runs": _check(
            len(baseline_payload.task_results) == 3,
            "Baseline endpoint runs across all tasks.",
            {
                "mode": baseline_payload.mode,
                "provider_calls_attempted": baseline_payload.provider_calls_attempted,
                "provider_calls_succeeded": baseline_payload.provider_calls_succeeded,
            },
        ),
        "inference_runs": _check(
            len(inference_payload.task_results) == 3,
            "inference.py runs across all tasks.",
            {
                "mode": inference_payload.mode,
                "provider_calls_attempted": inference_payload.provider_calls_attempted,
                "provider_calls_succeeded": inference_payload.provider_calls_succeeded,
            },
        ),
        "required_files": _check(
            all(files.values()),
            "Submission-critical files exist.",
            files,
        ),
    }

    # The right-hand side is evaluated before the key is added, so only the
    # per-check entries contribute to the aggregate.
    report["all_passed"] = all(item["pass"] for item in report.values())
    print(json.dumps(report, indent=2))
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
if __name__ == "__main__":
|
| 268 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/run_baseline.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
"""CLI wrapper for the ChargebackOps baseline."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import sys
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
|
| 8 |
-
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 9 |
-
if str(PROJECT_ROOT) not in sys.path:
|
| 10 |
-
sys.path.insert(0, str(PROJECT_ROOT))
|
| 11 |
-
|
| 12 |
-
from baseline_runner import main
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
if __name__ == "__main__":
|
| 16 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|