Spaces:

mitudrudutta
/

ChargeBackOps

Sleeping

App Files Files Community

mitudrudutta committed on Mar 27

Commit

fe45227

1 Parent(s): 379f291

docs: professionalize README and trim repo utilities

Browse files

Files changed (6) hide show

.env.example +9 -9
README.md +254 -209
baseline_runner.py +3 -3
scripts/live_provider_audit.py +0 -34
scripts/problem_statement_audit.py +0 -268
scripts/run_baseline.py +0 -16

.env.example CHANGED Viewed

@@ -1,6 +1,6 @@
 # Baseline provider selection
-BASELINE_PROVIDER=groq
-BASELINE_MODEL=llama-3.3-70b-versatile
 BASELINE_REQUEST_TIMEOUT_SECONDS=4
 PROVIDER_RATE_LIMIT_RETRIES=0
 PROVIDER_RETRY_BACKOFF_SECONDS=0.5
@@ -9,16 +9,16 @@ STRICT_LLM_MODE=0
 # Challenge-compatible inference contract
 # `HF_TOKEN` is the generic API key passed to the OpenAI client for the selected base URL.
 # For OpenRouter, set it to your OpenRouter key. For Groq, set it to your Groq key.
-API_BASE_URL=https://api.groq.com/openai/v1
-MODEL_NAME=llama-3.3-70b-versatile
 HF_TOKEN=
 INFERENCE_TIMEOUT_SECONDS=4
-# Alternative OpenRouter free path:
-# BASELINE_PROVIDER=openrouter
-# BASELINE_MODEL=nvidia/nemotron-3-super-120b-a12b:free
-# API_BASE_URL=https://openrouter.ai/api/v1
-# MODEL_NAME=nvidia/nemotron-3-super-120b-a12b:free
 # OpenRouter setup
 OPENROUTER_API_KEY=

 # Baseline provider selection
+BASELINE_PROVIDER=openrouter
+BASELINE_MODEL=openai/gpt-oss-120b
 BASELINE_REQUEST_TIMEOUT_SECONDS=4
 PROVIDER_RATE_LIMIT_RETRIES=0
 PROVIDER_RETRY_BACKOFF_SECONDS=0.5
 # Challenge-compatible inference contract
 # `HF_TOKEN` is the generic API key passed to the OpenAI client for the selected base URL.
 # For OpenRouter, set it to your OpenRouter key. For Groq, set it to your Groq key.
+API_BASE_URL=https://openrouter.ai/api/v1
+MODEL_NAME=openai/gpt-oss-120b
 HF_TOKEN=
 INFERENCE_TIMEOUT_SECONDS=4
+# Alternative Groq path:
+# BASELINE_PROVIDER=groq
+# BASELINE_MODEL=llama-3.3-70b-versatile
+# API_BASE_URL=https://api.groq.com/openai/v1
+# MODEL_NAME=llama-3.3-70b-versatile
 # OpenRouter setup
 OPENROUTER_API_KEY=

README.md CHANGED Viewed

@@ -1,8 +1,5 @@
 ---
 title: ChargebackOps
-emoji: 💳
-colorFrom: blue
-colorTo: gray
 sdk: docker
 app_port: 8000
 tags:
@@ -11,328 +8,376 @@ tags:
 # ChargebackOps
-ChargebackOps is a real-world OpenEnv environment for merchant-side dispute operations. An agent acts like a chargeback analyst: it reviews incoming disputes, decides whether to contest or concede, gathers evidence from synthetic merchant systems, and resolves cases under deadline pressure.
-This is not a toy environment and not a retrieval demo. The hard task is a portfolio-optimization problem over a live queue of disputes with different amounts, deadlines, and win profiles.
-## Why this environment exists
-Chargeback operations are painful, repetitive, and economically important. Real analysts do not just fill forms. They:
-- triage cases by deadline and recovery value
-- inspect reason-code policy
-- gather evidence across internal systems
-- avoid harmful or contradictory attachments
 - choose whether to contest, accept, or refund
-- close cases before network deadlines
-That maps cleanly to the standard `reset()` / `step()` / `state()` API and produces deterministic grading.
-## Environment design
-ChargebackOps simulates a merchant operations stack with fully synthetic data:
-- order management
-- payment gateway ledger
-- shipping and delivery records
-- customer support transcripts
-- refund ledger
-- fraud and device-risk summaries
-- dispute policy guidance by reason code
-The agent never gets hidden truth directly. It must reveal systems, curate evidence, and resolve cases using typed actions.
-## Action space
-The action model is [`ChargebackOpsAction`](./models.py) and includes:
-- `select_case`: choose which dispute to work on
-- `inspect_case`: reveal merchant-side notes for the selected case
-- `query_system`: inspect one of `orders`, `payment`, `shipping`, `support`, `refunds`, or `risk`
-- `retrieve_policy`: load reason-code guidance and required evidence hints
-- `add_evidence`: attach one or more revealed evidence items
-- `remove_evidence`: remove attached evidence
-- `set_strategy`: set `contest`, `accept_chargeback`, or `issue_refund`
-- `submit_representment`: submit the contest package
-- `resolve_case`: resolve a case via `accept_chargeback` or `issue_refund`
-The full schema is available at `GET /tasks`.
-## Observation space
-The observation model is [`ChargebackOpsObservation`](./models.py). Each step returns:
-- task id, title, objective, and difficulty
-- current queue with case amount, reason code, status, and deadline countdown
-- selected case workspace
-- revealed evidence snippets
-- attached evidence
-- visible policy guidance
-- available actions
-- steps remaining
-- dense reward in `reward`
-- reward breakdown in `metadata.reward_components`
-- final grader report when the episode is done
-`state()` returns the extended [`ChargebackOpsState`](./models.py) with queue state, action history, and the latest grading report.
-## Tasks
-ChargebackOps ships with three deterministic tasks.
-### 1. Delivered But Disputed
-- Difficulty: `easy`
-- Goal: contest a `goods_not_received` dispute
-- What matters: order confirmation + carrier delivery evidence + submitting before deadline
-### 2. Fraud Signal Ambiguity
-- Difficulty: `medium`
-- Goal: handle a `fraud_cnp` case with both supportive and harmful signals
-- What matters: using account-linkage evidence while avoiding AVS/CVV mismatch artifacts
-### 3. Dispute Queue Optimization
-- Difficulty: `hard`
-- Goal: maximize recovery across three simultaneous disputes
-- What matters: prioritization, avoiding weak contests, and not missing short deadlines
-## Reward shaping
-ChargebackOps uses dense trajectory rewards, not a final binary score only.
-Positive signals:
-- selecting a live case
-- revealing a useful system
-- attaching helpful evidence
-- setting the right strategy
-- submitting a valid representment
-- resolving a case correctly before deadline
-Negative signals:
-- duplicate or redundant queries
 - invalid actions
 - attaching harmful evidence
-- late submissions
-- contesting unwinnable cases
-- leaving cases unresolved when the step budget expires
-## Deterministic grading
-Each episode ends with a programmatic grader report. Per-case scoring combines:
-- strategy correctness
-- evidence quality
-- packet validity
-- deadline compliance
-- efficiency
-- outcome quality
-Scores are normalized to `0.0` to `1.0` and exposed through:
-- the final observation
-- `state()`
-- `GET /grader`
-## Baseline providers
-The baseline runner now defaults to the more reliable free live path:
-- default provider: `groq`
-- default model: `llama-3.3-70b-versatile`
-The repository also keeps provider integrations for:
-- OpenAI
-- Anthropic
-- Groq
-- OpenRouter (`nvidia/nemotron-3-super-120b-a12b:free`)
-If no provider key is available, the runner falls back to a deterministic heuristic policy so the project can still be validated locally.
-The runner also fast-paths obvious housekeeping actions so live provider calls are spent on genuine branching decisions instead of deterministic retrieval/attach/submit steps.
-### Supported environment variables
-See [`.env.example`](./.env.example).
-Key variables:
-- `BASELINE_PROVIDER`
-- `BASELINE_MODEL`
-- `BASELINE_REQUEST_TIMEOUT_SECONDS`
-- `PROVIDER_RATE_LIMIT_RETRIES`
-- `PROVIDER_RETRY_BACKOFF_SECONDS`
-- `STRICT_LLM_MODE`
-- `API_BASE_URL`
-- `MODEL_NAME`
-- `HF_TOKEN`
-- `INFERENCE_TIMEOUT_SECONDS`
-- `OPENROUTER_API_KEY`
-- `OPENROUTER_HTTP_REFERER`
-- `OPENROUTER_APP_TITLE`
 - `OPENAI_API_KEY`
 - `ANTHROPIC_API_KEY`
 - `GROQ_API_KEY`
-For `OPENROUTER_HTTP_REFERER`, use the public URL of the deployed app after it exists, such as your Hugging Face Space URL (`https://your-space-name.hf.space`). If nothing is deployed yet, leave it unset. It is optional and only used for OpenRouter app attribution.
-`HF_TOKEN` is the generic API key passed to the OpenAI client for the selected `API_BASE_URL`. For OpenRouter, put your OpenRouter key there. For Groq, point `API_BASE_URL` to `https://api.groq.com/openai/v1`, set `MODEL_NAME=llama-3.3-70b-versatile`, and put your Groq key in `HF_TOKEN`.
-`PROVIDER_RATE_LIMIT_RETRIES` and `PROVIDER_RETRY_BACKOFF_SECONDS` control bounded retry behavior for transient provider rate limits and timeouts. The default `.env.example` keeps these low on purpose so `inference.py` stays within hackathon runtime expectations.
-Set `STRICT_LLM_MODE=1` when you want evaluation to fail immediately on any provider fallback instead of silently dropping to the heuristic policy.
-## Baseline scores
-Local deterministic fallback baseline:
-- `goods_not_received_easy`: `0.7075`
-- `fraud_signal_ambiguity`: `0.7075`
-- `queue_optimization_hard`: `0.7271`
-- average: `0.7140`
-When provider credentials are present, the same script and `/baseline` endpoint use the configured LLM provider.
-The payload includes `provider_calls_attempted`, `provider_calls_succeeded`, and `provider_errors` so rate-limited free-model runs do not masquerade as successful live inference. If every provider request falls back locally, `mode` is reported as `heuristic_fallback`.
-## API surface
-OpenEnv endpoints are exposed by the generated server scaffold.
-Custom endpoints added by this project:
-- `GET /tasks`: list tasks and the action schema
-- `GET /grader`: latest grade report, or `?episode_id=<id>` for a specific episode
-- `GET /baseline`: run the baseline with optional `provider` and `model_name`
-## Local setup
-### 1. Install dependencies
-```bash
-pip install -e .[dev]
-```
-### 2. Run the server
-```bash
-uvicorn server.app:app --host 0.0.0.0 --port 8000
-```
-### 3. Run tests
 ```bash
-pytest -q tests
 ```
-### 3a. Run the problem-statement audit
 ```bash
-python scripts/problem_statement_audit.py
 ```
-This audit checks the environment against the challenge brief:
-- easy / medium / hard task coverage
-- deterministic grader behavior
-- partial-progress reward shaping
-- separation between a competent policy and a bad control policy
-- `inference.py` contract
-- `openenv validate`
-- baseline and inference execution
-This audit disables live provider keys on purpose so it stays deterministic and fast.
-### 3b. Run the live-provider audit
 ```bash
-python scripts/live_provider_audit.py
 ```
-Use this when you want to see whether the configured provider is actually making decisions live, how many provider calls succeeded, and whether fallback was used.
-The output also includes `provider_errors` so you can distinguish rate limits from connectivity or response-format failures.
-### 4. Run the baseline
 ```bash
-python scripts/run_baseline.py
 ```
-### 5. Run the submission inference script
 ```bash
 python inference.py
 ```
-This script uses the challenge-style environment variables:
-- `API_BASE_URL`
-- `MODEL_NAME`
-- `HF_TOKEN`
-To use a provider-backed baseline:
 ```bash
-BASELINE_PROVIDER=groq BASELINE_MODEL=llama-3.3-70b-versatile python scripts/run_baseline.py
 ```
-To force the OpenRouter free path:
 ```bash
-BASELINE_PROVIDER=openrouter BASELINE_MODEL=nvidia/nemotron-3-super-120b-a12b:free python scripts/run_baseline.py
 ```
-## Docker
-Build from the project root:
 ```bash
-docker build -t chargebackops .
-docker run -p 8000:8000 chargebackops
 ```
-The repository also includes the OpenEnv scaffold Dockerfile at [`server/Dockerfile`](./server/Dockerfile).
-## Hugging Face Spaces
-This repository is ready for a Docker-based Hugging Face Space.
-Typical workflow:
-```bash
-openenv validate .
-openenv push
-```
-## File layout
 ```text
-chargeback_ops/
-├── .env.example
-├── README.md
 ├── baseline_runner.py
 ├── client.py
-├── episode_store.py
 ├── grading.py
 ├── models.py
 ├── openenv.yaml
-├── pyproject.toml
-├── simulation.py
-├── scripts/
-│   └── run_baseline.py
 ├── server/
 │   ├── app.py
-│   ├── chargeback_ops_environment.py
-│   └── Dockerfile
 └── tests/
-    ├── conftest.py
-    ├── test_api.py
-    ├── test_env.py
-    └── test_grader.py
 ```
 ## Notes
-- All cases and merchant data are synthetic.
-- The environment idea remains fixed: merchant chargeback representment and dispute handling.
-- The provider layer is configurable, but the benchmark logic and task design are deterministic.

 ---
 title: ChargebackOps
 sdk: docker
 app_port: 8000
 tags:
 # ChargebackOps
+ChargebackOps is a real-world OpenEnv environment for merchant-side chargeback operations. An agent acts as a dispute analyst, works a queue of payment disputes, investigates evidence across synthetic internal systems, chooses whether to contest or concede, and is graded on recovery quality, deadline handling, and operational discipline.
+The environment is designed for the Round 1 OpenEnv problem statement:
+- Real-world task, not a game or toy
+- Typed OpenEnv models and `reset()` / `step()` / `state()` support
+- Three graded tasks with easy, medium, and hard difficulty
+- Dense reward shaping with partial progress and negative signals
+- Root-level `inference.py` that uses the OpenAI client contract
+- Docker and Hugging Face Spaces deployment path
+## Why This Environment Matters
+Merchant dispute handling is a real operations workflow. Analysts do not just classify a ticket or answer a question. They must:
+- inspect the dispute reason code and the response deadline
+- gather evidence from the right internal systems
+- avoid attaching evidence that weakens the case
 - choose whether to contest, accept, or refund
+- maximize recovery across a queue under limited time
+That makes ChargebackOps a strong benchmark for tool-using agents. It tests retrieval, decision-making, prioritization, and operational restraint in a controlled environment with deterministic scoring.
+## System Architecture
+```mermaid
+flowchart LR
+    A["Agent or inference.py"] --> B["OpenAI-compatible client<br/>API_BASE_URL + MODEL_NAME + HF_TOKEN"]
+    A --> C["ChargebackOps HTTP API"]
+    C --> D["OpenEnv server<br/>server.app"]
+    D --> E["ChargebackOpsEnvironment<br/>step / reset / state"]
+    E --> F["Task simulator<br/>simulation.py"]
+    E --> G["Dense reward shaping<br/>server/chargeback_ops_environment.py"]
+    E --> H["Deterministic grader<br/>grading.py"]
+    H --> I["Episode report store<br/>episode_store.py"]
+    D --> J["Utility routes<br/>/tasks /grader /baseline /health"]
+```
+## Episode Workflow
+```mermaid
+flowchart TD
+    A["reset(task_id)"] --> B["Select the next case from the queue"]
+    B --> C["Inspect case metadata"]
+    C --> D["Retrieve policy guidance"]
+    D --> E["Query merchant systems<br/>orders, payment, shipping, support, refunds, risk"]
+    E --> F["Attach or remove evidence"]
+    F --> G["Set strategy"]
+    G --> H{"contest?"}
+    H -->|yes| I["submit_representment"]
+    H -->|no| J["resolve_case<br/>accept_chargeback or issue_refund"]
+    I --> K{"all cases resolved or max steps reached?"}
+    J --> K
+    K -->|no| B
+    K -->|yes| L["grader computes final score 0.0 to 1.0"]
+```
+## Environment Design
+### Internal systems
+The environment exposes evidence gradually from six synthetic merchant systems:
+- `orders`
+- `payment`
+- `shipping`
+- `support`
+- `refunds`
+- `risk`
+Each task contains hidden ground truth about:
+- optimal strategy per case
+- acceptable fallback strategies
+- required evidence
+- helpful evidence
+- harmful evidence
+- deadline pressure
+- case weight in the final score
+### OpenEnv contract
+| Method | Behavior |
+| --- | --- |
+| `reset(task_id=...)` | starts a fresh episode and returns the initial typed observation |
+| `step(action)` | applies one typed action and returns the next observation with reward and done |
+| `state()` | returns the current typed internal state |
+Core runtime files:
+- [`models.py`](./models.py)
+- [`server/chargeback_ops_environment.py`](./server/chargeback_ops_environment.py)
+- [`server/app.py`](./server/app.py)
+- [`openenv.yaml`](./openenv.yaml)
+## Typed Spaces
+### Action space
+| Action | Purpose |
+| --- | --- |
+| `select_case` | focus a case from the queue |
+| `inspect_case` | reveal analyst notes for the selected case |
+| `query_system` | pull evidence from one merchant system |
+| `retrieve_policy` | reveal reason-code guidance and required evidence |
+| `add_evidence` | attach retrieved evidence to the current package |
+| `remove_evidence` | remove evidence, including harmful attachments |
+| `set_strategy` | choose `contest`, `accept_chargeback`, or `issue_refund` |
+| `submit_representment` | submit a contest package for a contested case |
+| `resolve_case` | close a non-contest case with acceptance or refund |
+### Observation space
+Each observation includes:
+- task metadata: id, title, difficulty, objective
+- current queue with deadlines and case summaries
+- currently selected case
+- visible evidence and policy data
+- available actions
+- `steps_remaining`
+- `progress_score`
+- `last_action_result`
+- optional terminal `grader_report`
+### State space
+The environment state exposes:
+- current episode id and step count
+- public queue resolution state
+- action history
+- latest grade estimate
+- final grader report once complete
+## Task Suite
+| Task ID | Title | Difficulty | Objective |
+| --- | --- | --- | --- |
+| `goods_not_received_easy` | Delivered But Disputed | easy | contest a straightforward goods-not-received case with delivery proof |
+| `fraud_signal_ambiguity` | Fraud Signal Ambiguity | medium | handle a card-not-present fraud dispute with mixed evidence and harmful artifacts |
+| `queue_optimization_hard` | Dispute Queue Optimization | hard | maximize recovery across a multi-case queue under tight step and deadline pressure |
+Difficulty progression is deliberate:
+- Easy teaches the standard representment loop.
+- Medium introduces ambiguity and evidence curation.
+- Hard adds queue prioritization, step-budget pressure, and opportunity cost.
+## Reward Design
+ChargebackOps provides dense per-step feedback and a terminal bonus. The environment rewards progress and penalizes obviously bad operations behavior.
+Positive signals include:
+- selecting and inspecting the right case
+- retrieving policy guidance
+- querying systems that expose useful evidence
+- attaching helpful or required evidence
+- setting the optimal strategy
+- submitting a complete representment on time
+- resolving a case with the optimal non-contest strategy
+Negative signals include:
 - invalid actions
+- duplicate system queries
 - attaching harmful evidence
+- removing helpful evidence
+- weak strategy choices
+- submitting incomplete or late representments
+- missing deadlines on still-open cases
+At episode end, the environment adds a terminal bonus proportional to the deterministic grader score.
+## Grading
+Each finished episode is scored in `[0.0, 1.0]` by the deterministic grader in [`grading.py`](./grading.py).
+Per-case weighting:
+| Component | Weight |
+| --- | --- |
+| strategy correctness | 0.25 |
+| evidence quality | 0.25 |
+| packet validity | 0.15 |
+| deadline compliance | 0.15 |
+| efficiency | 0.10 |
+| outcome quality | 0.10 |
+The hard task aggregates multiple case scores by case weight and normalizes the final result to `0.0` to `1.0`.
+## Inference and Model Providers
+The required root inference entry point is [`inference.py`](./inference.py). It uses the OpenAI Python client with the challenge-compatible environment variables:
+- `API_BASE_URL`
+- `MODEL_NAME`
+- `HF_TOKEN`
+Default configuration:
+- provider path: OpenRouter
+- model: `openai/gpt-oss-120b`
+Also supported through the same OpenAI-compatible client pattern:
+- OpenAI
+- Anthropic-compatible gateways
+- Groq
+- OpenRouter
+The repository also keeps optional direct keys for convenience in [`.env.example`](./.env.example):
 - `OPENAI_API_KEY`
 - `ANTHROPIC_API_KEY`
 - `GROQ_API_KEY`
+- `OPENROUTER_API_KEY`
+### OpenRouter referer
+Leave `OPENROUTER_HTTP_REFERER` empty during local development. Once the app is deployed, set it to the public app URL, for example:
+```bash
+OPENROUTER_HTTP_REFERER=https://your-space-name.hf.space
+OPENROUTER_APP_TITLE=ChargebackOps
+```
+## Baseline Results
+The repository includes two baseline entry points:
+- [`inference.py`](./inference.py) for the challenge contract
+- [`baseline_runner.py`](./baseline_runner.py) for direct local runs and the `/baseline` endpoint
+The local heuristic-fallback baseline scores from the latest validation pass are:
+| Task | Score |
+| --- | --- |
+| Delivered But Disputed | `0.7075` |
+| Fraud Signal Ambiguity | `0.7075` |
+| Dispute Queue Optimization | `0.7271` |
+| Average | `0.7140` |
+These values are replaced after each validation run so the README reflects real, reproducible output from the current codebase.
+## API Surface
+The FastAPI app exposes:
+- `GET /` basic service ping
+- `GET /health` health check
+- `GET /docs` interactive OpenAPI docs
+- `POST /reset` start a new episode
+- `POST /step` advance the environment
+- `GET /state` inspect the current state
+- `GET /tasks` enumerate tasks and the action schema
+- `GET /grader` or `POST /grader` fetch the last completed episode grade
+- `GET /baseline` or `POST /baseline` run the bundled baseline
+## Local Setup
+### 1. Install dependencies
+Using `uv`:
 ```bash
+uv sync --extra dev
 ```
+Using `pip`:
 ```bash
+python -m pip install -e ".[dev]"
 ```
+### 2. Configure environment variables
 ```bash
+cp .env.example .env
 ```
+At minimum, configure:
 ```bash
+API_BASE_URL=https://openrouter.ai/api/v1
+MODEL_NAME=openai/gpt-oss-120b
+HF_TOKEN=your_provider_key
 ```
+### 3. Run the test and validation suite
 ```bash
+pytest -q tests
+openenv validate .
 python inference.py
 ```
+### 4. Start the server locally
 ```bash
+uvicorn server.app:app --host 0.0.0.0 --port 8000
 ```
+## Docker
+Build and run the root Docker image:
 ```bash
+docker build -t chargebackops .
+docker run --rm -p 8000:8000 --env-file .env chargebackops
 ```
+Once the container is running:
 ```bash
+curl http://localhost:8000/
+curl http://localhost:8000/tasks
+curl http://localhost:8000/health
 ```
+## Hugging Face Spaces Deployment
+ChargebackOps is configured as a Docker Space through the YAML frontmatter in this README.
+Recommended deployment steps:
+1. Create a new Hugging Face Space with `Docker` as the SDK.
+2. Push this repository to the Space.
+3. Add the runtime variables in Space Settings:
+   - `API_BASE_URL`
+   - `MODEL_NAME`
+   - `HF_TOKEN`
+4. If using OpenRouter, add:
+   - `OPENROUTER_HTTP_REFERER=https://your-space-name.hf.space`
+   - `OPENROUTER_APP_TITLE=ChargebackOps`
+5. Verify:
+   - `/`
+   - `/health`
+   - `/tasks`
+   - `/docs`
+   - `/baseline`
+## Validation Checklist
+- `pytest -q tests`
+- `openenv validate .`
+- `python inference.py`
+- `docker build -t chargebackops .`
+- `docker run --rm -p 8000:8000 --env-file .env chargebackops`
+## Project Layout
 ```text
+.
 ├── baseline_runner.py
 ├── client.py
 ├── grading.py
+├── inference.py
 ├── models.py
 ├── openenv.yaml
 ├── server/
 │   ├── app.py
+│   └── chargeback_ops_environment.py
+├── simulation.py
 └── tests/
 ```
 ## Notes
+- This is a synthetic benchmark environment, not a live payments integration.
+- The world state is deterministic by design so graders remain reproducible.
+- Live model quality still depends on the quota and reliability of the configured provider.

baseline_runner.py CHANGED Viewed

@@ -30,11 +30,11 @@ except ImportError:  # pragma: no cover
 if load_dotenv is not None:  # pragma: no cover
     load_dotenv()
-DEFAULT_PROVIDER = "groq"
 MAX_LLM_CANDIDATES = 4
 MAX_PROVIDER_RESPONSE_TOKENS = 80
 DEFAULT_MODELS = {
-    "openrouter": "nvidia/nemotron-3-super-120b-a12b:free",
     "groq": "llama-3.3-70b-versatile",
     "openai": "gpt-5-mini",
     "anthropic": "claude-3-5-haiku-latest",
@@ -567,7 +567,7 @@ def _resolve_provider(
     chosen_provider = (provider or os.getenv("BASELINE_PROVIDER") or DEFAULT_PROVIDER).lower()
     chosen_model = model_name or os.getenv("BASELINE_MODEL") or DEFAULT_MODELS.get(
         chosen_provider,
-        "nvidia/nemotron-3-super-120b-a12b:free",
     )
     return ProviderConfig(provider=chosen_provider, model_name=chosen_model)

 if load_dotenv is not None:  # pragma: no cover
     load_dotenv()
+DEFAULT_PROVIDER = "openrouter"
 MAX_LLM_CANDIDATES = 4
 MAX_PROVIDER_RESPONSE_TOKENS = 80
 DEFAULT_MODELS = {
+    "openrouter": "openai/gpt-oss-120b",
     "groq": "llama-3.3-70b-versatile",
     "openai": "gpt-5-mini",
     "anthropic": "claude-3-5-haiku-latest",
     chosen_provider = (provider or os.getenv("BASELINE_PROVIDER") or DEFAULT_PROVIDER).lower()
     chosen_model = model_name or os.getenv("BASELINE_MODEL") or DEFAULT_MODELS.get(
         chosen_provider,
+        "openai/gpt-oss-120b",
     )
     return ProviderConfig(provider=chosen_provider, model_name=chosen_model)

scripts/live_provider_audit.py DELETED Viewed

@@ -1,34 +0,0 @@
-"""Live-provider audit for ChargebackOps."""
-from __future__ import annotations
-import json
-import os
-import sys
-from pathlib import Path
-PROJECT_ROOT = Path(__file__).resolve().parents[1]
-if str(PROJECT_ROOT) not in sys.path:
-    sys.path.insert(0, str(PROJECT_ROOT))
-from baseline_runner import run_baseline
-from inference import run_inference
-def main() -> None:
-    report = {
-        "config": {
-            "baseline_provider": os.getenv("BASELINE_PROVIDER"),
-            "baseline_model": os.getenv("BASELINE_MODEL"),
-            "api_base_url": os.getenv("API_BASE_URL"),
-            "model_name": os.getenv("MODEL_NAME"),
-            "strict_llm_mode": os.getenv("STRICT_LLM_MODE", ""),
-        },
-        "baseline": run_baseline().model_dump(),
-        "inference": run_inference().model_dump(),
-    }
-    print(json.dumps(report, indent=2))
-if __name__ == "__main__":
-    main()

scripts/problem_statement_audit.py DELETED Viewed

@@ -1,268 +0,0 @@
-"""Requirement-focused audit for the ChargebackOps submission."""
-from __future__ import annotations
-import json
-import os
-import shutil
-import subprocess
-import sys
-from contextlib import contextmanager
-from pathlib import Path
-PROJECT_ROOT = Path(__file__).resolve().parents[1]
-if str(PROJECT_ROOT) not in sys.path:
-    sys.path.insert(0, str(PROJECT_ROOT))
-from baseline_runner import _heuristic_pick, candidate_actions
-from grading import grade_episode
-from inference import run_inference
-from models import ChargebackOpsAction
-from server.app import baseline, tasks
-from server.chargeback_ops_environment import ChargebackOpsEnvironment
-from simulation import get_task, list_tasks
-def _run_heuristic_episode(task_id: str) -> dict[str, float]:
-    env = ChargebackOpsEnvironment()
-    observation = env.reset(task_id=task_id)
-    total_reward = 0.0
-    while not observation.done:
-        candidates = candidate_actions(observation.model_dump())
-        observation = env.step(_heuristic_pick(candidates).action)
-        total_reward += observation.reward or 0.0
-    assert observation.grader_report is not None
-    return {
-        "reward": round(total_reward, 4),
-        "score": observation.grader_report.normalized_score,
-    }
-def _run_bad_episode(task_id: str) -> dict[str, float]:
-    env = ChargebackOpsEnvironment()
-    observation = env.reset(task_id=task_id)
-    total_reward = 0.0
-    while not observation.done:
-        if observation.selected_case_id is None:
-            open_case = next(case for case in observation.queue if case.status == "open")
-            action = ChargebackOpsAction(action_type="select_case", case_id=open_case.case_id)
-        else:
-            case_id = observation.selected_case_id
-            visible_case = observation.visible_case
-            if visible_case and visible_case.current_strategy is None:
-                action = ChargebackOpsAction(
-                    action_type="set_strategy",
-                    case_id=case_id,
-                    strategy="accept_chargeback",
-                )
-            elif visible_case and visible_case.current_strategy == "accept_chargeback":
-                action = ChargebackOpsAction(
-                    action_type="resolve_case",
-                    case_id=case_id,
-                    strategy="accept_chargeback",
-                )
-            else:
-                action = ChargebackOpsAction(
-                    action_type="query_system",
-                    case_id=case_id,
-                    system_name="payment",
-                )
-        observation = env.step(action)
-        total_reward += observation.reward or 0.0
-    assert observation.grader_report is not None
-    return {
-        "reward": round(total_reward, 4),
-        "score": observation.grader_report.normalized_score,
-    }
-def _check(condition: bool, message: str, details: object | None = None) -> dict[str, object]:
-    return {
-        "pass": condition,
-        "message": message,
-        "details": details,
-    }
-@contextmanager
-def _deterministic_provider_disabled():
-    keys = [
-        "HF_TOKEN",
-        "API_BASE_URL",
-        "MODEL_NAME",
-        "OPENROUTER_API_KEY",
-        "OPENAI_API_KEY",
-        "ANTHROPIC_API_KEY",
-        "GROQ_API_KEY",
-        "STRICT_LLM_MODE",
-    ]
-    previous = {key: os.environ.get(key) for key in keys}
-    try:
-        for key in keys:
-            os.environ.pop(key, None)
-        yield
-    finally:
-        for key, value in previous.items():
-            if value is None:
-                os.environ.pop(key, None)
-            else:
-                os.environ[key] = value
-def main() -> None:
-    tasks_payload = tasks()
-    task_list = list_tasks()
-    openenv_cli = shutil.which("openenv")
-    openenv_validate = subprocess.run(
-        [openenv_cli or "openenv", "validate", "."],
-        cwd=PROJECT_ROOT,
-        capture_output=True,
-        text=True,
-    )
-    files = {
-        "Dockerfile": (PROJECT_ROOT / "Dockerfile").exists(),
-        "README.md": (PROJECT_ROOT / "README.md").exists(),
-        "openenv.yaml": (PROJECT_ROOT / "openenv.yaml").exists(),
-        "inference.py": (PROJECT_ROOT / "inference.py").exists(),
-    }
-    heuristic_hard = _run_heuristic_episode("queue_optimization_hard")
-    bad_hard = _run_bad_episode("queue_optimization_hard")
-    env = ChargebackOpsEnvironment()
-    reset_obs = env.reset(task_id="goods_not_received_easy")
-    initial_episode = env.state.episode_id
-    env.step(ChargebackOpsAction(action_type="select_case", case_id="CB-E1"))
-    reset_obs_2 = env.reset(task_id="fraud_signal_ambiguity")
-    env_reward = ChargebackOpsEnvironment()
-    env_reward.reset(task_id="fraud_signal_ambiguity")
-    env_reward.step(ChargebackOpsAction(action_type="select_case", case_id="CB-M1"))
-    helpful = env_reward.step(
-        ChargebackOpsAction(action_type="query_system", case_id="CB-M1", system_name="orders")
-    )
-    duplicate = env_reward.step(
-        ChargebackOpsAction(action_type="query_system", case_id="CB-M1", system_name="orders")
-    )
-    harmful = env_reward.step(
-        ChargebackOpsAction(
-            action_type="add_evidence",
-            case_id="CB-M1",
-            evidence_ids=["M1-AVS-MISMATCH"],
-        )
-    )
-    task = get_task("queue_optimization_hard")
-    env_grader = ChargebackOpsEnvironment()
-    env_grader.reset(task_id="queue_optimization_hard")
-    grader_a = grade_episode(
-        task,
-        env_grader._progress_by_case,  # type: ignore[attr-defined]
-        env_grader.state.step_count,
-        env_grader.state.episode_id or "",
-        completed=False,
-    )
-    grader_b = grade_episode(
-        task,
-        env_grader._progress_by_case,  # type: ignore[attr-defined]
-        env_grader.state.step_count,
-        env_grader.state.episode_id or "",
-        completed=False,
-    )
-    with _deterministic_provider_disabled():
-        baseline_payload = baseline()
-        inference_payload = run_inference()
-    source = (PROJECT_ROOT / "inference.py").read_text()
-    report = {
-        "task_catalog": _check(
-            len(task_list) >= 3 and {task.difficulty for task in task_list} == {"easy", "medium", "hard"},
-            "Environment exposes easy, medium, and hard tasks.",
-            [task.task_id for task in task_list],
-        ),
-        "grader_range": _check(
-            all(0.0 <= result.score <= 1.0 for result in baseline_payload.task_results),
-            "Grader returns scores in [0.0, 1.0] for all baseline tasks.",
-            [result.score for result in baseline_payload.task_results],
-        ),
-        "grader_determinism": _check(
-            grader_a.model_dump() == grader_b.model_dump(),
-            "Grader is deterministic on identical state.",
-            {"score": grader_a.normalized_score},
-        ),
-        "reward_signal": _check(
-            (helpful.reward or 0.0) > 0 and (duplicate.reward or 0.0) < 0 and (harmful.reward or 0.0) < 0,
-            "Reward provides partial progress and penalty signals.",
-            {
-                "helpful_reward": helpful.reward,
-                "duplicate_reward": duplicate.reward,
-                "harmful_reward": harmful.reward,
-            },
-        ),
-        "agent_separation": _check(
-            heuristic_hard["score"] > bad_hard["score"] and heuristic_hard["reward"] > bad_hard["reward"],
-            "A competent policy scores better than a bad control policy on the hard task.",
-            {"heuristic": heuristic_hard, "bad": bad_hard},
-        ),
-        "reset_state": _check(
-            reset_obs.done is False
-            and reset_obs_2.task_id == "fraud_signal_ambiguity"
-            and env.state.step_count == 0
-            and env.state.action_history == []
-            and env.state.episode_id != initial_episode,
-            "reset() produces a clean episode state.",
-            {
-                "first_task": reset_obs.task_id,
-                "second_task": reset_obs_2.task_id,
-                "step_count": env.state.step_count,
-            },
-        ),
-        "tasks_endpoint": _check(
-            len(tasks_payload.tasks) >= 3 and "properties" in tasks_payload.action_schema,
-            "/tasks exposes task metadata and a typed action schema.",
-            {"task_count": len(tasks_payload.tasks)},
-        ),
-        "inference_contract": _check(
-            all(token in source for token in ["from openai import OpenAI", "API_BASE_URL", "MODEL_NAME", "HF_TOKEN"]),
-            "inference.py uses the OpenAI client with the required environment variables.",
-            None,
-        ),
-        "openenv_validate": _check(
-            openenv_validate.returncode == 0,
-            "openenv validate passes.",
-            openenv_validate.stdout.strip() or openenv_validate.stderr.strip(),
-        ),
-        "baseline_runs": _check(
-            len(baseline_payload.task_results) == 3,
-            "Baseline endpoint runs across all tasks.",
-            {
-                "mode": baseline_payload.mode,
-                "provider_calls_attempted": baseline_payload.provider_calls_attempted,
-                "provider_calls_succeeded": baseline_payload.provider_calls_succeeded,
-            },
-        ),
-        "inference_runs": _check(
-            len(inference_payload.task_results) == 3,
-            "inference.py runs across all tasks.",
-            {
-                "mode": inference_payload.mode,
-                "provider_calls_attempted": inference_payload.provider_calls_attempted,
-                "provider_calls_succeeded": inference_payload.provider_calls_succeeded,
-            },
-        ),
-        "required_files": _check(
-            all(files.values()),
-            "Submission-critical files exist.",
-            files,
-        ),
-    }
-    report["all_passed"] = all(item["pass"] for item in report.values())
-    print(json.dumps(report, indent=2))
-if __name__ == "__main__":
-    main()

scripts/run_baseline.py DELETED Viewed

@@ -1,16 +0,0 @@
-"""CLI wrapper for the ChargebackOps baseline."""
-from __future__ import annotations
-import sys
-from pathlib import Path
-PROJECT_ROOT = Path(__file__).resolve().parents[1]
-if str(PROJECT_ROOT) not in sys.path:
-    sys.path.insert(0, str(PROJECT_ROOT))
-from baseline_runner import main
-if __name__ == "__main__":
-    main()