Kartik Goyal committed on
Commit ·
47fa380
1
Parent(s): daa0358
improved logic
Browse files- .gitignore +35 -0
- README.md +175 -74
- apps/start.sh +24 -3
- dockerfile +9 -11
- grpo_train.py +116 -38
- inference.py +28 -7
- pyproject.toml +19 -2
- requirements.txt +2 -1
- server/app.py +5 -9
.gitignore
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.egg-info/
|
| 6 |
+
*.egg
|
| 7 |
+
build/
|
| 8 |
+
dist/
|
| 9 |
+
.eggs/
|
| 10 |
+
|
| 11 |
+
# Virtual environments
|
| 12 |
+
.venv/
|
| 13 |
+
venv/
|
| 14 |
+
env/
|
| 15 |
+
.env
|
| 16 |
+
|
| 17 |
+
# Editor / OS
|
| 18 |
+
.vscode/
|
| 19 |
+
.idea/
|
| 20 |
+
.DS_Store
|
| 21 |
+
Thumbs.db
|
| 22 |
+
|
| 23 |
+
# Project-specific
|
| 24 |
+
outputs/
|
| 25 |
+
AD_sandbox.zip
|
| 26 |
+
*.log
|
| 27 |
+
debug-*.log
|
| 28 |
+
checkpoint-*/
|
| 29 |
+
|
| 30 |
+
# Notebooks
|
| 31 |
+
.ipynb_checkpoints/
|
| 32 |
+
|
| 33 |
+
# Cached models / datasets
|
| 34 |
+
.cache/
|
| 35 |
+
hf_cache/
|
README.md
CHANGED
|
@@ -1,135 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
-
#
|
| 3 |
|
| 4 |
-
>
|
|
|
|
| 5 |
|
| 6 |

|
| 7 |

|
| 8 |
-
![
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
---
|
| 11 |
|
| 12 |
-
##
|
| 13 |
-
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
|
|
|
| 21 |
|
| 22 |
---
|
| 23 |
|
| 24 |
-
##
|
| 25 |
-
|
|
|
|
| 26 |
|
| 27 |
-
### 🔄 Interaction Flow
|
| 28 |
```mermaid
|
| 29 |
graph LR
|
| 30 |
-
subgraph "
|
| 31 |
-
A[
|
| 32 |
end
|
| 33 |
|
| 34 |
-
subgraph "MetaGuard Core"
|
| 35 |
B(Environment Hub :8000)
|
| 36 |
end
|
| 37 |
|
| 38 |
-
subgraph "
|
| 39 |
C[[Regulatory API :8001]]
|
| 40 |
D[[CRM API :8002]]
|
| 41 |
E[[Audit API :8003]]
|
| 42 |
end
|
| 43 |
|
| 44 |
-
A -- "
|
| 45 |
-
B -- "
|
| 46 |
-
B -- "
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
B -- "6. Immutable Log" --> E
|
| 55 |
```
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
|
| 59 |
-
|
|
| 60 |
-
|
|
| 61 |
-
|
|
| 62 |
-
|
|
| 63 |
-
|
|
|
|
|
|
|
| 64 |
|
| 65 |
---
|
| 66 |
|
| 67 |
-
##
|
| 68 |
-
We utilize **Group Relative Policy Optimization (GRPO)** to train the agent. Unlike standard LLMs, our agent learns an optimal **Action Sequence**:
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
---
|
| 77 |
|
| 78 |
-
##
|
| 79 |
-
|
|
|
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
* **Trace:** `REGULATIONS` ➔ `IMAGE_SCAN` ➔ `CRM_CHECK` ➔ `AUDIT_LOG` ➔ `REJECT`.
|
| 89 |
-
* **Final Compliance Rating:** `9/10` 🌟
|
| 90 |
|
| 91 |
---
|
| 92 |
|
| 93 |
-
##
|
|
|
|
|
|
|
| 94 |
|
| 95 |
-
|
|
| 96 |
-
|
|
| 97 |
-
|
|
| 98 |
-
|
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
---
|
| 102 |
|
| 103 |
-
##
|
| 104 |
|
| 105 |
-
### 1.
|
| 106 |
```bash
|
| 107 |
-
git clone
|
| 108 |
cd meta-ad-policy-sandbox
|
| 109 |
-
pip install -
|
| 110 |
```
|
| 111 |
|
| 112 |
-
### 2. Launch
|
| 113 |
-
|
| 114 |
```bash
|
| 115 |
-
python apps/regulatory_api.py
|
| 116 |
-
python apps/crm_api.py
|
| 117 |
-
python apps/audit_api.py
|
|
|
|
| 118 |
```
|
| 119 |
|
| 120 |
-
### 3. Run the
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
```bash
|
| 122 |
python demo.py
|
| 123 |
```
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
---
|
| 126 |
|
| 127 |
-
##
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
|
|
|
| 131 |
|
| 132 |
---
|
| 133 |
|
| 134 |
-
##
|
| 135 |
-
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: MetaGuard Ad Policy Sandbox
|
| 3 |
+
emoji: 🛡
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
+
pinned: false
|
| 9 |
+
license: mit
|
| 10 |
+
---
|
| 11 |
|
| 12 |
+
# MetaGuard: A Multi-App RL Environment for Enterprise Ad Policy Compliance
|
| 13 |
|
| 14 |
+
> An OpenEnv-compatible reinforcement learning environment that forces an LLM agent
|
| 15 |
+
> to do **real investigative work** across multiple enterprise APIs — not pattern-match.
|
| 16 |
|
| 17 |

|
| 18 |

|
| 19 |
+

|
| 20 |
+
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
## TL;DR for Judges
|
| 24 |
+
|
| 25 |
+
MetaGuard is a **partially observable, multi-application RL environment** modelled
|
| 26 |
+
after a real enterprise ad-moderation workflow. The agent (LLM) must orchestrate
|
| 27 |
+
calls across 4 microservices (Regulatory, CRM, Audit, Core), update its internal
|
| 28 |
+
beliefs based on each tool result, and produce a defensible decision in the
|
| 29 |
+
correct procedural order — or get penalised.
|
| 30 |
+
|
| 31 |
+
| Theme 3.1 requirement | How MetaGuard satisfies it |
|
| 32 |
+
| --- | --- |
|
| 33 |
+
| Real interaction with tools / APIs / dynamic systems | 4 independent FastAPI microservices on ports 8000-8003 |
|
| 34 |
+
| "Real hard work, not shortcuts" | Procedural penalties + ambiguity tasks force investigation |
|
| 35 |
+
| Maintain consistent internal state | Env tracks `actions_taken`, `signals`, `api_failed`, `trace` |
|
| 36 |
+
| Update beliefs based on outcomes | `signals` dict (`risk_score`, `policy_confidence`, `image_flag`, `landing_flag`) is populated only as the agent acts |
|
| 37 |
+
| Orchestrate multi-step workflows | `REQUIRED_BEFORE_TERMINAL` enforces `query_regulations` → `submit_audit` → decide |
|
| 38 |
+
| Partially observable world | Agent sees only what its actions reveal; no global view |
|
| 39 |
+
| **Scaler AI Labs bonus** — Multi-App RL for Enterprise Workflows | 4-app architecture mirrors a real compliance stack with business-rule nuance |
|
| 40 |
|
| 41 |
---
|
| 42 |
|
| 43 |
+
## The Problem
|
| 44 |
+
|
| 45 |
+
Single-shot LLM moderation is brittle in enterprise settings:
|
| 46 |
|
| 47 |
+
- **No traceability** — no record of *why* a decision was made.
|
| 48 |
+
- **No context** — no advertiser history, no jurisdiction-specific rules.
|
| 49 |
+
- **No risk gating** — high-risk content can be approved without an audit trail.
|
| 50 |
|
| 51 |
+
Real compliance teams follow a **procedure**: check policy → inspect creative →
|
| 52 |
+
verify the advertiser → log the audit → only then decide. MetaGuard makes the
|
| 53 |
+
agent learn that procedure end-to-end.
|
| 54 |
|
| 55 |
---
|
| 56 |
|
| 57 |
+
## Architecture
|
| 58 |
+
|
| 59 |
+
A 4-service ecosystem that mirrors a real enterprise compliance stack.
|
| 60 |
|
|
|
|
| 61 |
```mermaid
|
| 62 |
graph LR
|
| 63 |
+
subgraph "Agent"
|
| 64 |
+
A[LLM Policy Agent]
|
| 65 |
end
|
| 66 |
|
| 67 |
+
subgraph "MetaGuard Core (OpenEnv)"
|
| 68 |
B(Environment Hub :8000)
|
| 69 |
end
|
| 70 |
|
| 71 |
+
subgraph "Enterprise APIs"
|
| 72 |
C[[Regulatory API :8001]]
|
| 73 |
D[[CRM API :8002]]
|
| 74 |
E[[Audit API :8003]]
|
| 75 |
end
|
| 76 |
|
| 77 |
+
A -- "action_type, reasoning" --> B
|
| 78 |
+
B -- "GET /regulations/{cat}" --> C
|
| 79 |
+
B -- "GET /advertiser/{id}" --> D
|
| 80 |
+
B -- "POST /log" --> E
|
| 81 |
+
|
| 82 |
+
C -- "policy + violations" --> B
|
| 83 |
+
D -- "risk_score + history" --> B
|
| 84 |
+
E -- "audit_id" --> B
|
| 85 |
+
|
| 86 |
+
B -- "obs + reward + signals" --> A
|
|
|
|
| 87 |
```
|
| 88 |
|
| 89 |
+
| Service | Port | Responsibility | Real-world analog |
|
| 90 |
+
| :--- | :--- | :--- | :--- |
|
| 91 |
+
| Core Env | `:8000` | State orchestration, reward shaping | Compliance workflow engine |
|
| 92 |
+
| Regulatory API | `:8001` | Category-specific policy lookup with random outages | Legal / policy database |
|
| 93 |
+
| CRM API | `:8002` | Advertiser trust score and prior-violation history | Salesforce / advertiser CRM |
|
| 94 |
+
| Audit API | `:8003` | Immutable audit-log writes | SOX-compliant audit ledger |
|
| 95 |
+
|
| 96 |
+
Each external API has a **10% random failure rate** to simulate real network
|
| 97 |
+
unreliability — the agent must learn to retry.
|
| 98 |
|
| 99 |
---
|
| 100 |
|
| 101 |
+
## Action Space
|
|
|
|
| 102 |
|
| 103 |
+
8 actions span the full investigative procedure:
|
| 104 |
+
|
| 105 |
+
| Action | Calls service | Purpose |
|
| 106 |
+
| --- | --- | --- |
|
| 107 |
+
| `query_regulations` | Regulatory API | Look up category-specific policy |
|
| 108 |
+
| `analyze_image` | (internal VLM stub) | Inspect creative for visual violations |
|
| 109 |
+
| `check_advertiser_history` | CRM API | Pull advertiser trust score |
|
| 110 |
+
| `request_landing_page` | (internal) | Check landing-page domain age + risk keywords |
|
| 111 |
+
| `request_id_verification` | (internal) | Targeting / age-gate check |
|
| 112 |
+
| `submit_audit` | Audit API | Write immutable audit record |
|
| 113 |
+
| `approve` | terminal | Final approval decision |
|
| 114 |
+
| `reject` | terminal | Final rejection decision |
|
| 115 |
|
| 116 |
---
|
| 117 |
|
| 118 |
+
## Business-Rule Nuances (the "hard work" criteria)
|
| 119 |
+
|
| 120 |
+
The env penalises shortcuts and rewards real reasoning. Specifically:
|
| 121 |
|
| 122 |
+
1. **Phase ordering.** `query_regulations` MUST come first. Any other action
|
| 123 |
+
attempted first returns a `-0.2` reward and is **not registered** as taken.
|
| 124 |
+
2. **Audit gate.** `submit_audit` is required before any `approve` / `reject`.
|
| 125 |
+
Skipping it costs `-0.2` from the terminal reward.
|
| 126 |
+
3. **API-failure recovery.** External services fail 10% of the time. Recovering
|
| 127 |
+
(retrying after a failure) earns `+0.3`; ignoring the failure costs `-0.3`.
|
| 128 |
+
4. **Risk-aware approvals.** Approving high-risk content (`risk_score > 0.7`
|
| 129 |
+
AND `policy_confidence > 0.6`) costs `-0.5`.
|
| 130 |
+
5. **Ambiguity enforcement.** When `policy_confidence < 0.6`, the agent MUST
|
| 131 |
+
gather more signals (CRM or landing-page) or take a `-0.4` penalty.
|
| 132 |
+
6. **Step penalty.** Every action costs `-0.05` to discourage padding.
|
| 133 |
+
7. **Terminal correctness.** `+1.0` for the right decision, `-1.0` for wrong.
|
| 134 |
+
8. **Step cap.** Hard cap at 8 steps; exceeding it costs `-0.5`.
|
| 135 |
|
| 136 |
+
These rules together form a POMDP (partially observable Markov decision process) in which greedy or
|
| 137 |
+
single-shot strategies provably under-perform a procedural agent.
|
|
|
|
|
|
|
| 138 |
|
| 139 |
---
|
| 140 |
|
| 141 |
+
## Task Suite
|
| 142 |
+
|
| 143 |
+
10 task families exposed via `task_id` (9 are listed below; `task_5` is not documented here):
|
| 144 |
|
| 145 |
+
| ID | Family | What it tests |
|
| 146 |
+
| --- | --- | --- |
|
| 147 |
+
| `task_1_healthcare` | Unverified medical claims, prescription bypass | Domain knowledge + policy lookup |
|
| 148 |
+
| `task_2_financial` | Predatory lending, guaranteed-returns scams | High-stakes risk gating |
|
| 149 |
+
| `task_3_multimodal` | Violation hidden in image, clean text | Forces `analyze_image` |
|
| 150 |
+
| `task_4_targeting` | Adult financial product targeting minors | Forces `request_id_verification` |
|
| 151 |
+
| `task_6_conflict` | Clean text + risky advertiser | Conflict resolution |
|
| 152 |
+
| `task_7_ambiguous` | Low policy confidence | Forces extra signal gathering |
|
| 153 |
+
| `task_8_adversarial` | Fine-print loophole | Adversarial robustness |
|
| 154 |
+
| `task_9_dependency_trap` | Mismatch between text and image | Multi-source verification |
|
| 155 |
+
| `task_10_failure` | Deterministic API failure on step 1 | Recovery behavior |
|
| 156 |
|
| 157 |
---
|
| 158 |
|
| 159 |
+
## Quick Start
|
| 160 |
|
| 161 |
+
### 1. Install
|
| 162 |
```bash
|
| 163 |
+
git clone https://github.com/Parth380/meta-ad-policy-sandbox.git
|
| 164 |
cd meta-ad-policy-sandbox
|
| 165 |
+
pip install -e .
|
| 166 |
```
|
| 167 |
|
| 168 |
+
### 2. Launch the 4-service stack
|
| 169 |
+
Four terminals (or use `apps/start_all.bat` on Windows):
|
| 170 |
```bash
|
| 171 |
+
python apps/regulatory_api.py # :8001
|
| 172 |
+
python apps/crm_api.py # :8002
|
| 173 |
+
python apps/audit_api.py # :8003
|
| 174 |
+
python -m uvicorn server.app:app --host 0.0.0.0 --port 8000 # :8000
|
| 175 |
```
|
| 176 |
|
| 177 |
+
### 3. Run the inference benchmark
|
| 178 |
+
Uses an LLM through the HF Router and emits the official `[START]/[STEP]/[END]`
|
| 179 |
+
grading log lines.
|
| 180 |
+
```bash
|
| 181 |
+
export HF_TOKEN=hf_xxxxxxxx # your Hugging Face token
|
| 182 |
+
export MODEL_NAME=meta-llama/Meta-Llama-3-8B-Instruct
|
| 183 |
+
python inference.py
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
### 4. Run the local naive-vs-procedural demo
|
| 187 |
```bash
|
| 188 |
python demo.py
|
| 189 |
```
|
| 190 |
|
| 191 |
+
### 5. (Optional) Train an agent with GRPO
|
| 192 |
+
Requires a CUDA GPU. Trains a LoRA adapter on top of `unsloth/Llama-3.1-8B-Instruct`
|
| 193 |
+
using the env itself as the reward function.
|
| 194 |
+
```bash
|
| 195 |
+
python grpo_train.py
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
---
|
| 199 |
+
|
| 200 |
+
## Repository Layout
|
| 201 |
+
|
| 202 |
+
```
|
| 203 |
+
meta-ad-policy-sandbox/
|
| 204 |
+
├── apps/
|
| 205 |
+
│ ├── regulatory_api.py # FastAPI :8001 — policy DB
|
| 206 |
+
│ ├── crm_api.py # FastAPI :8002 — advertiser CRM
|
| 207 |
+
│ ├── audit_api.py # FastAPI :8003 — audit log
|
| 208 |
+
│ └── start_all.bat # Windows: launch all 4 at once
|
| 209 |
+
├── server/
|
| 210 |
+
│ └── app.py # OpenEnv FastAPI server :8000
|
| 211 |
+
├── src/
|
| 212 |
+
│ ├── environment.py # AdPolicyEnvironment — core RL logic
|
| 213 |
+
│ ├── models.py # Pydantic schemas (AdAction, AdObservation, AdState)
|
| 214 |
+
│ └── generator.py # AdGenerator — task-aware ad sampling
|
| 215 |
+
├── inference.py # LLM-via-HF-Router benchmark with grading logs
|
| 216 |
+
├── demo.py # Local naive-vs-procedural demo
|
| 217 |
+
├── grpo_train.py # GRPO + LoRA training script
|
| 218 |
+
├── test_env.py # Smoke test of env logic
|
| 219 |
+
├── openenv.yaml # OpenEnv manifest
|
| 220 |
+
├── dockerfile # Container build for HF Spaces deployment
|
| 221 |
+
└── validate.sh # Validator for HF Space + openenv submission
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
---
|
| 225 |
|
| 226 |
+
## Hackathon Submission
|
| 227 |
+
|
| 228 |
+
- **Theme:** 3.1 Professional Tasks — Multi-Step Reasoning & Policy Compliance
|
| 229 |
+
- **Bonus Track:** Scaler AI Labs — Multi-App RL Environment for Enterprise Workflows
|
| 230 |
+
- **Team:** Parth Singhal, Mehakveer Kaur, Kartik Goyal
|
| 231 |
|
| 232 |
---
|
| 233 |
|
| 234 |
+
## License
|
| 235 |
+
|
| 236 |
+
MIT.
|
apps/start.sh
CHANGED
|
@@ -1,9 +1,30 @@
|
|
| 1 |
#!/bin/bash
|
|
|
|
| 2 |
|
| 3 |
-
# Start the background microservices
|
| 4 |
python apps/regulatory_api.py &
|
|
|
|
| 5 |
python apps/crm_api.py &
|
|
|
|
| 6 |
python apps/audit_api.py &
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
+
set -e
|
| 3 |
|
|
|
|
| 4 |
python apps/regulatory_api.py &
|
| 5 |
+
REG_PID=$!
|
| 6 |
python apps/crm_api.py &
|
| 7 |
+
CRM_PID=$!
|
| 8 |
python apps/audit_api.py &
|
| 9 |
+
AUD_PID=$!
|
| 10 |
|
| 11 |
+
wait_for_service() {
|
| 12 |
+
local url=$1
|
| 13 |
+
local name=$2
|
| 14 |
+
for i in $(seq 1 30); do
|
| 15 |
+
if curl -sf "$url" > /dev/null 2>&1; then
|
| 16 |
+
echo "[start.sh] $name ready"
|
| 17 |
+
return 0
|
| 18 |
+
fi
|
| 19 |
+
sleep 1
|
| 20 |
+
done
|
| 21 |
+
echo "[start.sh] WARNING: $name did not become ready within 30s"
|
| 22 |
+
return 1
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
wait_for_service "http://localhost:8001/health" "regulatory_api"
|
| 26 |
+
wait_for_service "http://localhost:8002/health" "crm_api"
|
| 27 |
+
wait_for_service "http://localhost:8003/health" "audit_api"
|
| 28 |
+
|
| 29 |
+
echo "[start.sh] All microservices up. Launching environment server on :8000"
|
| 30 |
+
exec uvicorn server.app:app --host 0.0.0.0 --port 8000
|
dockerfile
CHANGED
|
@@ -1,21 +1,19 @@
|
|
| 1 |
-
# 1. Use a lightweight Python image
|
| 2 |
FROM python:3.11-slim
|
| 3 |
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
| 5 |
WORKDIR /app
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
|
| 10 |
-
|
| 11 |
-
RUN pip install --no-cache-dir .
|
| 12 |
-
RUN pip install -r requirements.txt
|
| 13 |
|
| 14 |
-
# 5. Make the startup script executable (Bypasses Windows permission errors)
|
| 15 |
RUN chmod +x apps/start.sh
|
| 16 |
|
| 17 |
-
# 6. Expose the port the main server uses
|
| 18 |
EXPOSE 8000
|
| 19 |
|
| 20 |
-
|
| 21 |
-
CMD ["./apps/start.sh"]
|
|
|
|
|
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
| 3 |
+
RUN apt-get update \
|
| 4 |
+
&& apt-get install -y --no-install-recommends curl \
|
| 5 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 6 |
+
|
| 7 |
WORKDIR /app
|
| 8 |
|
| 9 |
+
COPY pyproject.toml requirements.txt ./
|
| 10 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 11 |
|
| 12 |
+
COPY . .
|
| 13 |
+
RUN pip install --no-cache-dir --no-deps .
|
|
|
|
| 14 |
|
|
|
|
| 15 |
RUN chmod +x apps/start.sh
|
| 16 |
|
|
|
|
| 17 |
EXPOSE 8000
|
| 18 |
|
| 19 |
+
CMD ["./apps/start.sh"]
|
|
|
grpo_train.py
CHANGED
|
@@ -98,95 +98,167 @@ def extract_json(text):
|
|
| 98 |
# =========================
|
| 99 |
|
| 100 |
BASE_SCENARIOS = [
|
| 101 |
-
#
|
| 102 |
{
|
| 103 |
"task_id": "task_1_healthcare",
|
| 104 |
-
"text": "
|
| 105 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
},
|
| 107 |
|
| 108 |
-
#
|
| 109 |
{
|
| 110 |
"task_id": "task_1_healthcare",
|
| 111 |
-
"text": "
|
|
|
|
| 112 |
"setup_actions": [
|
| 113 |
-
{"action_type": "query_regulations", "reasoning": "
|
| 114 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
},
|
| 116 |
|
| 117 |
-
#
|
| 118 |
{
|
| 119 |
"task_id": "task_2_financial",
|
| 120 |
-
"text": "
|
|
|
|
| 121 |
"setup_actions": [
|
| 122 |
-
{"action_type": "query_regulations", "reasoning": "
|
| 123 |
-
{"action_type": "check_advertiser_history", "reasoning": "
|
| 124 |
-
]
|
| 125 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
]
|
| 127 |
|
| 128 |
-
|
| 129 |
-
rows = []
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
rows.append({
|
| 142 |
"prompt": prompt,
|
| 143 |
"task_id": s["task_id"],
|
| 144 |
-
"setup_actions": s["setup_actions"]
|
| 145 |
})
|
| 146 |
-
|
| 147 |
-
return Dataset.from_list(rows * 20) # small repeat
|
| 148 |
|
| 149 |
# =========================
|
| 150 |
# REWARD FUNCTION (FIXED)
|
| 151 |
# =========================
|
| 152 |
|
| 153 |
def reward_environment(prompts, completions, task_id=None, setup_actions=None, **kwargs):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
client = EnvClient(ENV_URL)
|
| 155 |
-
|
| 156 |
rewards = []
|
| 157 |
|
| 158 |
for completion, t_id, setup in zip(completions, task_id, setup_actions):
|
| 159 |
-
|
| 160 |
parsed = extract_json(completion)
|
| 161 |
-
|
| 162 |
if not parsed:
|
| 163 |
rewards.append(-1.0)
|
| 164 |
continue
|
| 165 |
|
| 166 |
action_type = parsed.get("action_type")
|
| 167 |
-
|
| 168 |
if action_type not in ALLOWED_ACTIONS:
|
| 169 |
rewards.append(-1.0)
|
| 170 |
continue
|
| 171 |
|
| 172 |
action = {
|
| 173 |
"action_type": action_type,
|
| 174 |
-
"reasoning": parsed.get("reasoning", "")
|
| 175 |
}
|
| 176 |
|
| 177 |
try:
|
| 178 |
client.reset(t_id)
|
| 179 |
-
|
| 180 |
-
# 🔥 FAST-FORWARD STATE
|
| 181 |
for s in setup:
|
| 182 |
safe_step(client, s)
|
| 183 |
|
| 184 |
result = safe_step(client, action)
|
|
|
|
|
|
|
| 185 |
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
rewards.append(-0.3)
|
| 191 |
|
| 192 |
return rewards
|
|
@@ -204,9 +276,15 @@ model, tokenizer = FastLanguageModel.from_pretrained(
|
|
| 204 |
model = FastLanguageModel.get_peft_model(
|
| 205 |
model,
|
| 206 |
r=16,
|
| 207 |
-
target_modules=[
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
| 209 |
lora_dropout=0,
|
|
|
|
|
|
|
|
|
|
| 210 |
)
|
| 211 |
|
| 212 |
# =========================
|
|
|
|
| 98 |
# =========================
|
| 99 |
|
| 100 |
BASE_SCENARIOS = [
|
| 101 |
+
# Phase 1 — Fresh state, expected: query_regulations
|
| 102 |
{
|
| 103 |
"task_id": "task_1_healthcare",
|
| 104 |
+
"text": "Healthcare ad: 'miracle supplement cures disease'. No actions taken yet.",
|
| 105 |
+
"actions_already_taken": [],
|
| 106 |
+
"setup_actions": [],
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"task_id": "task_2_financial",
|
| 110 |
+
"text": "Financial ad: 'guaranteed 500% returns, zero risk'. No actions taken yet.",
|
| 111 |
+
"actions_already_taken": [],
|
| 112 |
+
"setup_actions": [],
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"task_id": "task_3_multimodal",
|
| 116 |
+
"text": "Multimodal ad: image may contain hidden violation. No actions taken yet.",
|
| 117 |
+
"actions_already_taken": [],
|
| 118 |
+
"setup_actions": [],
|
| 119 |
},
|
| 120 |
|
| 121 |
+
# Phase 2 — Policy checked, expected: analyze_image OR check_advertiser_history
|
| 122 |
{
|
| 123 |
"task_id": "task_1_healthcare",
|
| 124 |
+
"text": "Healthcare ad: pharma product. Policy already queried.",
|
| 125 |
+
"actions_already_taken": ["query_regulations"],
|
| 126 |
"setup_actions": [
|
| 127 |
+
{"action_type": "query_regulations", "reasoning": "policy lookup"},
|
| 128 |
+
],
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"task_id": "task_3_multimodal",
|
| 132 |
+
"text": "Multimodal ad: image not yet inspected. Policy already queried.",
|
| 133 |
+
"actions_already_taken": ["query_regulations"],
|
| 134 |
+
"setup_actions": [
|
| 135 |
+
{"action_type": "query_regulations", "reasoning": "policy lookup"},
|
| 136 |
+
],
|
| 137 |
},
|
| 138 |
|
| 139 |
+
# Phase 3 — Policy + history checked, expected: submit_audit
|
| 140 |
{
|
| 141 |
"task_id": "task_2_financial",
|
| 142 |
+
"text": "Financial ad: investment scheme. Policy and advertiser history both checked.",
|
| 143 |
+
"actions_already_taken": ["query_regulations", "check_advertiser_history"],
|
| 144 |
"setup_actions": [
|
| 145 |
+
{"action_type": "query_regulations", "reasoning": "policy lookup"},
|
| 146 |
+
{"action_type": "check_advertiser_history", "reasoning": "trust score"},
|
| 147 |
+
],
|
| 148 |
+
},
|
| 149 |
+
|
| 150 |
+
# Phase 4 — Audit complete, expected: reject (high-risk) or approve (clean)
|
| 151 |
+
{
|
| 152 |
+
"task_id": "task_2_financial",
|
| 153 |
+
"text": "Financial ad: investment scheme. Policy, history, and audit all complete. Make final decision.",
|
| 154 |
+
"actions_already_taken": ["query_regulations", "check_advertiser_history", "submit_audit"],
|
| 155 |
+
"setup_actions": [
|
| 156 |
+
{"action_type": "query_regulations", "reasoning": "policy lookup"},
|
| 157 |
+
{"action_type": "check_advertiser_history", "reasoning": "trust score"},
|
| 158 |
+
{"action_type": "submit_audit", "reasoning": "audit log"},
|
| 159 |
+
],
|
| 160 |
+
},
|
| 161 |
]
|
| 162 |
|
| 163 |
+
PROMPT_TEMPLATE = """You are an enterprise Ad Policy Compliance Agent.
|
|
|
|
| 164 |
|
| 165 |
+
You MUST choose exactly ONE action_type from this list (any other value is invalid):
|
| 166 |
+
- query_regulations
|
| 167 |
+
- analyze_image
|
| 168 |
+
- check_advertiser_history
|
| 169 |
+
- submit_audit
|
| 170 |
+
- approve
|
| 171 |
+
- reject
|
| 172 |
|
| 173 |
+
REQUIRED PHASE ORDER:
|
| 174 |
+
1. query_regulations -> always first
|
| 175 |
+
2. analyze_image / check_advertiser_history -> gather signals
|
| 176 |
+
3. submit_audit -> always before final decision
|
| 177 |
+
4. approve OR reject -> only after audit
|
| 178 |
|
| 179 |
+
HARD RULES:
|
| 180 |
+
- NEVER repeat an action listed in `actions_already_taken`.
|
| 181 |
+
- Respond with ONLY a valid JSON object. No markdown, no prose.
|
| 182 |
+
|
| 183 |
+
Required format:
|
| 184 |
+
{{"action_type": "<one_of_the_actions_above>", "reasoning": "<short reason>"}}
|
| 185 |
+
|
| 186 |
+
Scenario: {text}
|
| 187 |
+
actions_already_taken: {actions_already_taken}
|
| 188 |
+
|
| 189 |
+
Your next action?"""
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def build_dataset():
|
| 193 |
+
rows = []
|
| 194 |
+
for s in BASE_SCENARIOS:
|
| 195 |
+
prompt = PROMPT_TEMPLATE.format(
|
| 196 |
+
text=s["text"],
|
| 197 |
+
actions_already_taken=json.dumps(s["actions_already_taken"]),
|
| 198 |
+
)
|
| 199 |
rows.append({
|
| 200 |
"prompt": prompt,
|
| 201 |
"task_id": s["task_id"],
|
| 202 |
+
"setup_actions": s["setup_actions"],
|
| 203 |
})
|
| 204 |
+
return Dataset.from_list(rows * 10) # 7 scenarios x 10 = 70 examples
|
|
|
|
| 205 |
|
| 206 |
# =========================
|
| 207 |
# REWARD FUNCTION (FIXED)
|
| 208 |
# =========================
|
| 209 |
|
| 210 |
def reward_environment(prompts, completions, task_id=None, setup_actions=None, **kwargs):
|
| 211 |
+
"""Shaped reward for GRPO.
|
| 212 |
+
|
| 213 |
+
Pure env reward is too sparse (mostly -0.05) to give clear gradients.
|
| 214 |
+
We add explicit shaping:
|
| 215 |
+
- invalid JSON / invalid action_type -> -1.0 (strong negative signal)
|
| 216 |
+
- valid action env REJECTS (wrong phase / API failure) -> -0.5
|
| 217 |
+
- valid action env ACCEPTS (advances state) -> +0.5 + env_reward
|
| 218 |
+
- terminal correct decision -> env_reward already contains +1.0 bonus
|
| 219 |
+
"""
|
| 220 |
client = EnvClient(ENV_URL)
|
|
|
|
| 221 |
rewards = []
|
| 222 |
|
| 223 |
for completion, t_id, setup in zip(completions, task_id, setup_actions):
|
|
|
|
| 224 |
parsed = extract_json(completion)
|
|
|
|
| 225 |
if not parsed:
|
| 226 |
rewards.append(-1.0)
|
| 227 |
continue
|
| 228 |
|
| 229 |
action_type = parsed.get("action_type")
|
|
|
|
| 230 |
if action_type not in ALLOWED_ACTIONS:
|
| 231 |
rewards.append(-1.0)
|
| 232 |
continue
|
| 233 |
|
| 234 |
action = {
|
| 235 |
"action_type": action_type,
|
| 236 |
+
"reasoning": parsed.get("reasoning", "format-compliant"),
|
| 237 |
}
|
| 238 |
|
| 239 |
try:
|
| 240 |
client.reset(t_id)
|
|
|
|
|
|
|
| 241 |
for s in setup:
|
| 242 |
safe_step(client, s)
|
| 243 |
|
| 244 |
result = safe_step(client, action)
|
| 245 |
+
env_reward = float(result.get("reward", -0.2))
|
| 246 |
+
status_msg = (result.get("status_message") or "").lower()
|
| 247 |
|
| 248 |
+
rejected = (
|
| 249 |
+
"api failure" in status_msg
|
| 250 |
+
or "invalid action" in status_msg
|
| 251 |
+
or "must call" in status_msg
|
| 252 |
+
)
|
| 253 |
|
| 254 |
+
if rejected:
|
| 255 |
+
shaped = -0.5
|
| 256 |
+
else:
|
| 257 |
+
shaped = 0.5 + env_reward
|
| 258 |
+
|
| 259 |
+
rewards.append(shaped)
|
| 260 |
+
|
| 261 |
+
except Exception:
|
| 262 |
rewards.append(-0.3)
|
| 263 |
|
| 264 |
return rewards
|
|
|
|
| 276 |
model = FastLanguageModel.get_peft_model(
|
| 277 |
model,
|
| 278 |
r=16,
|
| 279 |
+
target_modules=[
|
| 280 |
+
"q_proj", "k_proj", "v_proj", "o_proj",
|
| 281 |
+
"gate_proj", "up_proj", "down_proj",
|
| 282 |
+
],
|
| 283 |
+
lora_alpha=32,
|
| 284 |
lora_dropout=0,
|
| 285 |
+
bias="none",
|
| 286 |
+
use_gradient_checkpointing="unsloth",
|
| 287 |
+
random_state=3407,
|
| 288 |
)
|
| 289 |
|
| 290 |
# =========================
|
inference.py
CHANGED
|
@@ -8,7 +8,7 @@ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
|
| 8 |
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "dummy_local_token")
|
| 9 |
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Meta-Llama-3-8B-Instruct")
|
| 10 |
|
| 11 |
-
ENV_URL = "http://localhost:8000"
|
| 12 |
MAX_STEPS = 10
|
| 13 |
|
| 14 |
# 2. MANDATORY: Use OpenAI Client pointed at the HF Router
|
|
@@ -59,6 +59,12 @@ def get_llm_action(observation_data):
|
|
| 59 |
- approve
|
| 60 |
- reject
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
Response format:
|
| 63 |
{"action_type": "<action>", "reasoning": "<brief reason>"}
|
| 64 |
"""
|
|
@@ -99,7 +105,8 @@ def main() -> None:
|
|
| 99 |
rewards = []
|
| 100 |
steps_taken = 0
|
| 101 |
success = False
|
| 102 |
-
|
|
|
|
| 103 |
try:
|
| 104 |
# 1. Reset the environment
|
| 105 |
res = requests.post(f"{ENV_URL}/reset", json={"task_id": task_id})
|
|
@@ -122,16 +129,17 @@ def main() -> None:
|
|
| 122 |
"task_id": task_id,
|
| 123 |
"last_feedback": step_data.get("status_message", "No feedback yet."),
|
| 124 |
"step_count": steps_taken,
|
| 125 |
-
"
|
|
|
|
| 126 |
}
|
| 127 |
|
| 128 |
# Get action from LLM
|
| 129 |
action_payload = get_llm_action(llm_observation)
|
| 130 |
action_str = action_payload["action_type"]
|
| 131 |
if "Error code: 402" in action_payload.get("reasoning", ""):
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
# Execute action in environment
|
| 136 |
step_res = requests.post(f"{ENV_URL}/step", json={"action": action_payload})
|
| 137 |
step_data = step_res.json()
|
|
@@ -140,8 +148,21 @@ def main() -> None:
|
|
| 140 |
observation = step_data.get("observation", {})
|
| 141 |
done = step_data.get("done", False)
|
| 142 |
reward = step_data.get("reward", 0.0)
|
| 143 |
-
|
| 144 |
rewards.append(reward)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
log_step(step=steps_taken, action=action_str, reward=reward, done=done, error=None)
|
| 146 |
|
| 147 |
# 4. Final Scoring (Single Log)
|
|
|
|
| 8 |
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "dummy_local_token")
|
| 9 |
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Meta-Llama-3-8B-Instruct")
|
| 10 |
|
| 11 |
+
ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
|
| 12 |
MAX_STEPS = 10
|
| 13 |
|
| 14 |
# 2. MANDATORY: Use OpenAI Client pointed at the HF Router
|
|
|
|
| 59 |
- approve
|
| 60 |
- reject
|
| 61 |
|
| 62 |
+
HARD RULES:
|
| 63 |
+
- NEVER repeat an action listed in `actions_already_taken`.
|
| 64 |
+
- You MUST progress through the phase order. Do NOT call submit_audit or approve/reject
|
| 65 |
+
before the prerequisite phases are complete.
|
| 66 |
+
- Choose your action_type ONLY from the AVAILABLE ACTIONS list above. Any other value is invalid.
|
| 67 |
+
|
| 68 |
Response format:
|
| 69 |
{"action_type": "<action>", "reasoning": "<brief reason>"}
|
| 70 |
"""
|
|
|
|
| 105 |
rewards = []
|
| 106 |
steps_taken = 0
|
| 107 |
success = False
|
| 108 |
+
actions_taken_list: list = []
|
| 109 |
+
|
| 110 |
try:
|
| 111 |
# 1. Reset the environment
|
| 112 |
res = requests.post(f"{ENV_URL}/reset", json={"task_id": task_id})
|
|
|
|
| 129 |
"task_id": task_id,
|
| 130 |
"last_feedback": step_data.get("status_message", "No feedback yet."),
|
| 131 |
"step_count": steps_taken,
|
| 132 |
+
"actions_already_taken": actions_taken_list,
|
| 133 |
+
"ad_details": observation
|
| 134 |
}
|
| 135 |
|
| 136 |
# Get action from LLM
|
| 137 |
action_payload = get_llm_action(llm_observation)
|
| 138 |
action_str = action_payload["action_type"]
|
| 139 |
if "Error code: 402" in action_payload.get("reasoning", ""):
|
| 140 |
+
done = True
|
| 141 |
+
log_step(step=steps_taken, action=action_str, reward=0.0, done=True, error="API credits depleted")
|
| 142 |
+
break
|
| 143 |
# Execute action in environment
|
| 144 |
step_res = requests.post(f"{ENV_URL}/step", json={"action": action_payload})
|
| 145 |
step_data = step_res.json()
|
|
|
|
| 148 |
observation = step_data.get("observation", {})
|
| 149 |
done = step_data.get("done", False)
|
| 150 |
reward = step_data.get("reward", 0.0)
|
| 151 |
+
|
| 152 |
rewards.append(reward)
|
| 153 |
+
|
| 154 |
+
# Track only actions that actually advanced state. Skip API-failure
|
| 155 |
+
# / invalid-action / wrong-order cases so the agent is free to retry.
|
| 156 |
+
status_msg = (step_data.get("status_message") or "").lower()
|
| 157 |
+
action_failed = (
|
| 158 |
+
"api failure" in status_msg
|
| 159 |
+
or "retryable" in status_msg
|
| 160 |
+
or "invalid action" in status_msg
|
| 161 |
+
or "must call" in status_msg
|
| 162 |
+
)
|
| 163 |
+
if not action_failed and action_str not in actions_taken_list:
|
| 164 |
+
actions_taken_list.append(action_str)
|
| 165 |
+
|
| 166 |
log_step(step=steps_taken, action=action_str, reward=reward, done=done, error=None)
|
| 167 |
|
| 168 |
# 4. Final Scoring (Single Log)
|
pyproject.toml
CHANGED
|
@@ -6,14 +6,31 @@ build-backend = "setuptools.build_meta"
|
|
| 6 |
name = "meta-ad-policy-sandbox"
|
| 7 |
version = "0.2.3"
|
| 8 |
description = "Meta Ad-Policy RL Sandbox"
|
|
|
|
| 9 |
dependencies = [
|
| 10 |
"fastapi",
|
| 11 |
"uvicorn",
|
| 12 |
"pydantic",
|
| 13 |
"requests",
|
| 14 |
"openai",
|
| 15 |
-
"openenv-core>=0.2.0"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
]
|
| 17 |
|
| 18 |
[project.scripts]
|
| 19 |
-
server = "server.app:main"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
name = "meta-ad-policy-sandbox"
|
| 7 |
version = "0.2.3"
|
| 8 |
description = "Meta Ad-Policy RL Sandbox"
|
| 9 |
+
requires-python = ">=3.9"
|
| 10 |
dependencies = [
|
| 11 |
"fastapi",
|
| 12 |
"uvicorn",
|
| 13 |
"pydantic",
|
| 14 |
"requests",
|
| 15 |
"openai",
|
| 16 |
+
"openenv-core>=0.2.0",
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
[project.optional-dependencies]
|
| 20 |
+
train = [
|
| 21 |
+
"torch",
|
| 22 |
+
"datasets",
|
| 23 |
+
"trl",
|
| 24 |
+
"unsloth",
|
| 25 |
+
"accelerate",
|
| 26 |
+
"bitsandbytes",
|
| 27 |
+
"peft",
|
| 28 |
]
|
| 29 |
|
| 30 |
[project.scripts]
|
| 31 |
+
server = "server.app:main"
|
| 32 |
+
|
| 33 |
+
[tool.setuptools.packages.find]
|
| 34 |
+
where = ["."]
|
| 35 |
+
include = ["server*", "src*"]
|
| 36 |
+
exclude = ["apps*", "tests*"]
|
requirements.txt
CHANGED
|
@@ -2,4 +2,5 @@ openenv-core>=0.2.1
|
|
| 2 |
fastapi
|
| 3 |
uvicorn
|
| 4 |
pydantic
|
| 5 |
-
requests
|
|
|
|
|
|
| 2 |
fastapi
|
| 3 |
uvicorn
|
| 4 |
pydantic
|
| 5 |
+
requests
|
| 6 |
+
openai
|
server/app.py
CHANGED
|
@@ -3,21 +3,17 @@ from openenv.core.env_server import create_fastapi_app
|
|
| 3 |
from src.environment import AdPolicyEnvironment
|
| 4 |
from src.models import AdAction, AdObservation
|
| 5 |
|
| 6 |
-
# 1. Create the App
|
| 7 |
-
# NOTICE: We pass the CLASS NAME (AdPolicyEnvironment), not 'env' or 'AdPolicyEnvironment()'
|
| 8 |
app = create_fastapi_app(
|
| 9 |
-
AdPolicyEnvironment,
|
| 10 |
-
AdAction,
|
| 11 |
-
AdObservation
|
| 12 |
)
|
| 13 |
|
| 14 |
-
if __name__ == "__main__":
|
| 15 |
-
print("🚀 Starting Meta Ad-Policy Sandbox on http://localhost:8000")
|
| 16 |
-
uvicorn.run(app, host="0.0.0.0", port=8000)
|
| 17 |
-
|
| 18 |
|
| 19 |
def main():
|
|
|
|
| 20 |
uvicorn.run("server.app:app", host="0.0.0.0", port=8000)
|
| 21 |
|
|
|
|
| 22 |
if __name__ == "__main__":
|
| 23 |
main()
|
|
|
|
| 3 |
from src.environment import AdPolicyEnvironment
|
| 4 |
from src.models import AdAction, AdObservation
|
| 5 |
|
|
|
|
|
|
|
| 6 |
app = create_fastapi_app(
|
| 7 |
+
AdPolicyEnvironment,
|
| 8 |
+
AdAction,
|
| 9 |
+
AdObservation,
|
| 10 |
)
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
def main():
|
| 14 |
+
print("Starting Meta Ad-Policy Sandbox on http://localhost:8000")
|
| 15 |
uvicorn.run("server.app:app", host="0.0.0.0", port=8000)
|
| 16 |
|
| 17 |
+
|
| 18 |
if __name__ == "__main__":
|
| 19 |
main()
|