ps2181 Claude Sonnet 4.6 committed on
Commit
8afb151
·
1 Parent(s): 4390d4f

Add Gradio web UI mounted at /web for interactive agent testing

Browse files

Provides a browser-based interface to select a task, reset an episode,
view invoice/reference data, run the configured LLM agent or submit
custom JSON, and inspect grader feedback and per-field reward breakdown.
Gradio import is wrapped in try/except so the server starts even if
gradio is absent.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (4) hide show
  1. pyproject.toml +2 -0
  2. requirements.txt +3 -1
  3. server/app.py +10 -0
  4. server/web_ui.py +330 -0
pyproject.toml CHANGED
@@ -15,6 +15,8 @@ dependencies = [
15
  "openai>=1.0.0",
16
  "python-dotenv>=0.13",
17
  "openenv-core>=0.2.0",
 
 
18
  ]
19
 
20
  [project.scripts]
 
15
  "openai>=1.0.0",
16
  "python-dotenv>=0.13",
17
  "openenv-core>=0.2.0",
18
+ "gradio>=4.0.0",
19
+ "python-dotenv>=0.13",
20
  ]
21
 
22
  [project.scripts]
requirements.txt CHANGED
@@ -3,4 +3,6 @@ uvicorn[standard]>=0.24.0
3
  pydantic>=2.5.0
4
  httpx>=0.25.0
5
  openai>=1.0.0
6
- openenv-core>=0.2.0
 
 
 
3
  pydantic>=2.5.0
4
  httpx>=0.25.0
5
  openai>=1.0.0
6
+ openenv-core>=0.2.0
7
+ gradio>=4.0.0
8
+ python-dotenv>=0.13
server/app.py CHANGED
@@ -28,6 +28,16 @@ app = FastAPI(
28
  version="1.0.0",
29
  )
30
 
 
 
 
 
 
 
 
 
 
 
31
  # ---------------------------------------------------------------------------
32
  # Session registry — one InvoiceEnvironment per episode_id
33
  # Thread-safe, capped at MAX_SESSIONS to bound memory on vcpu=2 / 8gb
 
28
  version="1.0.0",
29
  )
30
 
31
+ # Mount Gradio web UI at /web
32
+ try:
33
+ import gradio as gr
34
+ from server.web_ui import build_ui
35
+ _gradio_app = build_ui()
36
+ app = gr.mount_gradio_app(app, _gradio_app, path="/web")
37
+ except Exception as _e:
38
+ import warnings
39
+ warnings.warn(f"Gradio UI not loaded: {_e}")
40
+
41
  # ---------------------------------------------------------------------------
42
  # Session registry — one InvoiceEnvironment per episode_id
43
  # Thread-safe, capped at MAX_SESSIONS to bound memory on vcpu=2 / 8gb
server/web_ui.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio Web UI for Invoice Processing Pipeline
3
+ =============================================
4
+ Interactive tester — pick a task, see the invoice, run the LLM agent
5
+ or paste your own JSON, then inspect the grader feedback & score.
6
+
7
+ Mounted at /web on the main FastAPI app.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import os
14
+ import sys
15
+ from typing import Any, Dict, Tuple
16
+
17
+ import gradio as gr
18
+ import httpx
19
+
20
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Helpers — thin HTTP client talking to the same server
24
+ # ---------------------------------------------------------------------------
25
+
26
+ _SERVER_URL = "http://localhost:7860"
27
+
28
+
29
def _post(path: str, body: Dict[str, Any]) -> Dict[str, Any]:
    """POST ``body`` as JSON to ``path`` on the server and return the decoded reply.

    Failures are never raised: a connection problem, a non-success HTTP
    status or an undecodable response body all come back as
    ``{"error": "<message>"}`` so callers can branch on the ``"error"`` key.
    """
    try:
        response = httpx.post(
            f"{_SERVER_URL}{path}",
            json=body,
            timeout=30,
        )
        response.raise_for_status()
        payload = response.json()
    except Exception as exc:
        return {"error": str(exc)}
    return payload
36
+
37
+
38
def _get(path: str) -> Dict[str, Any]:
    """GET ``path`` from the server and return the decoded JSON reply.

    Failures are never raised: any exception during the request, the status
    check or JSON decoding is converted into ``{"error": "<message>"}``.
    """
    try:
        response = httpx.get(f"{_SERVER_URL}{path}", timeout=10)
        response.raise_for_status()
        payload = response.json()
    except Exception as exc:
        return {"error": str(exc)}
    return payload
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # LLM agent helper
49
+ # ---------------------------------------------------------------------------
50
+
51
+ def _call_llm(task_id: str, obs: Dict[str, Any], step: int) -> Tuple[str, str]:
52
+ """Call the configured LLM and return (json_str, status_msg)."""
53
+ try:
54
+ from openai import OpenAI
55
+ from inference import SYSTEM_PROMPTS, build_user_prompt, MODEL_NAME, API_BASE_URL, API_KEY
56
+
57
+ if not API_KEY:
58
+ return "{}", "⚠️ No API key found — set HF_TOKEN or API_KEY env var."
59
+
60
+ client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
61
+ user_prompt = build_user_prompt(task_id, obs, step)
62
+
63
+ completion = client.chat.completions.create(
64
+ model=MODEL_NAME,
65
+ messages=[
66
+ {"role": "system", "content": SYSTEM_PROMPTS[task_id]},
67
+ {"role": "user", "content": user_prompt},
68
+ ],
69
+ temperature=0.3,
70
+ max_tokens=2048,
71
+ )
72
+ raw = (completion.choices[0].message.content or "").strip()
73
+ if raw.startswith("```"):
74
+ raw = raw.split("\n", 1)[-1] if "\n" in raw else raw[3:]
75
+ if raw.endswith("```"):
76
+ raw = raw[:-3]
77
+ raw = raw.strip()
78
+
79
+ parsed = json.loads(raw)
80
+ return json.dumps(parsed, indent=2), f"✅ LLM ({MODEL_NAME}) responded. Review then Submit."
81
+ except json.JSONDecodeError as e:
82
+ return "{}", f"❌ LLM returned invalid JSON: {e}"
83
+ except Exception as e:
84
+ return "{}", f"❌ LLM error: {e}"
85
+
86
+
87
+ # ---------------------------------------------------------------------------
88
+ # Build Gradio app
89
+ # ---------------------------------------------------------------------------
90
+
91
# One-line summary per task id. The keys double as the choices of the task
# dropdown, so their order here is the order shown in the UI.
TASK_DESCRIPTIONS: Dict[str, str] = {
    "easy": (
        "Extract structured fields from a single clean invoice."
    ),
    "medium": (
        "Clean & normalise a batch of messy invoices (typos, date formats, currencies)."
    ),
    "hard": (
        "Clean invoices AND reconcile against purchase orders. Flag discrepancies."
    ),
    "expert": (
        "Audit invoices for fraud: phantom vendors, price gouging, duplicates, math errors."
    ),
    "adversarial": (
        "Extract from an invoice with OCR corruption, fake SUBTOTAL, and FX noise lines."
    ),
    "negotiate": (
        "Ask clarification questions, then submit full extraction. Bonus for ≤2 questions."
    ),
    "supply_chain": (
        "Detect anomalies in delivery records: shortfalls, price spikes, substitutions, phantoms."
    ),
}

# Text shown in the JSON editor before anything has been generated or pasted.
# It is deliberately not valid JSON, so submitting it is rejected by the parser.
PLACEHOLDER_JSON = "// Reset an episode first, then paste or generate JSON here."
102
+
103
+
104
def build_ui() -> gr.Blocks:
    """Assemble the Gradio Blocks app used to exercise the invoice environment.

    The page lets a user pick a task, start an episode through ``/reset``,
    read the invoice and reference text, produce JSON either with the
    configured LLM or by hand, submit it through ``/step`` and inspect the
    grader feedback, reward breakdown and per-step history.

    Returns:
        The constructed (not launched) ``gr.Blocks`` instance.
    """
    default_task = "easy"

    # ---- State per Gradio session ----------------------------------------
    # Stores: episode_id, the task the episode was reset with, the last
    # observation dict, the number of submitted steps and the history lines.
    # gr.State deep-copies its initial value for every session (see the
    # Gradio State documentation), so this dict is only a template.
    init_state = {
        "episode_id": None,
        "task_id": None,
        "obs": None,
        "step": 0,
        "history": [],
    }

    # ---- Callbacks -------------------------------------------------------

    def _reset_failed(state: dict, message: str):
        """Outputs for a failed reset: blank every pane, lock the action buttons."""
        return (
            state,
            gr.update(value=f"❌ Error: {message}"),
            gr.update(value=""),                # task description
            gr.update(value=""),                # invoice
            gr.update(value=""),                # reference data
            gr.update(value=PLACEHOLDER_JSON),  # JSON editor
            gr.update(value=""),                # feedback
            gr.update(value=""),                # history
            gr.update(value=""),                # reward breakdown
            gr.update(value=""),                # LLM status
            gr.update(interactive=False),       # llm btn
            gr.update(interactive=False),       # submit btn
        )

    def do_reset(task_id: str, state: dict):
        """Start a new episode for ``task_id`` and repopulate every pane."""
        data = _post("/reset", {"task_id": task_id})
        if "error" in data:
            return _reset_failed(state, data["error"])

        # Pull out everything we need up front so a malformed response is
        # reported in the status box instead of raising inside the callback.
        try:
            obs = data["observation"]
            ep = str(data["info"]["episode_id"])
            max_attempts = obs["max_attempts"]
            description = obs["task_description"]
            raw_text = obs["raw_text"]
            ref = obs.get("reference_data") or ""
        except (KeyError, TypeError, AttributeError) as exc:
            return _reset_failed(state, f"malformed /reset response: {exc!r}")

        # Remember which task this episode belongs to: the dropdown can be
        # changed later without resetting, and the LLM prompt must still
        # match the loaded observation.
        new_state = {
            "episode_id": ep,
            "task_id": task_id,
            "obs": obs,
            "step": 0,
            "history": [],
        }
        status = (
            f"✅ Episode started | task={task_id} | id={ep[:12]}…\n"
            f"Max attempts: {max_attempts}"
        )

        return (
            new_state,
            gr.update(value=status),
            gr.update(value=description),
            gr.update(value=raw_text),
            gr.update(value=ref),
            gr.update(value=PLACEHOLDER_JSON),
            gr.update(value=""),           # feedback
            gr.update(value=""),           # history
            gr.update(value=""),           # reward breakdown (drop previous episode's)
            gr.update(value=""),           # LLM status (drop previous episode's)
            gr.update(interactive=True),   # llm btn
            gr.update(interactive=True),   # submit btn
        )

    def do_llm(task_id: str, state: dict):
        """Ask the LLM for an extraction of the current observation."""
        if not state.get("obs"):
            return PLACEHOLDER_JSON, "⚠️ Reset an episode first."
        # Prefer the task recorded at reset time over the dropdown's current
        # value, which may no longer match the loaded episode.
        episode_task = state.get("task_id") or task_id
        return _call_llm(episode_task, state["obs"], state["step"] + 1)

    def _submit_failed(state: dict, message: str):
        """Outputs for a rejected submit: show ``message``, leave result panes as they are."""
        # gr.update() with no arguments leaves the target component untouched,
        # so earlier feedback/history stay visible after e.g. a JSON typo.
        return state, message, gr.update(), gr.update(), gr.update()

    def do_submit(json_str: str, state: dict):
        """Send the editor's JSON to ``/step`` and display the grader's response."""
        if not state.get("episode_id"):
            return _submit_failed(state, "⚠️ Reset an episode first.")

        try:
            # An empty editor may arrive as "" or None; both are invalid JSON.
            extracted = json.loads(json_str or "")
        except (json.JSONDecodeError, TypeError) as e:
            return _submit_failed(state, f"❌ Invalid JSON: {e}")

        data = _post("/step", {
            "extracted_data": extracted,
            "episode_id": state["episode_id"],
        })

        if "error" in data:
            return _submit_failed(state, f"❌ Error: {data['error']}")

        obs = data.get("observation")
        if not isinstance(obs, dict):
            return _submit_failed(
                state, "❌ Error: malformed /step response (no observation)."
            )

        reward = data.get("reward") or 0.0  # tolerate a missing or null reward
        done = data.get("done", False)
        step = state["step"] + 1

        # Build a fresh state instead of mutating the incoming one.
        entry = f"Step {step}: reward={reward:.3f}" + (" ✓ done" if done else "")
        history = [*state["history"], entry]
        new_state = {**state, "obs": obs, "step": step, "history": history}

        feedback = obs.get("feedback") or "No feedback yet."

        bd = obs.get("reward_breakdown")
        breakdown_str = json.dumps(bd, indent=2) if bd else ""

        status = (
            f"Step {step} / {obs.get('max_attempts', '?')} | "
            f"Reward: {reward:.3f} | "
            f"{'🏁 Done' if done else 'In progress…'}"
        )

        return new_state, status, feedback, "\n".join(history), breakdown_str

    # ---- Layout ----------------------------------------------------------

    with gr.Blocks(
        title="Invoice Processing Pipeline",
        theme=gr.themes.Soft(),
        css=".gr-prose { font-family: monospace; }",
    ) as demo:

        gr.Markdown(
            "# 🧾 Invoice Processing Pipeline\n"
            "Interactive agent tester — select a task, reset to load an invoice, "
            "then use the LLM agent or paste your own JSON and submit."
        )

        session_state = gr.State(init_state)

        # --- Controls row -------------------------------------------------
        with gr.Row():
            task_dd = gr.Dropdown(
                choices=list(TASK_DESCRIPTIONS.keys()),
                value=default_task,
                label="Task",
                scale=1,
            )
            reset_btn = gr.Button("🔄 Reset Episode", variant="primary", scale=1)
            status_box = gr.Textbox(
                label="Status",
                interactive=False,
                scale=3,
                lines=2,
            )

        # Pre-fill with the default task's summary so the box is not empty
        # before the dropdown is first changed.
        task_info = gr.Textbox(
            label="Task Description",
            interactive=False,
            lines=1,
            value=TASK_DESCRIPTIONS.get(default_task, ""),
        )

        # --- Main two-column layout ---------------------------------------
        with gr.Row():
            # Left — environment data
            with gr.Column(scale=5):
                invoice_box = gr.Textbox(
                    label="Invoice Data (raw text)",
                    interactive=False,
                    lines=16,
                    max_lines=30,
                )
                ref_box = gr.Textbox(
                    label="Reference Data (PO / vendor registry / catalog)",
                    interactive=False,
                    lines=8,
                    max_lines=16,
                )

            # Right — agent interaction
            with gr.Column(scale=5):
                json_box = gr.Code(
                    label="Extracted JSON",
                    language="json",
                    lines=16,
                    value=PLACEHOLDER_JSON,
                )
                with gr.Row():
                    llm_btn = gr.Button(
                        "🤖 Run LLM Agent",
                        variant="secondary",
                        interactive=False,
                    )
                    submit_btn = gr.Button(
                        "✅ Submit",
                        variant="primary",
                        interactive=False,
                    )
                llm_status = gr.Textbox(
                    label="LLM status",
                    interactive=False,
                    lines=1,
                )

        # --- Results row --------------------------------------------------
        with gr.Row():
            feedback_box = gr.Textbox(
                label="Grader Feedback",
                interactive=False,
                lines=5,
                scale=3,
            )
            breakdown_box = gr.Code(
                label="Reward Breakdown",
                language="json",
                lines=5,
                interactive=False,
                scale=2,
            )

        history_box = gr.Textbox(
            label="Step History",
            interactive=False,
            lines=3,
        )

        # --- Update task description on dropdown change -------------------
        task_dd.change(
            fn=lambda t: TASK_DESCRIPTIONS.get(t, ""),
            inputs=[task_dd],
            outputs=[task_info],
        )

        # --- Reset --------------------------------------------------------
        # Order must match the tuples returned by do_reset / _reset_failed.
        reset_btn.click(
            fn=do_reset,
            inputs=[task_dd, session_state],
            outputs=[
                session_state, status_box, task_info,
                invoice_box, ref_box, json_box,
                feedback_box, history_box,
                breakdown_box, llm_status,
                llm_btn, submit_btn,
            ],
        )

        # --- LLM agent ----------------------------------------------------
        llm_btn.click(
            fn=do_llm,
            inputs=[task_dd, session_state],
            outputs=[json_box, llm_status],
        )

        # --- Submit -------------------------------------------------------
        submit_btn.click(
            fn=do_submit,
            inputs=[json_box, session_state],
            outputs=[session_state, status_box, feedback_box, history_box, breakdown_box],
        )

    return demo