Dolphin-Syndrom committed on
Commit
795ec69
·
1 Parent(s): dc07fcf

feat: Upgrade UI to Production Analytics Dashboard

Browse files
Files changed (1) hide show
  1. server/app.py +111 -38
server/app.py CHANGED
@@ -11,6 +11,7 @@ from collections.abc import Callable
11
  from pydantic import BaseModel, Field
12
 
13
  import gradio as gr
 
14
  try:
15
  from openenv.core.env_server.http_server import create_fastapi_app
16
  except Exception as e: # pragma: no cover
@@ -137,21 +138,53 @@ def update_task_view(task_id: str):
137
  return desc_md, task.code
138
 
139
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  def run_agent_simulation(task_id: str):
141
  task = TASKS[task_id]
142
  issues = detect_issues_rule_based(task)
143
  comment = build_rule_comment(issues)
144
  score = grade_review(issues, comment, task)
145
 
146
- score_md = f"### 🤖 Agent simulated successfully\n\n**Calculated Score:** `{score:.3f}` \n**Issues Found:** {', '.join(issues) if issues else 'None'}"
147
- return issues, comment, score, score_md
148
 
149
 
150
  def manual_submit(task_id: str, issues: list[str], comment: str):
151
  task = TASKS[task_id]
152
  score = grade_review(issues, comment, task)
153
- score_md = f"### 📝 Manual review parsed\n\n**Calculated Score:** `{score:.3f}`"
154
- return score, score_md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  hf_theme = gr.themes.Monochrome(
157
  font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
@@ -160,47 +193,87 @@ hf_theme = gr.themes.Monochrome(
160
  text_size=gr.themes.sizes.text_md,
161
  )
162
 
163
- with gr.Blocks(theme=hf_theme, title="Code Review Environment") as custom_ui:
164
- gr.Markdown("# 🛡️ Code Review Agent Simulator", elem_id="header")
165
- gr.Markdown("Evaluate LLM agent performance on deterministic code review tasks with immediate rule-based grading.")
166
 
167
- with gr.Row():
168
- with gr.Column(scale=5):
169
- default_task_id = list(TASKS.keys())[0]
170
- t = TASKS[default_task_id]
171
 
172
- task_selector = gr.Dropdown(label="Select Task Matrix", choices=list(TASKS.keys()), value=default_task_id)
173
- task_desc = gr.Markdown(value=f"**File:** `{t.file_name}` | **Difficulty:** `{t.difficulty}`\n\n{t.description}")
174
- task_code = gr.Code(language="python", value=t.code, interactive=False, label="Environment File")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
- task_selector.change(
177
- fn=update_task_view,
178
- inputs=task_selector,
179
- outputs=[task_desc, task_code]
180
- )
181
-
182
- with gr.Column(scale=4):
183
- gr.Markdown("### Agent Output Sandbox")
184
- agent_issues = gr.CheckboxGroup(label="Taxonomy Tags Outputted by Agent", choices=list(DETECTION_RULES.keys()))
185
- agent_comment = gr.Textbox(label="Agent Review Comment", lines=4, placeholder="The agent's freeform text response goes here...")
186
 
187
  with gr.Row():
188
- manual_btn = gr.Button("Evaluate Manual Input", variant="secondary")
189
- baseline_btn = gr.Button("Simulate Baseline Agent", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
- output_score_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, label="Task Grader Score", interactive=False)
192
- output_markdown = gr.Markdown(value="_Waiting for action..._")
193
-
194
- manual_btn.click(
195
- fn=manual_submit,
196
- inputs=[task_selector, agent_issues, agent_comment],
197
- outputs=[output_score_slider, output_markdown]
198
- )
199
 
200
- baseline_btn.click(
201
- fn=run_agent_simulation,
202
- inputs=[task_selector],
203
- outputs=[agent_issues, agent_comment, output_score_slider, output_markdown]
 
 
204
  )
205
 
206
  app = gr.mount_gradio_app(app, custom_ui, path="/")
 
11
  from pydantic import BaseModel, Field
12
 
13
  import gradio as gr
14
+ import pandas as pd
15
  try:
16
  from openenv.core.env_server.http_server import create_fastapi_app
17
  except Exception as e: # pragma: no cover
 
138
  return desc_md, task.code
139
 
140
 
def build_observation_dict(score: float, issues_found: list[str]) -> dict:
    """Build the payload displayed in the "Environment Feedback" JSON panel.

    The structure imitates an OpenEnv observation: a ``status`` flag plus a
    ``data`` envelope holding the grading result.

    Args:
        score: Grader score; rounded to three decimals for display.
        issues_found: Issue tags that were submitted for grading. ``None`` is
            treated the same as an empty list.

    Returns:
        A plain ``dict`` with the keys ``"status"`` and ``"data"``.
    """
    # NOTE(review): despite the key name "true_issues_resolved", this is the
    # count of *all* submitted tags, not only those matching planted issues.
    submitted_count = len(issues_found) if issues_found else 0
    return {
        "status": "success",
        "data": {
            "evaluation_score": round(score, 3),
            "true_issues_resolved": submitted_count,
            "message": "Grading simulation completed.",
        },
    }
151
+
152
+
def run_agent_simulation(task_id: str):
    """Run the rule-based baseline on one task and grade its own output.

    Used as the click handler of the "Simulate Baseline System" button; the
    three returned values fill the issue checkbox group, the comment textbox
    and the JSON feedback panel, in that order.

    Args:
        task_id: Key into ``TASKS`` identifying the task to evaluate.

    Returns:
        A 3-tuple ``(issues, comment, observation)`` where ``observation`` is
        the dict produced by ``build_observation_dict``.

    Raises:
        KeyError: If ``task_id`` is not a key of ``TASKS``.
    """
    task = TASKS[task_id]
    issues = detect_issues_rule_based(task)
    comment = build_rule_comment(issues)
    score = grade_review(issues, comment, task)

    return issues, comment, build_observation_dict(score, issues)
 
160
 
161
 
def manual_submit(task_id: str, issues: list[str], comment: str):
    """Grade a manually entered review for one task.

    Used as the click handler of the "Evaluate Manual Input" button; the
    returned dict is rendered in the JSON feedback panel.

    Args:
        task_id: Key into ``TASKS`` identifying the task being reviewed.
        issues: Issue tags selected in the checkbox group. ``None`` is
            treated as "nothing selected".
        comment: Free-text review comment.

    Returns:
        The observation dict produced by ``build_observation_dict``.

    Raises:
        KeyError: If ``task_id`` is not a key of ``TASKS``.
    """
    task = TASKS[task_id]
    # Normalise a missing selection so the grader and the observation builder
    # always receive a list.
    selected = issues or []
    score = grade_review(selected, comment, task)
    return build_observation_dict(score, selected)
166
+
167
+
def get_baseline_performance_df() -> pd.DataFrame:
    """Score the rule-based baseline on every task and tabulate the results.

    Returns:
        A DataFrame with one row per entry in ``TASKS`` and the columns
        "Task Matrix", "Difficulty" and "Baseline Score (0-1.0)". The columns
        are present even when ``TASKS`` is empty, so the bar plot's x / y /
        color lookups always resolve.
    """
    columns = ["Task Matrix", "Difficulty", "Baseline Score (0-1.0)"]
    rows = []
    for t_id, task in TASKS.items():
        issues = detect_issues_rule_based(task)
        score = grade_review(issues, build_rule_comment(issues), task)
        rows.append(
            {
                "Task Matrix": t_id,
                "Difficulty": task.difficulty,
                "Baseline Score (0-1.0)": score,
            }
        )
    # Passing the columns explicitly keeps the schema stable for an empty list.
    return pd.DataFrame(rows, columns=columns)
175
+
176
+
def get_ground_truth_df() -> pd.DataFrame:
    """Tabulate the configured ground truth for every task.

    Returns:
        A DataFrame with one row per entry in ``TASKS`` and the columns
        "Task Matrix", "Difficulty", "Target File" and "Ground Truth Issues"
        (the task's planted issues joined with ", "). The columns are present
        even when ``TASKS`` is empty.
    """
    columns = ["Task Matrix", "Difficulty", "Target File", "Ground Truth Issues"]
    rows = [
        {
            "Task Matrix": t_id,
            "Difficulty": task.difficulty,
            "Target File": task.file_name,
            "Ground Truth Issues": ", ".join(task.planted_issues),
        }
        for t_id, task in TASKS.items()
    ]
    # Passing the columns explicitly keeps the schema stable for an empty list.
    return pd.DataFrame(rows, columns=columns)
187
+
188
 
189
  hf_theme = gr.themes.Monochrome(
190
  font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
 
193
  text_size=gr.themes.sizes.text_md,
194
  )
195
 
# Two-tab Gradio dashboard: an interactive grading playground and a read-only
# analytics view. Event handlers are registered inside the Blocks context.
# NOTE(review): nesting below is reconstructed from statement order — confirm
# against the committed file.
with gr.Blocks(theme=hf_theme, title="Code Review Environment Dashboard") as custom_ui:
    gr.Markdown("# 🛡️ Code Review Environment", elem_id="header")

    with gr.Tabs():
        # TAB 1: INTERACTIVE PLAYGROUND
        with gr.TabItem("🎮 Agent Evaluation Playground"):
            gr.Markdown("Directly evaluate agents and deterministic code-review tasks through the environment proxy window.")

            with gr.Row():
                # Left column: task picker plus the read-only task source.
                with gr.Column(scale=5):
                    # The first configured task is shown on initial load.
                    default_task_id = list(TASKS)[0]
                    t = TASKS[default_task_id]

                    task_selector = gr.Dropdown(label="Select Task Matrix", choices=list(TASKS), value=default_task_id)
                    task_desc = gr.Markdown(value=f"**File:** `{t.file_name}` | **Difficulty:** `{t.difficulty}`\n\n{t.description}")
                    task_code = gr.Code(language="python", value=t.code, interactive=False, label="Environment File")

                    # Refresh description and code whenever another task is picked.
                    task_selector.change(
                        fn=update_task_view,
                        inputs=task_selector,
                        outputs=[task_desc, task_code],
                    )

                # Right column: review inputs, action buttons and grader feedback.
                with gr.Column(scale=4):
                    gr.Markdown("### Agent Output Sandbox")
                    agent_issues = gr.CheckboxGroup(label="Taxonomy Tags Outputted by Agent", choices=list(DETECTION_RULES))
                    agent_comment = gr.Textbox(label="Agent Review Comment", lines=3, placeholder="The agent's freeform text response goes here...")

                    with gr.Row():
                        manual_btn = gr.Button("Evaluate Manual Input", variant="secondary")
                        baseline_btn = gr.Button("Simulate Baseline System", variant="primary")

                    gr.Markdown("### OpenEnv Observation Response")
                    output_json = gr.JSON(value={"status": "waiting", "data": {}}, label="Environment Feedback")

                    # Grade whatever is currently entered in the sandbox widgets.
                    manual_btn.click(
                        fn=manual_submit,
                        inputs=[task_selector, agent_issues, agent_comment],
                        outputs=[output_json],
                    )

                    # Run the rule-based baseline; it also overwrites the sandbox
                    # widgets with the baseline's own issues and comment.
                    baseline_btn.click(
                        fn=run_agent_simulation,
                        inputs=[task_selector],
                        outputs=[agent_issues, agent_comment, output_json],
                    )

        # TAB 2: ANALYTICS DASHBOARD
        with gr.TabItem("📊 Environment Analytics"):
            with gr.Row():
                gr.Markdown(f"### 🧪 **{len(TASKS)}** Production Tasks")
                gr.Markdown(f"### 🛡️ **{len(DETECTION_RULES)}** Taxonomy Flags")
                gr.Markdown("### ⚙️ Deterministic Grading")

            gr.Markdown("---")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 📈 Baseline Policy Evaluation")
                    gr.Markdown("This chart renders a real-time `gr.BarPlot` showing the default rule-based LLM scanner performance across testing tasks in the environment. Agents must eclipse this score to be considered frontier models.")
                    # The frame is computed once, when this layout is built.
                    bar_plot = gr.BarPlot(
                        value=get_baseline_performance_df(),
                        x="Task Matrix",
                        y="Baseline Score (0-1.0)",
                        color="Difficulty",
                        title="Scores by Agent Baseline",
                    )

                with gr.Column(scale=1):
                    gr.Markdown("### 🗃️ Ground Truth Map")
                    gr.Markdown("The underlying ground truth configuration natively driving the environment metrics.")
                    db_view = gr.DataFrame(value=get_ground_truth_df())

            gr.Markdown("---")

            gr.Markdown(
                "### ⚖️ Multi-Tier Evaluation Policy\n\n"
                "The environment utilizes a robust, deterministic multi-dimensional reward function mimicking senior engineering review standards:\n\n"
                "1. **Recall Reward (True Positives)**: Agents gain heavy fractional rewards specifically for correctly identifying underlying seeded vulnerabilities from the core taxonomy.\n"
                "2. **Precision Penalty (False Positives)**: Hallucinations or overly aggressive linting (identifying bugs that aren't planted) will significantly drag down the score, enforcing conciseness.\n"
                "3. **Articulation Bonus**: Agents submitting free-text comments highlighting root causes successfully grab a minor articulation bonus representing communication skills."
            )

# Mount the dashboard at the root path of the existing `app`.
app = gr.mount_gradio_app(app, custom_ui, path="/")