Spaces:
Sleeping
Sleeping
Commit ·
795ec69
1
Parent(s): dc07fcf
feat: Upgrade UI to Production Analytics Dashboard
Browse files- server/app.py +111 -38
server/app.py
CHANGED
|
@@ -11,6 +11,7 @@ from collections.abc import Callable
|
|
| 11 |
from pydantic import BaseModel, Field
|
| 12 |
|
| 13 |
import gradio as gr
|
|
|
|
| 14 |
try:
|
| 15 |
from openenv.core.env_server.http_server import create_fastapi_app
|
| 16 |
except Exception as e: # pragma: no cover
|
|
@@ -137,21 +138,53 @@ def update_task_view(task_id: str):
|
|
| 137 |
return desc_md, task.code
|
| 138 |
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
def run_agent_simulation(task_id: str):
|
| 141 |
task = TASKS[task_id]
|
| 142 |
issues = detect_issues_rule_based(task)
|
| 143 |
comment = build_rule_comment(issues)
|
| 144 |
score = grade_review(issues, comment, task)
|
| 145 |
|
| 146 |
-
|
| 147 |
-
return issues, comment, score, score_md
|
| 148 |
|
| 149 |
|
| 150 |
def manual_submit(task_id: str, issues: list[str], comment: str):
|
| 151 |
task = TASKS[task_id]
|
| 152 |
score = grade_review(issues, comment, task)
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
hf_theme = gr.themes.Monochrome(
|
| 157 |
font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
|
|
@@ -160,47 +193,87 @@ hf_theme = gr.themes.Monochrome(
|
|
| 160 |
text_size=gr.themes.sizes.text_md,
|
| 161 |
)
|
| 162 |
|
| 163 |
-
with gr.Blocks(theme=hf_theme, title="Code Review Environment") as custom_ui:
|
| 164 |
-
gr.Markdown("# 🛡️ Code Review
|
| 165 |
-
gr.Markdown("Evaluate LLM agent performance on deterministic code review tasks with immediate rule-based grading.")
|
| 166 |
|
| 167 |
-
with gr.
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
-
|
| 177 |
-
fn=update_task_view,
|
| 178 |
-
inputs=task_selector,
|
| 179 |
-
outputs=[task_desc, task_code]
|
| 180 |
-
)
|
| 181 |
-
|
| 182 |
-
with gr.Column(scale=4):
|
| 183 |
-
gr.Markdown("### Agent Output Sandbox")
|
| 184 |
-
agent_issues = gr.CheckboxGroup(label="Taxonomy Tags Outputted by Agent", choices=list(DETECTION_RULES.keys()))
|
| 185 |
-
agent_comment = gr.Textbox(label="Agent Review Comment", lines=4, placeholder="The agent's freeform text response goes here...")
|
| 186 |
|
| 187 |
with gr.Row():
|
| 188 |
-
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
-
|
| 192 |
-
output_markdown = gr.Markdown(value="_Waiting for action..._")
|
| 193 |
-
|
| 194 |
-
manual_btn.click(
|
| 195 |
-
fn=manual_submit,
|
| 196 |
-
inputs=[task_selector, agent_issues, agent_comment],
|
| 197 |
-
outputs=[output_score_slider, output_markdown]
|
| 198 |
-
)
|
| 199 |
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
| 204 |
)
|
| 205 |
|
| 206 |
app = gr.mount_gradio_app(app, custom_ui, path="/")
|
|
|
|
| 11 |
from pydantic import BaseModel, Field
|
| 12 |
|
| 13 |
import gradio as gr
|
| 14 |
+
import pandas as pd
|
| 15 |
try:
|
| 16 |
from openenv.core.env_server.http_server import create_fastapi_app
|
| 17 |
except Exception as e: # pragma: no cover
|
|
|
|
| 138 |
return desc_md, task.code
|
| 139 |
|
| 140 |
|
| 141 |
+
def build_observation_dict(score: float, issues_found: list[str]) -> dict:
    """Build the observation payload displayed after a grading run.

    The structure mimics an OpenEnv agent observation: a top-level
    ``"status"`` plus a ``"data"`` mapping.

    Args:
        score: Grade for the review; rounded to three decimals in the payload.
        issues_found: Issue tags that were submitted for grading. ``None`` is
            treated the same as an empty list.

    Returns:
        A dict with ``"status"`` fixed to ``"success"`` and a ``"data"``
        mapping holding the rounded score, the number of submitted tags and
        a fixed completion message.
    """
    # NOTE(review): despite the key name, "true_issues_resolved" is only the
    # count of submitted tags; nothing in this function checks them against
    # a task's ground truth.
    issue_count = len(issues_found) if issues_found is not None else 0
    return {
        "status": "success",
        "data": {
            "evaluation_score": round(score, 3),
            "true_issues_resolved": issue_count,
            "message": "Grading simulation completed.",
        },
    }
|
| 151 |
+
|
| 152 |
+
|
| 153 |
def run_agent_simulation(task_id: str):
    """Run the rule-based baseline on one task and grade what it produced.

    The task is looked up in ``TASKS`` by ``task_id``; issues are detected
    with ``detect_issues_rule_based``, a comment is built from them with
    ``build_rule_comment``, and the pair is scored by ``grade_review``.

    Returns:
        A 3-tuple ``(issues, comment, observation)`` where ``observation`` is
        the dict produced by ``build_observation_dict``.
    """
    selected = TASKS[task_id]
    detected = detect_issues_rule_based(selected)
    rule_comment = build_rule_comment(detected)
    return (
        detected,
        rule_comment,
        build_observation_dict(
            grade_review(detected, rule_comment, selected),
            detected,
        ),
    )
|
|
|
|
| 160 |
|
| 161 |
|
| 162 |
def manual_submit(task_id: str, issues: list[str], comment: str):
    """Grade a manually entered review against the selected task.

    Args:
        task_id: Key of the task in ``TASKS``.
        issues: Issue tags submitted for grading.
        comment: Free-text review comment submitted for grading.

    Returns:
        The observation dict produced by ``build_observation_dict`` for the
        score returned by ``grade_review``.
    """
    graded = grade_review(issues, comment, TASKS[task_id])
    return build_observation_dict(graded, issues)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def get_baseline_performance_df():
    """Score the rule-based baseline on every task and tabulate the results.

    Returns:
        A ``pandas.DataFrame`` with one row per entry in ``TASKS`` and the
        columns ``"Task Matrix"``, ``"Difficulty"`` and
        ``"Baseline Score (0-1.0)"``.
    """

    def _rows():
        # One record per task, scored the same way the baseline button does:
        # detect issues, build the rule comment, then grade the pair.
        for task_key, spec in TASKS.items():
            detected = detect_issues_rule_based(spec)
            baseline = grade_review(detected, build_rule_comment(detected), spec)
            yield {
                "Task Matrix": task_key,
                "Difficulty": spec.difficulty,
                "Baseline Score (0-1.0)": baseline,
            }

    return pd.DataFrame(list(_rows()))
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def get_ground_truth_df():
    """Tabulate each task's metadata and its planted issues.

    Returns:
        A ``pandas.DataFrame`` with one row per entry in ``TASKS`` and the
        columns ``"Task Matrix"``, ``"Difficulty"``, ``"Target File"`` and
        ``"Ground Truth Issues"`` (the planted issues joined with ``", "``).
    """
    records = [
        {
            "Task Matrix": task_key,
            "Difficulty": spec.difficulty,
            "Target File": spec.file_name,
            "Ground Truth Issues": ", ".join(spec.planted_issues),
        }
        for task_key, spec in TASKS.items()
    ]
    return pd.DataFrame(records)
|
| 187 |
+
|
| 188 |
|
| 189 |
hf_theme = gr.themes.Monochrome(
|
| 190 |
font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
|
|
|
|
| 193 |
text_size=gr.themes.sizes.text_md,
|
| 194 |
)
|
| 195 |
|
| 196 |
+
with gr.Blocks(theme=hf_theme, title="Code Review Environment Dashboard") as custom_ui:
|
| 197 |
+
gr.Markdown("# 🛡️ Code Review Environment", elem_id="header")
|
|
|
|
| 198 |
|
| 199 |
+
with gr.Tabs():
|
| 200 |
+
# TAB 1: INTERACTIVE PLAYGROUND
|
| 201 |
+
with gr.TabItem("🎮 Agent Evaluation Playground"):
|
| 202 |
+
gr.Markdown("Directly evaluate agents and deterministic code-review tasks through the environment proxy window.")
|
| 203 |
|
| 204 |
+
with gr.Row():
|
| 205 |
+
with gr.Column(scale=5):
|
| 206 |
+
default_task_id = list(TASKS.keys())[0]
|
| 207 |
+
t = TASKS[default_task_id]
|
| 208 |
+
|
| 209 |
+
task_selector = gr.Dropdown(label="Select Task Matrix", choices=list(TASKS.keys()), value=default_task_id)
|
| 210 |
+
task_desc = gr.Markdown(value=f"**File:** `{t.file_name}` | **Difficulty:** `{t.difficulty}`\n\n{t.description}")
|
| 211 |
+
task_code = gr.Code(language="python", value=t.code, interactive=False, label="Environment File")
|
| 212 |
+
|
| 213 |
+
task_selector.change(
|
| 214 |
+
fn=update_task_view,
|
| 215 |
+
inputs=task_selector,
|
| 216 |
+
outputs=[task_desc, task_code]
|
| 217 |
+
)
|
| 218 |
+
|
| 219 |
+
with gr.Column(scale=4):
|
| 220 |
+
gr.Markdown("### Agent Output Sandbox")
|
| 221 |
+
agent_issues = gr.CheckboxGroup(label="Taxonomy Tags Outputted by Agent", choices=list(DETECTION_RULES.keys()))
|
| 222 |
+
agent_comment = gr.Textbox(label="Agent Review Comment", lines=3, placeholder="The agent's freeform text response goes here...")
|
| 223 |
+
|
| 224 |
+
with gr.Row():
|
| 225 |
+
manual_btn = gr.Button("Evaluate Manual Input", variant="secondary")
|
| 226 |
+
baseline_btn = gr.Button("Simulate Baseline System", variant="primary")
|
| 227 |
+
|
| 228 |
+
gr.Markdown("### OpenEnv Observation Response")
|
| 229 |
+
output_json = gr.JSON(value={"status": "waiting", "data": {}}, label="Environment Feedback")
|
| 230 |
+
|
| 231 |
+
manual_btn.click(
|
| 232 |
+
fn=manual_submit,
|
| 233 |
+
inputs=[task_selector, agent_issues, agent_comment],
|
| 234 |
+
outputs=[output_json]
|
| 235 |
+
)
|
| 236 |
+
|
| 237 |
+
baseline_btn.click(
|
| 238 |
+
fn=run_agent_simulation,
|
| 239 |
+
inputs=[task_selector],
|
| 240 |
+
outputs=[agent_issues, agent_comment, output_json]
|
| 241 |
+
)
|
| 242 |
+
|
| 243 |
+
# TAB 2: ANALYTICS DASHBOARD
|
| 244 |
+
with gr.TabItem("📊 Environment Analytics"):
|
| 245 |
+
with gr.Row():
|
| 246 |
+
gr.Markdown(f"### 🧪 **{len(TASKS)}** Production Tasks")
|
| 247 |
+
gr.Markdown(f"### 🛡️ **{len(DETECTION_RULES)}** Taxonomy Flags")
|
| 248 |
+
gr.Markdown("### ⚙️ Deterministic Grading")  # plain literal: the f-prefix had no placeholders
|
| 249 |
|
| 250 |
+
gr.Markdown("---")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
with gr.Row():
|
| 253 |
+
with gr.Column(scale=1):
|
| 254 |
+
gr.Markdown("### 📈 Baseline Policy Evaluation")
|
| 255 |
+
gr.Markdown("This chart renders a real-time `gr.BarPlot` showing the default rule-based LLM scanner performance across testing tasks in the environment. Agents must eclipse this score to be considered frontier models.")
|
| 256 |
+
bar_plot = gr.BarPlot(
|
| 257 |
+
value=get_baseline_performance_df(),
|
| 258 |
+
x="Task Matrix",
|
| 259 |
+
y="Baseline Score (0-1.0)",
|
| 260 |
+
color="Difficulty",
|
| 261 |
+
title="Scores by Agent Baseline"
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
with gr.Column(scale=1):
|
| 265 |
+
gr.Markdown("### 🗃️ Ground Truth Map")
|
| 266 |
+
gr.Markdown("The underlying ground truth configuration natively driving the environment metrics.")
|
| 267 |
+
db_view = gr.DataFrame(value=get_ground_truth_df())
|
| 268 |
|
| 269 |
+
gr.Markdown("---")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
|
| 271 |
+
gr.Markdown(
|
| 272 |
+
"### ⚖️ Multi-Tier Evaluation Policy\n\n"
|
| 273 |
+
"The environment utilizes a robust, deterministic multi-dimensional reward function mimicking senior engineering review standards:\n\n"
|
| 274 |
+
"1. **Recall Reward (True Positives)**: Agents gain heavy fractional rewards specifically for correctly identifying underlying seeded vulnerabilities from the core taxonomy.\n"
|
| 275 |
+
"2. **Precision Penalty (False Positives)**: Hallucinations or overly aggressive linting (identifying bugs that aren't planted) will significantly drag down the score, enforcing conciseness.\n"
|
| 276 |
+
"3. **Articulation Bonus**: Agents submitting free-text comments highlighting root causes successfully grab a minor articulation bonus representing communication skills."
|
| 277 |
)
|
| 278 |
|
| 279 |
app = gr.mount_gradio_app(app, custom_ui, path="/")
|