Dolphin-Syndrom committed on
Commit
dc07fcf
·
1 Parent(s): 58eaf0f

feat: Custom Gradio code review dashboard at root

Browse files
Files changed (1) hide show
  1. server/app.py +80 -2
server/app.py CHANGED
@@ -10,8 +10,9 @@ from collections.abc import Callable
10
 
11
  from pydantic import BaseModel, Field
12
 
 
13
  try:
14
- from openenv.core.env_server.http_server import create_app
15
  except Exception as e: # pragma: no cover
16
  raise ImportError(
17
  "openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
@@ -33,7 +34,7 @@ def _env_factory() -> CodeReviewEnvironment:
33
  return CodeReviewEnvironment()
34
 
35
 
36
- app = create_app(
37
  _env_factory,
38
  ReviewAction,
39
  ReviewObservation,
@@ -128,6 +129,83 @@ def run_baseline() -> dict:
128
  return {"baseline_scores": baseline_scores}
129
 
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  def main(host: str = "0.0.0.0", port: int = 8000):
132
  """
133
  Entry point for direct execution via uv run or python -m.
 
10
 
11
  from pydantic import BaseModel, Field
12
 
13
+ import gradio as gr
14
  try:
15
+ from openenv.core.env_server.http_server import create_fastapi_app
16
  except Exception as e: # pragma: no cover
17
  raise ImportError(
18
  "openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
 
34
  return CodeReviewEnvironment()
35
 
36
 
37
+ app = create_fastapi_app(
38
  _env_factory,
39
  ReviewAction,
40
  ReviewObservation,
 
129
  return {"baseline_scores": baseline_scores}
130
 
131
 
132
+ # --- CUSTOM GRADIO UI FOR HUGGING FACE SPACE ---
133
+
134
def update_task_view(task_id: str):
    """Return the description Markdown and source code for one task.

    Args:
        task_id: Key into the module-level ``TASKS`` mapping. An unknown key
            propagates as the mapping's lookup error.

    Returns:
        A ``(description_markdown, code)`` tuple, in the order expected by the
        ``[task_desc, task_code]`` outputs of the task selector.
    """
    selected = TASKS[task_id]
    # First line carries the file/difficulty badges; the free-form description
    # follows after a blank line so Markdown renders it as its own paragraph.
    header = f"**File:** `{selected.file_name}` | **Difficulty:** `{selected.difficulty}`"
    return f"{header}\n\n{selected.description}", selected.code
138
+
139
+
140
def run_agent_simulation(task_id: str):
    """Run the rule-based baseline on a task and grade its output.

    The task is looked up in ``TASKS``, passed through
    ``detect_issues_rule_based`` and ``build_rule_comment``, and the resulting
    issues/comment pair is scored with ``grade_review``.

    Args:
        task_id: Key into the module-level ``TASKS`` mapping.

    Returns:
        A ``(issues, comment, score, score_markdown)`` tuple, matching the
        ``[agent_issues, agent_comment, output_score_slider, output_markdown]``
        outputs wired to the baseline button.
    """
    target = TASKS[task_id]
    found = detect_issues_rule_based(target)
    review_text = build_rule_comment(found)
    grade = grade_review(found, review_text, target)

    # Adjacent literals are concatenated at compile time; the pieces are kept
    # in the same left-to-right order as the single-line original.
    report = (
        "### 🤖 Agent simulated successfully\n\n"
        f"**Calculated Score:** `{grade:.3f}` \n"
        f"**Issues Found:** {', '.join(found) if found else 'None'}"
    )
    return found, review_text, grade, report
148
+
149
+
150
def manual_submit(task_id: str, issues: list[str], comment: str):
    """Grade a manually entered review for the selected task.

    Args:
        task_id: Key into the module-level ``TASKS`` mapping.
        issues: Issue tags selected in the UI.
        comment: Free-form review comment entered in the UI.

    Returns:
        A ``(score, score_markdown)`` tuple, matching the
        ``[output_score_slider, output_markdown]`` outputs of the manual button.
    """
    grade = grade_review(issues, comment, TASKS[task_id])
    return grade, f"### 📝 Manual review parsed\n\n**Calculated Score:** `{grade:.3f}`"
155
+
156
# Monochrome theme with the Inter web font and system sans-serif fallbacks.
hf_theme = gr.themes.Monochrome(
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
    primary_hue="zinc",
    neutral_hue="slate",
    text_size=gr.themes.sizes.text_md,
)

# Dashboard layout: the left column shows the selected task (description and
# read-only code); the right column holds the review inputs, the two action
# buttons, and the grading result.
# NOTE(review): nesting below was reconstructed from a flattened diff view —
# confirm that the score slider and result Markdown belong to the right-hand
# column rather than to the button row.
with gr.Blocks(theme=hf_theme, title="Code Review Environment") as custom_ui:
    gr.Markdown("# 🛡️ Code Review Agent Simulator", elem_id="header")
    gr.Markdown("Evaluate LLM agent performance on deterministic code review tasks with immediate rule-based grading.")

    with gr.Row():
        with gr.Column(scale=5):
            # Build the id list once; it feeds both the default and the choices.
            task_ids = list(TASKS)
            default_task_id = task_ids[0]
            # Reuse the change handler for the initial render so the first
            # view and every later selection share one formatting path.
            initial_desc, initial_code = update_task_view(default_task_id)

            task_selector = gr.Dropdown(label="Select Task Matrix", choices=task_ids, value=default_task_id)
            task_desc = gr.Markdown(value=initial_desc)
            task_code = gr.Code(language="python", value=initial_code, interactive=False, label="Environment File")

            task_selector.change(
                fn=update_task_view,
                inputs=task_selector,
                outputs=[task_desc, task_code],
            )

        with gr.Column(scale=4):
            gr.Markdown("### Agent Output Sandbox")
            agent_issues = gr.CheckboxGroup(label="Taxonomy Tags Outputted by Agent", choices=list(DETECTION_RULES.keys()))
            agent_comment = gr.Textbox(label="Agent Review Comment", lines=4, placeholder="The agent's freeform text response goes here...")

            with gr.Row():
                manual_btn = gr.Button("Evaluate Manual Input", variant="secondary")
                baseline_btn = gr.Button("Simulate Baseline Agent", variant="primary")

            # Display-only widgets: both are written by the button handlers.
            output_score_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, label="Task Grader Score", interactive=False)
            output_markdown = gr.Markdown(value="_Waiting for action..._")

            # Grades whatever is currently entered in the issue/comment inputs.
            manual_btn.click(
                fn=manual_submit,
                inputs=[task_selector, agent_issues, agent_comment],
                outputs=[output_score_slider, output_markdown],
            )

            # Runs the rule-based baseline and also fills the issue/comment
            # inputs with its output.
            baseline_btn.click(
                fn=run_agent_simulation,
                inputs=[task_selector],
                outputs=[agent_issues, agent_comment, output_score_slider, output_markdown],
            )

# Serve the dashboard from the root path of the existing FastAPI app; `app` is
# rebound to the value returned by `mount_gradio_app`.
app = gr.mount_gradio_app(app, custom_ui, path="/")
207
+
208
+
209
  def main(host: str = "0.0.0.0", port: int = 8000):
210
  """
211
  Entry point for direct execution via uv run or python -m.