Spaces:
Sleeping
Sleeping
Commit ·
dc07fcf
1
Parent(s): 58eaf0f
feat: Custom Gradio code review dashboard at root
Browse files- server/app.py +80 -2
server/app.py
CHANGED
|
@@ -10,8 +10,9 @@ from collections.abc import Callable
|
|
| 10 |
|
| 11 |
from pydantic import BaseModel, Field
|
| 12 |
|
|
|
|
| 13 |
try:
|
| 14 |
-
from openenv.core.env_server.http_server import
|
| 15 |
except Exception as e: # pragma: no cover
|
| 16 |
raise ImportError(
|
| 17 |
"openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
|
|
@@ -33,7 +34,7 @@ def _env_factory() -> CodeReviewEnvironment:
|
|
| 33 |
return CodeReviewEnvironment()
|
| 34 |
|
| 35 |
|
| 36 |
-
app =
|
| 37 |
_env_factory,
|
| 38 |
ReviewAction,
|
| 39 |
ReviewObservation,
|
|
@@ -128,6 +129,83 @@ def run_baseline() -> dict:
|
|
| 128 |
return {"baseline_scores": baseline_scores}
|
| 129 |
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
def main(host: str = "0.0.0.0", port: int = 8000):
|
| 132 |
"""
|
| 133 |
Entry point for direct execution via uv run or python -m.
|
|
|
|
| 10 |
|
| 11 |
from pydantic import BaseModel, Field
|
| 12 |
|
| 13 |
+
import gradio as gr
|
| 14 |
try:
|
| 15 |
+
from openenv.core.env_server.http_server import create_fastapi_app
|
| 16 |
except Exception as e: # pragma: no cover
|
| 17 |
raise ImportError(
|
| 18 |
"openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
|
|
|
|
| 34 |
return CodeReviewEnvironment()
|
| 35 |
|
| 36 |
|
| 37 |
+
app = create_fastapi_app(
|
| 38 |
_env_factory,
|
| 39 |
ReviewAction,
|
| 40 |
ReviewObservation,
|
|
|
|
| 129 |
return {"baseline_scores": baseline_scores}
|
| 130 |
|
| 131 |
|
| 132 |
+
# --- CUSTOM GRADIO UI FOR HUGGING FACE SPACE ---
|
| 133 |
+
|
| 134 |
+
def update_task_view(task_id: str):
    """Build the description and code panel contents for one task.

    Looks up ``task_id`` in the module-level ``TASKS`` mapping and returns a
    2-tuple ``(description_markdown, code)``. The tuple order matches the
    ``[task_desc, task_code]`` outputs that the task dropdown's change
    handler is wired to.

    A ``task_id`` missing from ``TASKS`` propagates the lookup error
    (``KeyError`` if ``TASKS`` is a plain dict).
    """
    selected = TASKS[task_id]
    summary = (
        f"**File:** `{selected.file_name}` | **Difficulty:** `{selected.difficulty}`"
        f"\n\n{selected.description}"
    )
    return summary, selected.code
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def run_agent_simulation(task_id: str):
    """Run the rule-based baseline on one task and grade what it produced.

    Pipeline: ``detect_issues_rule_based`` -> ``build_rule_comment`` ->
    ``grade_review``, all applied to ``TASKS[task_id]``.

    Returns a 4-tuple ``(issues, comment, score, score_md)`` whose order
    matches the ``[agent_issues, agent_comment, output_score_slider,
    output_markdown]`` outputs of the baseline button.
    """
    task = TASKS[task_id]
    detected = detect_issues_rule_based(task)
    review_text = build_rule_comment(detected)
    grade = grade_review(detected, review_text, task)

    # An empty detection result is rendered as the literal word "None".
    if detected:
        issues_label = ', '.join(detected)
    else:
        issues_label = 'None'

    report = f"### 🤖 Agent simulated successfully\n\n**Calculated Score:** `{grade:.3f}` \n**Issues Found:** {issues_label}"
    return detected, review_text, grade, report
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def manual_submit(task_id: str, issues: list[str], comment: str):
    """Grade a hand-entered review against the selected task.

    ``issues`` and ``comment`` are passed to ``grade_review`` unchanged,
    together with ``TASKS[task_id]``.

    Returns a 2-tuple ``(score, score_md)`` matching the
    ``[output_score_slider, output_markdown]`` outputs of the manual button.
    """
    grade = grade_review(issues, comment, TASKS[task_id])
    return grade, f"### 📝 Manual review parsed\n\n**Calculated Score:** `{grade:.3f}`"
|
| 155 |
+
|
| 156 |
+
# Monochrome theme using the Inter Google font; the trailing font names are
# listed after it (presumably as fallbacks when Inter is unavailable).
hf_theme = gr.themes.Monochrome(
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
    primary_hue="zinc",
    neutral_hue="slate",
    text_size=gr.themes.sizes.text_md,
)

# NOTE(review): the nesting below is reconstructed from statement order; the
# reviewed copy of this file had lost its indentation. Confirm the layout
# against the running Space.
with gr.Blocks(theme=hf_theme, title="Code Review Environment") as custom_ui:
    gr.Markdown("# 🛡️ Code Review Agent Simulator", elem_id="header")
    gr.Markdown("Evaluate LLM agent performance on deterministic code review tasks with immediate rule-based grading.")

    with gr.Row():
        # Left column: task picker plus read-only views of the selected task.
        with gr.Column(scale=5):
            # Build the key list once; it feeds both the default selection
            # and the dropdown choices.
            task_ids = list(TASKS.keys())
            default_task_id = task_ids[0]
            t = TASKS[default_task_id]
            # Render the initial description through the same helper the
            # dropdown's change handler uses, so the first paint and later
            # updates cannot drift apart.
            initial_desc, _ = update_task_view(default_task_id)

            task_selector = gr.Dropdown(label="Select Task Matrix", choices=task_ids, value=default_task_id)
            task_desc = gr.Markdown(value=initial_desc)
            task_code = gr.Code(language="python", value=t.code, interactive=False, label="Environment File")

            # Refresh both panels whenever a different task is selected.
            task_selector.change(
                fn=update_task_view,
                inputs=task_selector,
                outputs=[task_desc, task_code],
            )

        # Right column: review inputs, action buttons and grading output.
        with gr.Column(scale=4):
            gr.Markdown("### Agent Output Sandbox")
            agent_issues = gr.CheckboxGroup(label="Taxonomy Tags Outputted by Agent", choices=list(DETECTION_RULES.keys()))
            agent_comment = gr.Textbox(label="Agent Review Comment", lines=4, placeholder="The agent's freeform text response goes here...")

            with gr.Row():
                manual_btn = gr.Button("Evaluate Manual Input", variant="secondary")
                baseline_btn = gr.Button("Simulate Baseline Agent", variant="primary")

            output_score_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, label="Task Grader Score", interactive=False)
            output_markdown = gr.Markdown(value="_Waiting for action..._")

            # Grade whatever is currently entered in the sandbox inputs.
            manual_btn.click(
                fn=manual_submit,
                inputs=[task_selector, agent_issues, agent_comment],
                outputs=[output_score_slider, output_markdown],
            )

            # Run the rule-based baseline; it also fills the sandbox inputs
            # with the tags and comment it produced.
            baseline_btn.click(
                fn=run_agent_simulation,
                inputs=[task_selector],
                outputs=[agent_issues, agent_comment, output_score_slider, output_markdown],
            )

# Serve the dashboard from the root path of the existing FastAPI `app`.
app = gr.mount_gradio_app(app, custom_ui, path="/")
|
| 207 |
+
|
| 208 |
+
|
| 209 |
def main(host: str = "0.0.0.0", port: int = 8000):
|
| 210 |
"""
|
| 211 |
Entry point for direct execution via uv run or python -m.
|