Spaces:
Sleeping
Sleeping
Commit ·
491c280
1
Parent(s): 27f6fe4
Stabilize reset API startup path
Browse files- app.py +140 -130
- tests/test_server_api.py +21 -0
app.py
CHANGED
|
@@ -3,12 +3,12 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import json
|
|
|
|
| 6 |
from collections import Counter
|
| 7 |
from pathlib import Path
|
| 8 |
from threading import Lock
|
| 9 |
from typing import Any, Dict
|
| 10 |
|
| 11 |
-
import gradio as gr
|
| 12 |
from fastapi import FastAPI
|
| 13 |
from fastapi.responses import RedirectResponse
|
| 14 |
|
|
@@ -21,6 +21,16 @@ if str(PROJECT_ROOT) not in sys.path:
|
|
| 21 |
from environment.env import CodeReviewEnv
|
| 22 |
from environment.tasks import TaskDefinitions
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
app = FastAPI(title="code-review-agent-env")
|
| 26 |
_env = CodeReviewEnv()
|
|
@@ -502,99 +512,98 @@ body, .gradio-container {
|
|
| 502 |
"""
|
| 503 |
|
| 504 |
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
gr.
|
| 509 |
-
|
| 510 |
-
gr.
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
with gr.
|
| 524 |
-
with gr.
|
| 525 |
-
gr.
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
with gr.
|
| 530 |
-
with gr.
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
with gr.
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
with gr.
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
with gr.
|
| 580 |
-
gr.
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
task_cards.append(
|
| 598 |
gr.Markdown(
|
| 599 |
f"""
|
| 600 |
<div class='task-row'>
|
|
@@ -606,45 +615,46 @@ with gr.Blocks(title="Code Review Agent Environment") as demo:
|
|
| 606 |
</div>
|
| 607 |
"""
|
| 608 |
)
|
| 609 |
-
)
|
| 610 |
-
|
| 611 |
-
def _update_playground_metrics(payload: Dict[str, Any]) -> tuple[str, str, str]:
|
| 612 |
-
score_value = payload.get("task_score", 0.0)
|
| 613 |
-
step_value = payload.get("current_step", 0)
|
| 614 |
-
status_value = "complete" if payload.get("is_complete") else "active"
|
| 615 |
-
return (
|
| 616 |
-
f"<div class='metric'><div class='metric-label'>Current Score</div><div class='metric-value'>{float(score_value):.2f}</div></div>",
|
| 617 |
-
f"<div class='metric'><div class='metric-label'>Step</div><div class='metric-value'>{step_value}</div></div>",
|
| 618 |
-
f"<div class='metric'><div class='metric-label'>Status</div><div class='metric-value'>{status_value}</div></div>",
|
| 619 |
-
)
|
| 620 |
-
|
| 621 |
-
def _refresh_leaderboard() -> tuple[list[list[str]], str]:
|
| 622 |
-
summary_data = _benchmark_summary()
|
| 623 |
-
avg_score = float(summary_data.get("average_task_score", 0.0)) if isinstance(summary_data, dict) else 0.0
|
| 624 |
-
avg_reward = float(summary_data.get("average_total_reward", 0.0)) if isinstance(summary_data, dict) else 0.0
|
| 625 |
-
return _leaderboard_rows(), f"### Benchmark Leaderboard\n\n**Average Task Score:** {avg_score:.3f} | **Average Reward:** {avg_reward:.3f}"
|
| 626 |
-
|
| 627 |
-
def _load_trace(model_name: str, task_id: str) -> str:
|
| 628 |
-
return _trace_lookup(model_name, task_id)
|
| 629 |
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 643 |
|
| 644 |
|
| 645 |
@app.get("/ui")
|
| 646 |
def ui_alias() -> Any:
|
| 647 |
-
|
| 648 |
-
|
|
|
|
| 649 |
|
| 650 |
-
|
|
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import json
|
| 6 |
+
import os
|
| 7 |
from collections import Counter
|
| 8 |
from pathlib import Path
|
| 9 |
from threading import Lock
|
| 10 |
from typing import Any, Dict
|
| 11 |
|
|
|
|
| 12 |
from fastapi import FastAPI
|
| 13 |
from fastapi.responses import RedirectResponse
|
| 14 |
|
|
|
|
| 21 |
from environment.env import CodeReviewEnv
|
| 22 |
from environment.tasks import TaskDefinitions
|
| 23 |
|
| 24 |
+
ENABLE_GRADIO_UI = os.getenv("ENABLE_GRADIO_UI", "").strip().lower() in {"1", "true", "yes"}
|
| 25 |
+
|
| 26 |
+
if ENABLE_GRADIO_UI:
|
| 27 |
+
try:
|
| 28 |
+
import gradio as gr
|
| 29 |
+
except Exception:
|
| 30 |
+
gr = None
|
| 31 |
+
ENABLE_GRADIO_UI = False
|
| 32 |
+
else:
|
| 33 |
+
gr = None
|
| 34 |
|
| 35 |
app = FastAPI(title="code-review-agent-env")
|
| 36 |
_env = CodeReviewEnv()
|
|
|
|
| 512 |
"""
|
| 513 |
|
| 514 |
|
| 515 |
+
def _build_demo():
|
| 516 |
+
task_choices = [t["task_id"] for t in TaskDefinitions.get_all_tasks()]
|
| 517 |
+
|
| 518 |
+
with gr.Blocks(title="Code Review Agent Environment") as demo:
|
| 519 |
+
gr.HTML(f"<style>{CUSTOM_CSS}</style>")
|
| 520 |
+
with gr.Column(elem_classes=["app-shell"]):
|
| 521 |
+
gr.HTML(
|
| 522 |
+
"""
|
| 523 |
+
<section class=\"hero\">
|
| 524 |
+
<h1>Code Review Mission Control</h1>
|
| 525 |
+
<p>High-clarity operator UI for environment resets, action stepping, and live scoring telemetry.</p>
|
| 526 |
+
<span class=\"chip mono\">UI: /ui</span>
|
| 527 |
+
<span class=\"chip mono\">API: /reset /step /state /score /tasks</span>
|
| 528 |
+
<span class=\"chip mono\">Validation: 3+ graded tasks</span>
|
| 529 |
+
</section>
|
| 530 |
+
"""
|
| 531 |
+
)
|
| 532 |
+
|
| 533 |
+
with gr.Tabs():
|
| 534 |
+
with gr.Tab("README"):
|
| 535 |
+
with gr.Column(elem_id="telemetry-panel"):
|
| 536 |
+
gr.Markdown(_readme_markdown())
|
| 537 |
+
gr.Markdown(_validation_markdown())
|
| 538 |
+
|
| 539 |
+
with gr.Tab("Playground"):
|
| 540 |
+
with gr.Column(elem_id="control-panel"):
|
| 541 |
+
with gr.Row():
|
| 542 |
+
task_id_input = gr.Dropdown(choices=task_choices, value=task_choices[0], label="Task ID")
|
| 543 |
+
reset_btn = gr.Button("Reset Task", variant="primary")
|
| 544 |
+
score_btn = gr.Button("Get Score")
|
| 545 |
+
state_btn = gr.Button("Get State")
|
| 546 |
+
|
| 547 |
+
with gr.Row():
|
| 548 |
+
score_card = gr.HTML("<div class='metric'><div class='metric-label'>Current Score</div><div class='metric-value'>0.00</div></div>")
|
| 549 |
+
step_card = gr.HTML("<div class='metric'><div class='metric-label'>Step</div><div class='metric-value'>0</div></div>")
|
| 550 |
+
status_card = gr.HTML("<div class='metric'><div class='metric-label'>Status</div><div class='metric-value'>idle</div></div>")
|
| 551 |
+
|
| 552 |
+
action_input = gr.Textbox(
|
| 553 |
+
label="Action JSON",
|
| 554 |
+
lines=10,
|
| 555 |
+
value=_starter_action_json(task_choices[0]),
|
| 556 |
+
elem_classes=["mono"],
|
| 557 |
+
)
|
| 558 |
+
with gr.Row():
|
| 559 |
+
step_btn = gr.Button("Execute Step", variant="primary")
|
| 560 |
+
starter_btn = gr.Button("Run Starter Step")
|
| 561 |
+
report_btn = gr.Button("Export Episode Report")
|
| 562 |
+
gr.Markdown("If you are new, click **Run Starter Step**. It resets the selected task and submits a safe example action.")
|
| 563 |
+
output = gr.Code(label="API Response", language="json")
|
| 564 |
+
report_out = gr.Code(label="Episode Report", language="json")
|
| 565 |
+
|
| 566 |
+
with gr.Tab("Traces"):
|
| 567 |
+
with gr.Column(elem_id="atlas-panel"):
|
| 568 |
+
models, trace_tasks = _trace_choices()
|
| 569 |
+
gr.Markdown("### Recorded Traces")
|
| 570 |
+
with gr.Row():
|
| 571 |
+
trace_model = gr.Dropdown(choices=models, value=models[0], label="Model")
|
| 572 |
+
trace_task = gr.Dropdown(choices=trace_tasks, value=trace_tasks[0], label="Task")
|
| 573 |
+
trace_refresh = gr.Button("Load Trace")
|
| 574 |
+
trace_out = gr.Code(label="Trace Payload", language="json")
|
| 575 |
+
|
| 576 |
+
with gr.Tab("Leaderboard"):
|
| 577 |
+
with gr.Column(elem_id="atlas-panel"):
|
| 578 |
+
summary = _benchmark_summary()
|
| 579 |
+
gr.Markdown("### Benchmark Leaderboard")
|
| 580 |
+
leaderboard_summary = gr.Markdown(f"**Average Task Score:** {summary.get('average_task_score', 0):.3f} | **Average Reward:** {summary.get('average_total_reward', 0):.3f}")
|
| 581 |
+
leaderboard = gr.Dataframe(
|
| 582 |
+
headers=["Rank", "Task", "Task Score", "Total Reward", "Steps", "Model"],
|
| 583 |
+
value=_leaderboard_rows(),
|
| 584 |
+
interactive=False,
|
| 585 |
+
wrap=True,
|
| 586 |
+
)
|
| 587 |
+
leaderboard_refresh = gr.Button("Refresh Leaderboard")
|
| 588 |
+
|
| 589 |
+
with gr.Tab("Tasks"):
|
| 590 |
+
with gr.Column(elem_id="atlas-panel"):
|
| 591 |
+
gr.Markdown("### Task Catalogue")
|
| 592 |
+
diff_summary = gr.Textbox(
|
| 593 |
+
label="Difficulty Split",
|
| 594 |
+
value=_difficulty_summary(),
|
| 595 |
+
interactive=False,
|
| 596 |
+
elem_classes=["mono"],
|
| 597 |
+
)
|
| 598 |
+
task_grid = gr.Dataframe(
|
| 599 |
+
headers=["Task ID", "Difficulty", "Language", "Name"],
|
| 600 |
+
value=_task_table(),
|
| 601 |
+
interactive=False,
|
| 602 |
+
wrap=True,
|
| 603 |
+
)
|
| 604 |
+
refresh_tasks_btn = gr.Button("Refresh Task Atlas")
|
| 605 |
+
|
| 606 |
+
for task in TaskDefinitions.get_all_tasks():
|
|
|
|
| 607 |
gr.Markdown(
|
| 608 |
f"""
|
| 609 |
<div class='task-row'>
|
|
|
|
| 615 |
</div>
|
| 616 |
"""
|
| 617 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
|
| 619 |
+
def _update_playground_metrics(payload: Dict[str, Any]) -> tuple[str, str, str]:
|
| 620 |
+
score_value = payload.get("task_score", 0.0)
|
| 621 |
+
step_value = payload.get("current_step", 0)
|
| 622 |
+
status_value = "complete" if payload.get("is_complete") else "active"
|
| 623 |
+
return (
|
| 624 |
+
f"<div class='metric'><div class='metric-label'>Current Score</div><div class='metric-value'>{float(score_value):.2f}</div></div>",
|
| 625 |
+
f"<div class='metric'><div class='metric-label'>Step</div><div class='metric-value'>{step_value}</div></div>",
|
| 626 |
+
f"<div class='metric'><div class='metric-label'>Status</div><div class='metric-value'>{status_value}</div></div>",
|
| 627 |
+
)
|
| 628 |
+
|
| 629 |
+
def _refresh_leaderboard() -> tuple[list[list[str]], str]:
|
| 630 |
+
summary_data = _benchmark_summary()
|
| 631 |
+
avg_score = float(summary_data.get("average_task_score", 0.0)) if isinstance(summary_data, dict) else 0.0
|
| 632 |
+
avg_reward = float(summary_data.get("average_total_reward", 0.0)) if isinstance(summary_data, dict) else 0.0
|
| 633 |
+
return _leaderboard_rows(), f"### Benchmark Leaderboard\n\n**Average Task Score:** {avg_score:.3f} | **Average Reward:** {avg_reward:.3f}"
|
| 634 |
+
|
| 635 |
+
def _load_trace(model_name: str, task_id: str) -> str:
|
| 636 |
+
return _trace_lookup(model_name, task_id)
|
| 637 |
+
|
| 638 |
+
reset_btn.click(fn=_ui_reset, inputs=[task_id_input], outputs=[output])
|
| 639 |
+
step_btn.click(fn=_ui_step, inputs=[action_input], outputs=[output])
|
| 640 |
+
starter_btn.click(fn=_ui_run_starter_step, inputs=[task_id_input], outputs=[output])
|
| 641 |
+
state_btn.click(fn=_ui_state, inputs=None, outputs=[output])
|
| 642 |
+
score_btn.click(fn=_ui_score, inputs=None, outputs=[output])
|
| 643 |
+
report_btn.click(fn=_episode_report, inputs=None, outputs=[report_out])
|
| 644 |
+
score_btn.click(fn=lambda: _update_playground_metrics(score()), inputs=None, outputs=[score_card, step_card, status_card])
|
| 645 |
+
trace_refresh.click(fn=_load_trace, inputs=[trace_model, trace_task], outputs=[trace_out])
|
| 646 |
+
leaderboard_refresh.click(fn=_refresh_leaderboard, inputs=None, outputs=[leaderboard, leaderboard_summary])
|
| 647 |
+
refresh_tasks_btn.click(fn=_difficulty_summary, inputs=None, outputs=[diff_summary])
|
| 648 |
+
refresh_tasks_btn.click(fn=_task_table, inputs=None, outputs=[task_grid])
|
| 649 |
+
|
| 650 |
+
return demo
|
| 651 |
|
| 652 |
|
| 653 |
@app.get("/ui")
|
| 654 |
def ui_alias() -> Any:
|
| 655 |
+
if ENABLE_GRADIO_UI and gr is not None:
|
| 656 |
+
return RedirectResponse(url="/", status_code=307)
|
| 657 |
+
return RedirectResponse(url="/docs", status_code=307)
|
| 658 |
|
| 659 |
+
if ENABLE_GRADIO_UI and gr is not None:
|
| 660 |
+
app = gr.mount_gradio_app(app, _build_demo(), path="/")
|
tests/test_server_api.py
CHANGED
|
@@ -1,5 +1,8 @@
|
|
| 1 |
import unittest
|
| 2 |
|
|
|
|
|
|
|
|
|
|
| 3 |
from server.app import app
|
| 4 |
|
| 5 |
|
|
@@ -33,5 +36,23 @@ class TestServerAPI(unittest.TestCase):
|
|
| 33 |
self.assertIn("task_id", payload)
|
| 34 |
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
if __name__ == "__main__":
|
| 37 |
unittest.main()
|
|
|
|
| 1 |
import unittest
|
| 2 |
|
| 3 |
+
from fastapi.testclient import TestClient
|
| 4 |
+
|
| 5 |
+
from app import app as fastapi_app
|
| 6 |
from server.app import app
|
| 7 |
|
| 8 |
|
|
|
|
| 36 |
self.assertIn("task_id", payload)
|
| 37 |
|
| 38 |
|
| 39 |
+
class TestFastAPIReset(unittest.TestCase):
|
| 40 |
+
def setUp(self):
|
| 41 |
+
self.client = TestClient(fastapi_app)
|
| 42 |
+
|
| 43 |
+
def test_post_reset_without_body(self):
|
| 44 |
+
response = self.client.post("/reset")
|
| 45 |
+
self.assertEqual(response.status_code, 200)
|
| 46 |
+
payload = response.json()
|
| 47 |
+
self.assertIn("observation", payload)
|
| 48 |
+
self.assertIn("task_description", payload["observation"])
|
| 49 |
+
|
| 50 |
+
def test_post_reset_with_task_id_body(self):
|
| 51 |
+
response = self.client.post("/reset", json={"task_id": "bug_detection_easy_1"})
|
| 52 |
+
self.assertEqual(response.status_code, 200)
|
| 53 |
+
payload = response.json()
|
| 54 |
+
self.assertEqual(payload["observation"]["task_difficulty"], "easy")
|
| 55 |
+
|
| 56 |
+
|
| 57 |
if __name__ == "__main__":
|
| 58 |
unittest.main()
|