ashishbaberwal committed on
Commit
491c280
·
1 Parent(s): 27f6fe4

Stabilize reset API startup path

Browse files
Files changed (2) hide show
  1. app.py +140 -130
  2. tests/test_server_api.py +21 -0
app.py CHANGED
@@ -3,12 +3,12 @@
3
  from __future__ import annotations
4
 
5
  import json
 
6
  from collections import Counter
7
  from pathlib import Path
8
  from threading import Lock
9
  from typing import Any, Dict
10
 
11
- import gradio as gr
12
  from fastapi import FastAPI
13
  from fastapi.responses import RedirectResponse
14
 
@@ -21,6 +21,16 @@ if str(PROJECT_ROOT) not in sys.path:
21
  from environment.env import CodeReviewEnv
22
  from environment.tasks import TaskDefinitions
23
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  app = FastAPI(title="code-review-agent-env")
26
  _env = CodeReviewEnv()
@@ -502,99 +512,98 @@ body, .gradio-container {
502
  """
503
 
504
 
505
- task_choices = [t["task_id"] for t in TaskDefinitions.get_all_tasks()]
506
-
507
- with gr.Blocks(title="Code Review Agent Environment") as demo:
508
- gr.HTML(f"<style>{CUSTOM_CSS}</style>")
509
- with gr.Column(elem_classes=["app-shell"]):
510
- gr.HTML(
511
- """
512
- <section class=\"hero\">
513
- <h1>Code Review Mission Control</h1>
514
- <p>High-clarity operator UI for environment resets, action stepping, and live scoring telemetry.</p>
515
- <span class=\"chip mono\">UI: /ui</span>
516
- <span class=\"chip mono\">API: /reset /step /state /score /tasks</span>
517
- <span class=\"chip mono\">Validation: 3+ graded tasks</span>
518
- </section>
519
- """
520
- )
521
-
522
- with gr.Tabs():
523
- with gr.Tab("README"):
524
- with gr.Column(elem_id="telemetry-panel"):
525
- gr.Markdown(_readme_markdown())
526
- gr.Markdown(_validation_markdown())
527
-
528
- with gr.Tab("Playground"):
529
- with gr.Column(elem_id="control-panel"):
530
- with gr.Row():
531
- task_id_input = gr.Dropdown(choices=task_choices, value=task_choices[0], label="Task ID")
532
- reset_btn = gr.Button("Reset Task", variant="primary")
533
- score_btn = gr.Button("Get Score")
534
- state_btn = gr.Button("Get State")
535
-
536
- with gr.Row():
537
- score_card = gr.HTML("<div class='metric'><div class='metric-label'>Current Score</div><div class='metric-value'>0.00</div></div>")
538
- step_card = gr.HTML("<div class='metric'><div class='metric-label'>Step</div><div class='metric-value'>0</div></div>")
539
- status_card = gr.HTML("<div class='metric'><div class='metric-label'>Status</div><div class='metric-value'>idle</div></div>")
540
-
541
- action_input = gr.Textbox(
542
- label="Action JSON",
543
- lines=10,
544
- value=_starter_action_json(task_choices[0]),
545
- elem_classes=["mono"],
546
- )
547
- with gr.Row():
548
- step_btn = gr.Button("Execute Step", variant="primary")
549
- starter_btn = gr.Button("Run Starter Step")
550
- report_btn = gr.Button("Export Episode Report")
551
- gr.Markdown("If you are new, click **Run Starter Step**. It resets the selected task and submits a safe example action.")
552
- output = gr.Code(label="API Response", language="json")
553
- report_out = gr.Code(label="Episode Report", language="json")
554
-
555
- with gr.Tab("Traces"):
556
- with gr.Column(elem_id="atlas-panel"):
557
- models, trace_tasks = _trace_choices()
558
- gr.Markdown("### Recorded Traces")
559
- with gr.Row():
560
- trace_model = gr.Dropdown(choices=models, value=models[0], label="Model")
561
- trace_task = gr.Dropdown(choices=trace_tasks, value=trace_tasks[0], label="Task")
562
- trace_refresh = gr.Button("Load Trace")
563
- trace_out = gr.Code(label="Trace Payload", language="json")
564
-
565
- with gr.Tab("Leaderboard"):
566
- with gr.Column(elem_id="atlas-panel"):
567
- summary = _benchmark_summary()
568
- gr.Markdown("### Benchmark Leaderboard")
569
- leaderboard_summary = gr.Markdown(f"**Average Task Score:** {summary.get('average_task_score', 0):.3f} | **Average Reward:** {summary.get('average_total_reward', 0):.3f}")
570
- leaderboard = gr.Dataframe(
571
- headers=["Rank", "Task", "Task Score", "Total Reward", "Steps", "Model"],
572
- value=_leaderboard_rows(),
573
- interactive=False,
574
- wrap=True,
575
- )
576
- leaderboard_refresh = gr.Button("Refresh Leaderboard")
577
-
578
- with gr.Tab("Tasks"):
579
- with gr.Column(elem_id="atlas-panel"):
580
- gr.Markdown("### Task Catalogue")
581
- diff_summary = gr.Textbox(
582
- label="Difficulty Split",
583
- value=_difficulty_summary(),
584
- interactive=False,
585
- elem_classes=["mono"],
586
- )
587
- task_grid = gr.Dataframe(
588
- headers=["Task ID", "Difficulty", "Language", "Name"],
589
- value=_task_table(),
590
- interactive=False,
591
- wrap=True,
592
- )
593
- refresh_tasks_btn = gr.Button("Refresh Task Atlas")
594
-
595
- task_cards = []
596
- for task in TaskDefinitions.get_all_tasks():
597
- task_cards.append(
598
  gr.Markdown(
599
  f"""
600
  <div class='task-row'>
@@ -606,45 +615,46 @@ with gr.Blocks(title="Code Review Agent Environment") as demo:
606
  </div>
607
  """
608
  )
609
- )
610
-
611
- def _update_playground_metrics(payload: Dict[str, Any]) -> tuple[str, str, str]:
612
- score_value = payload.get("task_score", 0.0)
613
- step_value = payload.get("current_step", 0)
614
- status_value = "complete" if payload.get("is_complete") else "active"
615
- return (
616
- f"<div class='metric'><div class='metric-label'>Current Score</div><div class='metric-value'>{float(score_value):.2f}</div></div>",
617
- f"<div class='metric'><div class='metric-label'>Step</div><div class='metric-value'>{step_value}</div></div>",
618
- f"<div class='metric'><div class='metric-label'>Status</div><div class='metric-value'>{status_value}</div></div>",
619
- )
620
-
621
- def _refresh_leaderboard() -> tuple[list[list[str]], str]:
622
- summary_data = _benchmark_summary()
623
- avg_score = float(summary_data.get("average_task_score", 0.0)) if isinstance(summary_data, dict) else 0.0
624
- avg_reward = float(summary_data.get("average_total_reward", 0.0)) if isinstance(summary_data, dict) else 0.0
625
- return _leaderboard_rows(), f"### Benchmark Leaderboard\n\n**Average Task Score:** {avg_score:.3f} | **Average Reward:** {avg_reward:.3f}"
626
-
627
- def _load_trace(model_name: str, task_id: str) -> str:
628
- return _trace_lookup(model_name, task_id)
629
 
630
- reset_btn.click(fn=_ui_reset, inputs=[task_id_input], outputs=[output])
631
- step_btn.click(fn=_ui_step, inputs=[action_input], outputs=[output])
632
- starter_btn.click(fn=_ui_run_starter_step, inputs=[task_id_input], outputs=[output])
633
- state_btn.click(fn=_ui_state, inputs=None, outputs=[output])
634
- score_btn.click(fn=_ui_score, inputs=None, outputs=[output])
635
- report_btn.click(fn=_episode_report, inputs=None, outputs=[report_out])
636
-
637
- score_btn.click(fn=lambda: _update_playground_metrics(score()), inputs=None, outputs=[score_card, step_card, status_card])
638
-
639
- trace_refresh.click(fn=_load_trace, inputs=[trace_model, trace_task], outputs=[trace_out])
640
- leaderboard_refresh.click(fn=_refresh_leaderboard, inputs=None, outputs=[leaderboard, leaderboard_summary])
641
- refresh_tasks_btn.click(fn=_difficulty_summary, inputs=None, outputs=[diff_summary])
642
- refresh_tasks_btn.click(fn=_task_table, inputs=None, outputs=[task_grid])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643
 
644
 
645
  @app.get("/ui")
646
  def ui_alias() -> Any:
647
- return RedirectResponse(url="/", status_code=307)
648
-
 
649
 
650
- app = gr.mount_gradio_app(app, demo, path="/")
 
 
3
  from __future__ import annotations
4
 
5
  import json
6
+ import os
7
  from collections import Counter
8
  from pathlib import Path
9
  from threading import Lock
10
  from typing import Any, Dict
11
 
 
12
  from fastapi import FastAPI
13
  from fastapi.responses import RedirectResponse
14
 
 
21
  from environment.env import CodeReviewEnv
22
  from environment.tasks import TaskDefinitions
23
 
24
+ ENABLE_GRADIO_UI = os.getenv("ENABLE_GRADIO_UI", "").strip().lower() in {"1", "true", "yes"}
25
+
26
+ if ENABLE_GRADIO_UI:
27
+ try:
28
+ import gradio as gr
29
+ except Exception:
30
+ gr = None
31
+ ENABLE_GRADIO_UI = False
32
+ else:
33
+ gr = None
34
 
35
  app = FastAPI(title="code-review-agent-env")
36
  _env = CodeReviewEnv()
 
512
  """
513
 
514
 
515
+ def _build_demo():
516
+ task_choices = [t["task_id"] for t in TaskDefinitions.get_all_tasks()]
517
+
518
+ with gr.Blocks(title="Code Review Agent Environment") as demo:
519
+ gr.HTML(f"<style>{CUSTOM_CSS}</style>")
520
+ with gr.Column(elem_classes=["app-shell"]):
521
+ gr.HTML(
522
+ """
523
+ <section class=\"hero\">
524
+ <h1>Code Review Mission Control</h1>
525
+ <p>High-clarity operator UI for environment resets, action stepping, and live scoring telemetry.</p>
526
+ <span class=\"chip mono\">UI: /ui</span>
527
+ <span class=\"chip mono\">API: /reset /step /state /score /tasks</span>
528
+ <span class=\"chip mono\">Validation: 3+ graded tasks</span>
529
+ </section>
530
+ """
531
+ )
532
+
533
+ with gr.Tabs():
534
+ with gr.Tab("README"):
535
+ with gr.Column(elem_id="telemetry-panel"):
536
+ gr.Markdown(_readme_markdown())
537
+ gr.Markdown(_validation_markdown())
538
+
539
+ with gr.Tab("Playground"):
540
+ with gr.Column(elem_id="control-panel"):
541
+ with gr.Row():
542
+ task_id_input = gr.Dropdown(choices=task_choices, value=task_choices[0], label="Task ID")
543
+ reset_btn = gr.Button("Reset Task", variant="primary")
544
+ score_btn = gr.Button("Get Score")
545
+ state_btn = gr.Button("Get State")
546
+
547
+ with gr.Row():
548
+ score_card = gr.HTML("<div class='metric'><div class='metric-label'>Current Score</div><div class='metric-value'>0.00</div></div>")
549
+ step_card = gr.HTML("<div class='metric'><div class='metric-label'>Step</div><div class='metric-value'>0</div></div>")
550
+ status_card = gr.HTML("<div class='metric'><div class='metric-label'>Status</div><div class='metric-value'>idle</div></div>")
551
+
552
+ action_input = gr.Textbox(
553
+ label="Action JSON",
554
+ lines=10,
555
+ value=_starter_action_json(task_choices[0]),
556
+ elem_classes=["mono"],
557
+ )
558
+ with gr.Row():
559
+ step_btn = gr.Button("Execute Step", variant="primary")
560
+ starter_btn = gr.Button("Run Starter Step")
561
+ report_btn = gr.Button("Export Episode Report")
562
+ gr.Markdown("If you are new, click **Run Starter Step**. It resets the selected task and submits a safe example action.")
563
+ output = gr.Code(label="API Response", language="json")
564
+ report_out = gr.Code(label="Episode Report", language="json")
565
+
566
+ with gr.Tab("Traces"):
567
+ with gr.Column(elem_id="atlas-panel"):
568
+ models, trace_tasks = _trace_choices()
569
+ gr.Markdown("### Recorded Traces")
570
+ with gr.Row():
571
+ trace_model = gr.Dropdown(choices=models, value=models[0], label="Model")
572
+ trace_task = gr.Dropdown(choices=trace_tasks, value=trace_tasks[0], label="Task")
573
+ trace_refresh = gr.Button("Load Trace")
574
+ trace_out = gr.Code(label="Trace Payload", language="json")
575
+
576
+ with gr.Tab("Leaderboard"):
577
+ with gr.Column(elem_id="atlas-panel"):
578
+ summary = _benchmark_summary()
579
+ gr.Markdown("### Benchmark Leaderboard")
580
+ leaderboard_summary = gr.Markdown(f"**Average Task Score:** {summary.get('average_task_score', 0):.3f} | **Average Reward:** {summary.get('average_total_reward', 0):.3f}")
581
+ leaderboard = gr.Dataframe(
582
+ headers=["Rank", "Task", "Task Score", "Total Reward", "Steps", "Model"],
583
+ value=_leaderboard_rows(),
584
+ interactive=False,
585
+ wrap=True,
586
+ )
587
+ leaderboard_refresh = gr.Button("Refresh Leaderboard")
588
+
589
+ with gr.Tab("Tasks"):
590
+ with gr.Column(elem_id="atlas-panel"):
591
+ gr.Markdown("### Task Catalogue")
592
+ diff_summary = gr.Textbox(
593
+ label="Difficulty Split",
594
+ value=_difficulty_summary(),
595
+ interactive=False,
596
+ elem_classes=["mono"],
597
+ )
598
+ task_grid = gr.Dataframe(
599
+ headers=["Task ID", "Difficulty", "Language", "Name"],
600
+ value=_task_table(),
601
+ interactive=False,
602
+ wrap=True,
603
+ )
604
+ refresh_tasks_btn = gr.Button("Refresh Task Atlas")
605
+
606
+ for task in TaskDefinitions.get_all_tasks():
 
607
  gr.Markdown(
608
  f"""
609
  <div class='task-row'>
 
615
  </div>
616
  """
617
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
618
 
619
+ def _update_playground_metrics(payload: Dict[str, Any]) -> tuple[str, str, str]:
620
+ score_value = payload.get("task_score", 0.0)
621
+ step_value = payload.get("current_step", 0)
622
+ status_value = "complete" if payload.get("is_complete") else "active"
623
+ return (
624
+ f"<div class='metric'><div class='metric-label'>Current Score</div><div class='metric-value'>{float(score_value):.2f}</div></div>",
625
+ f"<div class='metric'><div class='metric-label'>Step</div><div class='metric-value'>{step_value}</div></div>",
626
+ f"<div class='metric'><div class='metric-label'>Status</div><div class='metric-value'>{status_value}</div></div>",
627
+ )
628
+
629
+ def _refresh_leaderboard() -> tuple[list[list[str]], str]:
630
+ summary_data = _benchmark_summary()
631
+ avg_score = float(summary_data.get("average_task_score", 0.0)) if isinstance(summary_data, dict) else 0.0
632
+ avg_reward = float(summary_data.get("average_total_reward", 0.0)) if isinstance(summary_data, dict) else 0.0
633
+ return _leaderboard_rows(), f"### Benchmark Leaderboard\n\n**Average Task Score:** {avg_score:.3f} | **Average Reward:** {avg_reward:.3f}"
634
+
635
+ def _load_trace(model_name: str, task_id: str) -> str:
636
+ return _trace_lookup(model_name, task_id)
637
+
638
+ reset_btn.click(fn=_ui_reset, inputs=[task_id_input], outputs=[output])
639
+ step_btn.click(fn=_ui_step, inputs=[action_input], outputs=[output])
640
+ starter_btn.click(fn=_ui_run_starter_step, inputs=[task_id_input], outputs=[output])
641
+ state_btn.click(fn=_ui_state, inputs=None, outputs=[output])
642
+ score_btn.click(fn=_ui_score, inputs=None, outputs=[output])
643
+ report_btn.click(fn=_episode_report, inputs=None, outputs=[report_out])
644
+ score_btn.click(fn=lambda: _update_playground_metrics(score()), inputs=None, outputs=[score_card, step_card, status_card])
645
+ trace_refresh.click(fn=_load_trace, inputs=[trace_model, trace_task], outputs=[trace_out])
646
+ leaderboard_refresh.click(fn=_refresh_leaderboard, inputs=None, outputs=[leaderboard, leaderboard_summary])
647
+ refresh_tasks_btn.click(fn=_difficulty_summary, inputs=None, outputs=[diff_summary])
648
+ refresh_tasks_btn.click(fn=_task_table, inputs=None, outputs=[task_grid])
649
+
650
+ return demo
651
 
652
 
653
  @app.get("/ui")
654
  def ui_alias() -> Any:
655
+ if ENABLE_GRADIO_UI and gr is not None:
656
+ return RedirectResponse(url="/", status_code=307)
657
+ return RedirectResponse(url="/docs", status_code=307)
658
 
659
+ if ENABLE_GRADIO_UI and gr is not None:
660
+ app = gr.mount_gradio_app(app, _build_demo(), path="/")
tests/test_server_api.py CHANGED
@@ -1,5 +1,8 @@
1
  import unittest
2
 
 
 
 
3
  from server.app import app
4
 
5
 
@@ -33,5 +36,23 @@ class TestServerAPI(unittest.TestCase):
33
  self.assertIn("task_id", payload)
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  if __name__ == "__main__":
37
  unittest.main()
 
1
  import unittest
2
 
3
+ from fastapi.testclient import TestClient
4
+
5
+ from app import app as fastapi_app
6
  from server.app import app
7
 
8
 
 
36
  self.assertIn("task_id", payload)
37
 
38
 
39
+ class TestFastAPIReset(unittest.TestCase):
40
+ def setUp(self):
41
+ self.client = TestClient(fastapi_app)
42
+
43
+ def test_post_reset_without_body(self):
44
+ response = self.client.post("/reset")
45
+ self.assertEqual(response.status_code, 200)
46
+ payload = response.json()
47
+ self.assertIn("observation", payload)
48
+ self.assertIn("task_description", payload["observation"])
49
+
50
+ def test_post_reset_with_task_id_body(self):
51
+ response = self.client.post("/reset", json={"task_id": "bug_detection_easy_1"})
52
+ self.assertEqual(response.status_code, 200)
53
+ payload = response.json()
54
+ self.assertEqual(payload["observation"]["task_difficulty"], "easy")
55
+
56
+
57
  if __name__ == "__main__":
58
  unittest.main()