fix: resolve critical startup bugs, port mismatch, and leaderboard crash
Browse files
- Fix hardcoded port 8000 → 7860 in scripts/baseline.py
- Add proper argparse CLI with --url, --task, --seed flags
- Fix leaderboard rank calculation crashing after list slicing
- Fix WebSocket disconnect to catch WebSocketDisconnect and use discard()
- Fix incoherent grading weights in openenv.yaml
- Add .env.example with documented environment variables
- Add missing package init files for codereview_env and graders
- .DS_Store +0 -0
- .env.example +17 -0
- app.py +7 -5
- codereview_env/__init__.py +1 -1
- codereview_env/graders/__init__.py +1 -0
- files/.DS_Store +0 -0
- openenv.yaml +6 -9
- scripts/baseline.py +18 -3
.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
.env.example
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AgentOrg CodeReview — Environment Variables
|
| 2 |
+
# Copy this file to .env and fill in your values.
|
| 3 |
+
|
| 4 |
+
# API Configuration
|
| 5 |
+
APP_HOST=0.0.0.0
|
| 6 |
+
APP_PORT=7860
|
| 7 |
+
APP_ENV=development # development | production
|
| 8 |
+
|
| 9 |
+
# Security
|
| 10 |
+
API_KEY=changeme # Required in production; sent as X-API-Key header
|
| 11 |
+
API_KEY_ENABLED=false # Set to true in production
|
| 12 |
+
|
| 13 |
+
# Leaderboard
|
| 14 |
+
LEADERBOARD_MAX_ENTRIES=10 # Top-N entries to keep per task
|
| 15 |
+
|
| 16 |
+
# Logging
|
| 17 |
+
LOG_LEVEL=INFO # DEBUG | INFO | WARNING | ERROR
|
app.py
CHANGED
|
@@ -87,11 +87,13 @@ def get_leaderboard():
|
|
| 87 |
@app.post("/submit")
|
| 88 |
def submit_to_leaderboard(submission: SubmitScore):
|
| 89 |
entries = leaderboard.get(submission.task_id, [])
|
| 90 |
-
|
| 91 |
-
|
| 92 |
entries.sort(key=lambda x: x["score"], reverse=True)
|
|
|
|
| 93 |
leaderboard[submission.task_id] = entries[:5]
|
| 94 |
-
|
|
|
|
| 95 |
|
| 96 |
@app.websocket("/ws/events")
|
| 97 |
async def websocket_endpoint(websocket: WebSocket):
|
|
@@ -100,10 +102,10 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 100 |
try:
|
| 101 |
while True:
|
| 102 |
await websocket.receive_text()
|
| 103 |
-
except
|
| 104 |
pass
|
| 105 |
finally:
|
| 106 |
-
clients.
|
| 107 |
|
| 108 |
if __name__ == "__main__":
|
| 109 |
import uvicorn
|
|
|
|
| 87 |
@app.post("/submit")
def submit_to_leaderboard(submission: SubmitScore):
    """Record a submission on its task's leaderboard and report its rank.

    The submission is dumped to a dict, appended to the entries stored for
    ``submission.task_id``, and the entries are re-sorted by descending
    ``score``. Only the five best entries are kept afterwards.

    Returns:
        ``{"status": "submitted", "rank": <1-based rank>}`` when the entry
        ranks within the top five, otherwise the same dict with ``rank``
        set to ``None``.
    """
    task_id = submission.task_id
    board = leaderboard.get(task_id, [])
    record = submission.model_dump()
    board.append(record)
    board.sort(key=lambda row: row["score"], reverse=True)

    # The position must be read from the full sorted list: once the list is
    # cut down to five entries, a low-scoring record may no longer be in it.
    # list.index matches by equality, so an identical earlier entry yields
    # that entry's position.
    position = board.index(record) + 1
    leaderboard[task_id] = board[:5]

    if position > 5:
        return {"status": "submitted", "rank": None}
    return {"status": "submitted", "rank": position}
|
| 97 |
|
| 98 |
@app.websocket("/ws/events")
|
| 99 |
async def websocket_endpoint(websocket: WebSocket):
|
|
|
|
| 102 |
try:
|
| 103 |
while True:
|
| 104 |
await websocket.receive_text()
|
| 105 |
+
except WebSocketDisconnect:
|
| 106 |
pass
|
| 107 |
finally:
|
| 108 |
+
clients.discard(websocket)
|
| 109 |
|
| 110 |
if __name__ == "__main__":
|
| 111 |
import uvicorn
|
codereview_env/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
"""AgentOrg CodeReview Environment package."""
|
codereview_env/graders/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Grader modules for each task type."""
|
files/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
openenv.yaml
CHANGED
|
@@ -16,12 +16,9 @@ tasks:
|
|
| 16 |
|
| 17 |
grading:
|
| 18 |
type: "deterministic"
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
weight: 0.7
|
| 26 |
-
- name: "keyword_accuracy"
|
| 27 |
-
weight: 0.3
|
|
|
|
| 16 |
|
| 17 |
grading:
|
| 18 |
type: "deterministic"
|
| 19 |
+
issue_matching:
|
| 20 |
+
coverage_weight: 0.4
|
| 21 |
+
precision_weight: 0.6
|
| 22 |
+
quality_scoring:
|
| 23 |
+
severity_weight: 0.7
|
| 24 |
+
keyword_weight: 0.3
|
|
|
|
|
|
|
|
|
scripts/baseline.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import requests
|
| 2 |
from codereview_env.models import TaskId, ActionType, Category, Severity, Verdict
|
| 3 |
|
| 4 |
-
API_URL = "http://localhost:
|
| 5 |
|
| 6 |
def run_baseline(task_id: TaskId, seed: int = 42):
|
| 7 |
# 1. Reset
|
|
@@ -67,8 +67,23 @@ def run_baseline(task_id: TaskId, seed: int = 42):
|
|
| 67 |
print(f"Final Score: {result_resp.json()['final_score']}")
|
| 68 |
|
| 69 |
if __name__ == "__main__":
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
try:
|
| 72 |
-
run_baseline(
|
| 73 |
except Exception as e:
|
| 74 |
print(f"Baseline failed (is the API running?): {e}")
|
|
|
|
| 1 |
import requests
|
| 2 |
from codereview_env.models import TaskId, ActionType, Category, Severity, Verdict
|
| 3 |
|
| 4 |
+
API_URL = "http://localhost:7860"
|
| 5 |
|
| 6 |
def run_baseline(task_id: TaskId, seed: int = 42):
|
| 7 |
# 1. Reset
|
|
|
|
| 67 |
print(f"Final Score: {result_resp.json()['final_score']}")
|
| 68 |
|
| 69 |
if __name__ == "__main__":
    # Command-line entry point: parse --url / --task / --seed, run a single
    # baseline episode, and exit with a non-zero status if it fails.
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Run the baseline agent against the CodeReview API.")
    parser.add_argument("--url", default="http://localhost:7860", help="Base URL of the running API (default: http://localhost:7860)")
    parser.add_argument("--task", default="bug_detection", help="Task ID to run (default: bug_detection)")
    parser.add_argument("--seed", type=int, default=0, help="Random seed (default: 0)")
    args = parser.parse_args()

    # Override module-level API_URL with CLI argument. This block executes at
    # module scope, so the assignment rebinds the global itself.
    # NOTE(review): assumes run_baseline reads API_URL at call time — its body
    # is not visible here; confirm.
    API_URL = args.url

    # Map string task id to TaskId enum
    task_map = {t.value: t for t in TaskId}
    if args.task not in task_map:
        # parser.error prints the usage text and exits with status 2.
        parser.error(f"Unknown task '{args.task}'. Choose from: {list(task_map.keys())}")

    try:
        run_baseline(task_map[args.task], seed=args.seed)
    except Exception as e:
        # Top-level boundary: report the failure on stderr and signal it via
        # the exit status so shells and CI can detect an unsuccessful run
        # (previously the script printed the message and still exited 0).
        print(f"Baseline failed (is the API running?): {e}", file=sys.stderr)
        sys.exit(1)
|