diff --git "a/Roadmap.html" "b/Roadmap.html" deleted file mode 100644--- "a/Roadmap.html" +++ /dev/null @@ -1,2140 +0,0 @@ - - -
- - -- A production-grade OpenEnv environment where AI agents act as senior code reviewers inside a software organization. Built from your actual architecture patterns — MoE routing, DAG orchestration, event-driven agents, Kafka messaging. Every design decision traced back to your repos. -
- -- A simulation of the QA/Reviewer agent role inside your Autonomous-Multi-Agent-AI-Organization. The agent under evaluation receives a synthetic pull request — exactly like what the Backend Engineer agent would produce — and must act as a senior code reviewer: identifying bugs, security vulnerabilities, and architectural problems, then deciding whether to approve or request changes. -
-- Every software company does code review. It's a bottleneck — senior engineers spend 3–5 hours per week reviewing code. An agent that can reliably catch 80% of security vulnerabilities and architectural problems before human review would be immediately deployable. This is the exact use case companies pay for. The graders are deterministic (no LLM calls) and reproducible. The difficulty progression from "spot the null dereference" to "evaluate service-level architectural tradeoffs" is genuine — it mirrors how human engineers progress from junior to senior. -
-- Most agent environments test information retrieval, math reasoning, or game-playing. Code review requires multi-step reasoning over structured text with domain-specific knowledge, precise output format requirements, severity calibration, and a terminal verdict — a unique combination that no existing OpenEnv environment covers. -
-Every piece of data that flows through the system lives here. Zero imports from logic files. This forces you to finalize the contract before anything depends on it — the same discipline your multi-agent repo applied to Kafka message schemas.
# critical: Action validator
@model_validator(mode='after')
def validate_flag_issue(self):
    """Reject actions that lack the fields their action type requires.

    A ``flag_issue`` action must carry both ``severity`` and ``category``;
    an ``approve`` / ``request_changes`` action must carry ``verdict``.

    Returns:
        The validated model instance itself.

    Raises:
        ValueError: If a required field for the action type is missing/empty.
    """
    kind = self.action_type

    # Guard 1: issue flags need both classification fields.
    if kind == ActionType.FLAG_ISSUE and not (self.severity and self.category):
        raise ValueError("flag_issue requires severity and category")

    # Guard 2: terminal actions need an explicit verdict.
    is_terminal = kind in (ActionType.APPROVE, ActionType.REQUEST_CHANGES)
    if is_terminal and not self.verdict:
        raise ValueError("approve/request_changes requires verdict")

    return self
This is what makes the environment actually useful for benchmarking. Inspired by how your orchestrator generates task configs from a project idea — here you generate synthetic PR scenarios from a seed. Every scenario must be realistic enough that a human engineer would recognize it as a real PR.
def get_scenario(task_id: TaskId, seed: int) -> Scenario:
    """Pick one scenario for ``task_id``, reproducibly for a given ``seed``.

    A private ``random.Random`` seeded with ``seed`` draws a uniform index
    into the scenario bank for the task, so the same ``(task_id, seed)`` pair
    always yields the same scenario and the global RNG is left untouched.

    Raises:
        KeyError: If ``task_id`` has no entry in ``SCENARIOS``.
        ValueError: If the bank for ``task_id`` is empty.
    """
    seeded = random.Random(seed)
    candidates = SCENARIOS[task_id]  # list of Scenario
    # optionally shuffle distractor issues using the seeded generator
    return candidates[seeded.randint(0, len(candidates) - 1)]
The environment class holds all episode state. Borrowing the pattern from your DAG orchestrator — state transitions are explicit and every transition is logged to history. The done condition mirrors your agent task lifecycle: episodes end on a terminal action (approve / request_changes), when max_steps is reached, or when the noise budget is exhausted.
class CodeReviewEnv:
    """Holds the state of one code-review episode and advances it step by step.

    ``reset`` starts an episode for a task/seed pair; ``step`` applies one
    agent action and reports the reward and whether the episode has ended.
    NOTE(review): ``_build_obs`` is referenced but not part of this excerpt —
    it is assumed to be defined elsewhere on the class.
    """

    # Per-task cap on the number of actions in one episode.
    TASK_MAX_STEPS = {
        "bug_detection": 10,
        "security_audit": 15,
        "architectural_review": 20,
    }

    def reset(self, task_id: str, seed: int = 42) -> ResetResult:
        """Begin a fresh episode and return its initial observation.

        Args:
            task_id: Key into ``TASK_MAX_STEPS`` (and the scenario bank).
            seed: Seed forwarded to ``get_scenario`` for reproducibility.

        Raises:
            KeyError: If ``task_id`` is not a known task.
        """
        scenario = get_scenario(task_id, seed)
        self._state = EpisodeState(
            task_id=task_id, seed=seed, scenario=scenario,
            step_count=0, noise_budget=5,
            max_steps=self.TASK_MAX_STEPS[task_id],
            actions_taken=[], running_score=0.0, done=False,
        )
        return ResetResult(
            observation=self._build_obs(),
            task_id=task_id, seed=seed,
            scenario_hash=scenario.hash
        )

    def step(self, action: Action) -> StepResult:
        """Apply one action to the current episode.

        The episode ends when the action is APPROVE or REQUEST_CHANGES, when
        ``step_count`` reaches ``max_steps``, or when ``noise_budget`` drops
        to zero or below.

        Raises:
            RuntimeError: If no episode has been started with ``reset()``, or
                if the current episode is already done.
        """
        # Calling step() before reset() used to surface as an AttributeError
        # on ``_state``; report it as the same kind of error as a finished
        # episode so callers get one actionable message.
        s = getattr(self, "_state", None)
        if s is None:
            raise RuntimeError("no active episode, call reset()")
        if s.done:
            raise RuntimeError("episode is done, call reset()")
        s.step_count += 1
        s.actions_taken.append(action)
        reward = _apply_action(s, action)  # calls grader
        s.done = (
            action.action_type in (APPROVE, REQUEST_CHANGES)
            or s.step_count >= s.max_steps
            or s.noise_budget <= 0
        )
        return StepResult(
            observation=self._build_obs(),
            reward=reward, done=s.done,
            info={"step": s.step_count, "score": s.running_score,
                  "noise_budget": s.noise_budget}
        )
Graders are pure functions — same input always produces same output. They never call an LLM. Inspired by your Rust MoE scorer which assigns confidence values to expert routing decisions: here each matched issue gets a confidence_score based on keyword overlap between the agent's body text and the ground truth keyword list.
def grade_bug(actions: list[Action], ground_truth: list[GroundTruthIssue]) -> float:
    """Score a bug-detection episode as ``0.7 * recall + 0.3 * precision``.

    Only FLAG_ISSUE actions are graded. A flag that matches a not-yet-claimed
    ground-truth issue earns partial credit equal to its keyword-overlap
    confidence; a flag that matches nothing counts as one false positive.

    Returns:
        The score clamped to [0.0, 1.0] and rounded to 4 decimal places.
    """
    credit = 0          # confidence-weighted true positives
    false_alarms = 0
    claimed_ids = set()

    for flag in (a for a in actions if a.action_type == FLAG_ISSUE):
        issue = find_best_match(flag, ground_truth, claimed_ids)
        if not issue:
            false_alarms += 1
            continue
        # partial credit on confidence
        credit += keyword_overlap(flag.body, issue.keywords)
        claimed_ids.add(issue.id)

    if ground_truth:
        recall = credit / len(ground_truth)
    else:
        recall = 0

    graded = credit + false_alarms
    if graded > 0:
        precision = credit / graded
    else:
        precision = 0

    raw_score = 0.7 * recall + 0.3 * precision
    clamped = min(1.0, max(0.0, raw_score))
    return round(clamped, 4)
def keyword_overlap(body: str, keywords: list[str]) -> float:
    """Return a 0.0–1.0 confidence score based on keyword coverage.

    Counts how many ``keywords`` occur (case-insensitively, as substrings) in
    ``body`` and divides by ``max(4, 0.6 * len(keywords))``, capped at 1.0.
    Because of the floor of 4, a keyword list shorter than four entries can
    never reach full confidence.

    Returns 0.5 when ``body`` or ``keywords`` is empty/None.
    NOTE(review): an empty body therefore scores higher than a non-empty body
    that hits no keyword (0.0) — confirm this incentive is intended.
    """
    if not body or not keywords:
        return 0.5  # missing body = half credit
    body_lower = body.lower()
    hits = sum(1 for kw in keywords if kw.lower() in body_lower)
    return min(1.0, hits / max(4, len(keywords) * 0.6))


def find_best_match(action, ground_truth, already_matched, line_tolerance=3):
    """Return the unmatched ground-truth issue that best fits ``action``.

    An issue qualifies when either
      * the action's line number is within ``line_tolerance`` lines of the
        issue's line and the two do not name different files, or
      * the action's category and filename both equal the issue's.

    Among qualifying issues, line matches beat category-only matches, a
    smaller line distance beats a larger one, and remaining ties go to the
    issue that appears first in ``ground_truth``.

    Args:
        action: Object with ``line_number``, ``category`` and ``filename``.
        ground_truth: Iterable of issues with ``id``, ``line_number``,
            ``category`` and ``filename``.
        already_matched: Container of issue ids that must be skipped.
        line_tolerance: Maximum absolute line distance for a line match.

    Returns:
        The best qualifying issue, or ``None`` when nothing qualifies.
    """
    best = None
    best_rank = None
    for gt in ground_truth:
        if gt.id in already_matched:
            continue

        same_file = action.filename == gt.filename
        # Two different, explicitly named files can never be the same issue,
        # however close the line numbers happen to be.
        files_conflict = bool(action.filename) and bool(gt.filename) and not same_file

        distance = None
        if action.line_number and gt.line_number is not None and not files_conflict:
            distance = abs(action.line_number - gt.line_number)

        line_match = distance is not None and distance <= line_tolerance
        cat_match = action.category == gt.category and same_file
        if not (line_match or cat_match):
            continue

        # Lower tuples are better; strict "<" keeps the earliest issue on ties.
        rank = (
            0 if line_match else 1,
            distance if line_match else 0,
            0 if cat_match else 1,
        )
        if best_rank is None or rank < best_rank:
            best, best_rank = gt, rank
    return best
Directly inspired by your Go gateway pattern (Fiber HTTP in the multi-agent repo) — the API layer does nothing except serialize and deserialize. All logic lives in env.py. Single global env instance per process. No session management needed for the hackathon.
@app.post("/reset")
async def reset_env(req: ResetRequest) -> ResetResult:
    """Start a new episode on the shared env from the request's task and seed."""
    return env.reset(req.task_id, req.seed)


@app.post("/step")
async def step_env(action: Action) -> StepResult:
    """Apply one action to the shared env, publish the result, and return it."""
    result = env.step(action)
    await broadcast_event(result)  # → /ws/events
    return result


@app.websocket("/ws/events")
async def ws_events(ws: WebSocket):
    """Register a websocket client until it disconnects.

    The socket is added to ``clients`` after the handshake and always removed
    again on exit. Incoming text frames are read and discarded; reading is
    what lets the server notice that the peer has gone away.
    """
    # Local import: the file's own import block is outside this excerpt.
    from fastapi import WebSocketDisconnect

    await ws.accept()
    clients.add(ws)
    try:
        while True:
            await ws.receive_text()
    except WebSocketDisconnect:
        # A client closing the connection is the normal way out of the loop,
        # not an application error; cleanup happens in ``finally``.
        pass
    finally:
        clients.discard(ws)
FROM python:3.11-slim
WORKDIR /app
# Install dependencies first so this layer is cached across source changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 7860
# python:3.11-slim does not ship curl, so a curl-based probe would always
# fail and mark the container unhealthy. Probe /health with the Python
# standard library instead; urlopen raises (non-zero exit) on connection
# errors and on 4xx/5xx responses.
HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health', timeout=5)" || exit 1
CMD ["uvicorn", "app:app", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--workers", "1"]
fastapi==0.110.0 -uvicorn[standard]==0.27.0 -pydantic>=2.0 -websockets==12.0
---
-title: AgentOrg CodeReview Env
-emoji: 🔍
-colorFrom: purple
-colorTo: teal
-sdk: docker
-pinned: false
-tags:
- - openenv
- - code-review
- - agent-evaluation
- - reinforcement-learning
----Directly mirrors your run_demo.py. A naive agent that uses only keyword matching — no LLM, no reasoning. Its low scores anchor the scale: a strong LLM agent should score 3–5× higher. Run with --url to point at your live HF Space for judge verification.
# scripts/baseline.py
def run_episode(url: str, task_id: str, seed: int, timeout: float = 30.0) -> float:
    """Play one episode with the keyword-only baseline and return its reward.

    The baseline resets the environment, flags every ``KEYWORD_RULES`` pattern
    found in the diff, then submits a ``request_changes`` verdict.

    Args:
        url: Base URL of the environment server (no trailing slash).
        task_id: Task to run.
        seed: Scenario seed sent to ``/reset``.
        timeout: Per-request timeout in seconds, so a stalled server cannot
            hang the run indefinitely.

    Returns:
        The ``reward`` field of the last step response of the episode.

    Raises:
        requests.HTTPError: If the server answers any request with 4xx/5xx.
    """
    def post(path: str, payload: dict) -> dict:
        # Fail loudly on HTTP errors instead of indexing into an error body.
        response = requests.post(f"{url}{path}", json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()

    obs = post("/reset", {"task_id": task_id, "seed": seed})["observation"]
    diff = obs["diff"]

    for pattern, cat, sev in KEYWORD_RULES:
        if not re.search(pattern, diff, re.IGNORECASE):
            continue
        result = post("/step", {
            "action_type": "flag_issue",
            "category": cat, "severity": sev,
            "body": f"Detected {cat} pattern: {pattern}",
        })
        # The environment can end the episode early (step limit or noise
        # budget); any further /step call would be rejected, so stop here.
        if result["done"]:
            return result["reward"]

    final = post("/step", {
        "action_type": "request_changes",
        "verdict": "REQUEST_CHANGES",
        "body": "Baseline review complete",
    })
    return final["reward"]
# ── TASK 1: Bug Detection ────────────────────────────────────────────── -score = (0.7 × recall) + (0.3 × precision) -recall = true_positives / total_ground_truth_bugs -precision = true_positives / (true_positives + false_positives) -# confidence modifier per match: keyword_overlap(agent.body, gt.keywords) - -# ── TASK 2: Security Audit ───────────────────────────────────────────── -weights = {critical: 0.40, high: 0.25, medium: 0.15, low: 0.05} -score = Σ(weight[sev] × confidence) / max_possible -penalty = false_critical_alarms × 0.15 # dangerous false alarm = heavy cost -score = max(0.0, score - penalty) - -# ── TASK 3: Architectural Review ─────────────────────────────────────── -issue_score = 0.60 × (weighted issue detection, same as task 2) -verdict_score = 0.20 × (1 if verdict == required_verdict else 0) -quality_score = 0.20 × (proportion of correctly flagged issues where len(body) > 80) -score = issue_score + verdict_score + quality_score - -# ── NOISE BUDGET (all tasks) ─────────────────────────────────────────── -noise_budget starts at 5 -each false_positive flag_issue: budget -= 1 -budget == 0: done = True, score = current running_score (no further credit)-
| Task | -Naive Baseline | -Expected Strong LLM | -Score Spread | -
|---|---|---|---|
| bug_detection | -- - | -- - | -0.03 – 0.35 | -
| security_audit | -- - | -- - | -0.10 – 0.25 | -
| architectural_review | -- - | -- - | -0.04 – 0.15 | -