Spaces:
Sleeping
Sleeping
| """ | |
inference.py — Bug Triage Env
OpenEnv Hackathon submission inference script.

Environment variables:
    API_BASE_URL   LiteLLM proxy base URL (injected by validator)
    HF_TOKEN       API key (injected by validator)
    ENV_BASE_URL   Bug Triage env URL (optional)
    MODEL_NAME     Model identifier (optional)
| """ | |
| import os | |
| import json | |
| import time | |
| import textwrap | |
| import requests | |
| from typing import List, Optional | |
| from openai import OpenAI | |
| from model import TriageAction, TriageObservation, BugReport | |
# ---------------------------------------------------------------------------
# CONFIG — uses env vars required by hackathon spec
# ---------------------------------------------------------------------------
# `os.getenv(...) or default` (rather than a getenv default argument) makes an
# empty-string variable fall back to the default as well.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
# The first non-empty value of HF_TOKEN, API_KEY, OPENAI_API_KEY is used.
API_KEY = (
    os.getenv("HF_TOKEN")
    or os.getenv("API_KEY")
    or os.getenv("OPENAI_API_KEY")
)
MODEL_NAME = os.getenv("MODEL_NAME") or "meta-llama/Llama-3.3-70B-Instruct"
ENV_BASE_URL = (
    os.getenv("ENV_BASE_URL") or "https://siteshcodes-bug-triage-env.hf.space"
)

# Fail fast at import time when no credential is available.
if not API_KEY:
    raise RuntimeError(
        "HF_TOKEN is not set (API_KEY and OPENAI_API_KEY are also accepted)"
    )

TASK_IDS = ["easy", "medium", "hard"]
BENCHMARK = "bug-triage-env"
TEMPERATURE = 0.0
MAX_TOKENS = 500
MAX_STEPS = 4  # Max steps per task (investigate + submit)
MAX_TOTAL_REWARD = 1.0
SUCCESS_SCORE_THRESHOLD = 0.4

print(f"[CONFIG] API_BASE_URL={API_BASE_URL}", flush=True)
print(f"[CONFIG] MODEL_NAME={MODEL_NAME}", flush=True)
print(f"[CONFIG] ENV_BASE_URL={ENV_BASE_URL}", flush=True)
# Only the presence of the key is reported, never its value.
print(f"[CONFIG] API_KEY={'set' if API_KEY else 'MISSING'}", flush=True)
# ---------------------------------------------------------------------------
# INLINED CLIENT — self-contained, no external dependency
# ---------------------------------------------------------------------------
def _parse_observation(data: dict) -> TriageObservation:
    """Build a ``TriageObservation`` from a decoded JSON observation payload.

    Args:
        data: Mapping that must contain a ``"bug_report"`` entry. Every other
            field is optional and falls back to the default used below.

    Returns:
        The populated observation.

    Raises:
        KeyError: If ``data`` has no ``"bug_report"`` entry.
    """
    raw_bug = data["bug_report"]
    try:
        bug = BugReport.model_validate(raw_bug)
    except AttributeError:
        # This BugReport class has no ``model_validate`` (presumably an older
        # pydantic API — TODO confirm); build it from keyword arguments.
        bug = BugReport(**raw_bug)
    return TriageObservation(
        bug_report=bug,
        task_id=data.get("task_id", "easy"),
        score=data.get("score", 0.0),
        feedback=data.get("feedback", ""),
        done=data.get("done", False),
        reward=data.get("reward", 0.0),
        body_visible=data.get("body_visible", False),
        comments_visible=data.get("comments_visible", False),
        logs_visible=data.get("logs_visible", False),
        similar_visible=data.get("similar_visible", False),
        steps_taken=data.get("steps_taken", 0),
        max_steps=data.get("max_steps", 6),
    )
class StepResult:
    """Outcome of one environment step, as returned by ``BugTriageClient.step``.

    All four values are stored unchanged as plain instance attributes:
    ``observation``, ``reward``, ``done`` and ``info``.
    """

    def __init__(self, observation: TriageObservation, reward: float,
                 done: bool, info: dict):
        self.observation = observation
        self.reward = reward
        self.done = done
        self.info = info
class BugTriageClient:
    """HTTP client for the Bug Triage environment.

    Wraps the ``/reset`` and ``/step`` endpoints with one shared
    ``requests.Session``. Whatever ``session_id`` the server returns is
    remembered and sent along with later requests. Instances can be used as
    context managers; leaving the ``with`` block closes the HTTP session.
    """

    _REQUEST_TIMEOUT = 30  # seconds, applied to every POST
    # Bounds applied to the reward of a finished episode.
    _FINAL_REWARD_RANGE = (0.01, 0.99)

    def __init__(self, base_url: Optional[str] = None):
        """Create the client.

        Args:
            base_url: Environment root URL; falls back to ``ENV_BASE_URL``
                when empty or ``None``. Trailing slashes are removed.
        """
        self.base_url = (base_url or ENV_BASE_URL).rstrip("/")
        self.session = requests.Session()
        self.session.headers.update({"Content-Type": "application/json"})
        self._session_id: Optional[str] = None

    @staticmethod
    def _final_reward(raw_reward, done: bool) -> float:
        """Convert a raw reward to ``float``, clamping it once the episode is done.

        ``None`` counts as ``0.0``. When ``done`` is true the value is limited
        to ``_FINAL_REWARD_RANGE``; otherwise it is returned as converted.
        """
        reward = 0.0 if raw_reward is None else float(raw_reward)
        if done:
            low, high = BugTriageClient._FINAL_REWARD_RANGE
            reward = max(low, min(high, reward))
        return reward

    def _post(self, endpoint: str, payload: dict) -> dict:
        """POST ``payload`` (plus the known session id) and return the JSON reply.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        if self._session_id:
            payload["session_id"] = self._session_id
        response = self.session.post(
            f"{self.base_url}/{endpoint}",
            json=payload,
            timeout=self._REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def reset(self, task_id: str = "easy") -> TriageObservation:
        """Start a new episode for ``task_id`` and return its first observation."""
        print(f"[ENV] Resetting env for task={task_id}", flush=True)
        data = self._post("reset", {"task_id": task_id})
        self._session_id = data.get("session_id")
        # The observation may be nested under "observation" or be the reply itself.
        return _parse_observation(data.get("observation", data))

    def step(self, action: TriageAction) -> StepResult:
        """Send ``action`` to the environment and return the resulting step.

        The reward is taken from the reply envelope, falling back to the
        observation's own reward. It is clamped whenever either the envelope
        or the observation marks the episode as done, so the clamp always
        agrees with the ``done`` flag handed back to the caller.
        """
        print(f"[ENV] Sending step: action_type={action.action_type}", flush=True)
        try:
            action_dict = action.model_dump()
        except AttributeError:
            # Action object without ``model_dump``; use its ``dict()`` method.
            action_dict = action.dict()
        data = self._post("step", {"action": action_dict})
        obs = _parse_observation(data.get("observation", data))
        done = bool(data.get("done", obs.done))
        reward = self._final_reward(
            data.get("reward", obs.reward), done or bool(obs.done)
        )
        if "session_id" in data:
            self._session_id = data["session_id"]
        info = data.get("info")
        return StepResult(
            observation=obs,
            reward=reward,
            done=done,
            info=info if isinstance(info, dict) else {},
        )

    def close(self):
        """Close the underlying HTTP session."""
        self.session.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
# ---------------------------------------------------------------------------
# LLM PROMPTS
# ---------------------------------------------------------------------------
# System prompt for the final triage request. ``dedent`` removes the uniform
# source indentation and ``strip`` drops the leading/trailing newlines.
SYSTEM_PROMPT = textwrap.dedent("""
    You are a senior software engineering manager triaging a bug report.
    You will receive a bug report (possibly with partial information).
    Respond ONLY with valid JSON — no markdown, no explanation, no backticks.
    Return exactly this structure:
    {
    "priority": "P0",
    "labels": ["bug"],
    "assigned_team": "backend",
    "milestone": "hotfix",
    "reasoning": "one sentence explaining your decision"
    }
    Priority guide:
    P0 — production down, data loss, security vulnerability, 100% user impact
    P1 — major feature broken, significant user impact, no workaround
    P2 — degraded experience, workaround exists
    P3 — minor, cosmetic, docs, low impact
    Teams: backend | frontend | infra | security | devx
    Milestones: hotfix | v2.1 | backlog
    Important: Pay attention to security signals (SQL injection, XSS, auth bypass,
    data exposure). Security bugs should almost always be P0 + security team + hotfix.
""").strip()
# System prompt for the "investigate or submit" decision request.
INVESTIGATION_PROMPT = textwrap.dedent("""
    You are deciding whether to investigate further or submit your triage.
    You have seen the following information about a bug. Based on what you see,
    decide if you need more information or can triage now.
    Respond with ONLY one of these JSON formats:
    To investigate: {"action": "read_body"} or {"action": "read_comments"} or {"action": "check_logs"}
    To submit:
    {
    "action": "submit",
    "priority": "P0",
    "labels": ["bug"],
    "assigned_team": "backend",
    "milestone": "hotfix",
    "reasoning": "explanation"
    }
    Only investigate if the title and preview are genuinely ambiguous.
    If the bug is clearly a typo or clearly critical, submit immediately.
""").strip()
# ---------------------------------------------------------------------------
# STRUCTURED LOGGING — strict [START]/[STEP]/[END] format
# ---------------------------------------------------------------------------
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` line that opens the log of one task."""
    print(f"[START] task={task} env={env} model={model}", flush=True)
def log_step(step: int, action: str, reward: float, done: bool,
             error: Optional[str] = None) -> None:
    """Print one ``[STEP]`` line.

    The reward is formatted with two decimals, ``done`` is written in lower
    case (``true``/``false``) and a missing or empty ``error`` becomes
    ``null``.
    """
    print(
        f"[STEP] step={step} action={action} "
        f"reward={reward:.2f} done={str(done).lower()} error={error or 'null'}",
        flush=True,
    )
def log_end(success: bool, steps: int, score: float,
            rewards: List[float]) -> None:
    """Print the ``[END]`` line that closes the log of one task.

    ``rewards`` is written as a comma-separated list with two decimals per
    value; an empty list yields an empty ``rewards=`` field.
    """
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.2f} rewards={rewards_str}",
        flush=True,
    )
# ---------------------------------------------------------------------------
# BUG FORMATTING
# ---------------------------------------------------------------------------
def format_bug(obs: TriageObservation) -> str:
    """Format a bug observation into text the LLM can read.

    Title and description are always included. Comments, log-derived details
    and related bugs are added only when the matching ``*_visible`` flag is
    set and the field is non-empty. Sections that are still hidden are listed
    in a ``[Hidden info: ...]`` line, followed by the step counter.
    """
    bug = obs.bug_report
    parts = [f"Title: {bug.title}"]
    parts.append(f"\nDescription:\n{bug.body}")
    if obs.comments_visible and bug.comments:
        comments = "\n".join(f" - {c}" for c in bug.comments)
        parts.append(f"\nComments:\n{comments}")
    if bug.labels_hint:
        parts.append(f"\nExisting labels: {', '.join(bug.labels_hint)}")
    # NOTE(review): the source lost its indentation; all three log-derived
    # fields are assumed to be gated on ``logs_visible`` — confirm.
    if obs.logs_visible:
        if bug.stack_trace:
            parts.append(f"\nStack trace: {bug.stack_trace}")
        if bug.affected_component:
            parts.append(f"\nAffected component: {bug.affected_component}")
        if bug.severity_signals:
            parts.append(
                f"\nSeverity signals: {', '.join(bug.severity_signals)}"
            )
    if obs.similar_visible and bug.related_bugs:
        parts.append(f"\nRelated bugs: {', '.join(bug.related_bugs)}")

    # Tell the model which sections it has not seen yet.
    visibility = []
    if not obs.body_visible:
        visibility.append("body (truncated)")
    if not obs.comments_visible:
        visibility.append("comments (hidden)")
    if not obs.logs_visible:
        visibility.append("logs (hidden)")
    if visibility:
        parts.append(f"\n[Hidden info: {', '.join(visibility)}]")

    parts.append(f"\nSteps used: {obs.steps_taken}/{obs.max_steps}")
    return "\n".join(parts)
def format_bug_for_decision(obs: TriageObservation) -> str:
    """Shorter format for the investigation decision.

    Contains the title, the first 150 characters of the body, notes about
    what is already visible and the number of steps left. A missing
    (``None``) body is treated as empty text.
    """
    bug = obs.bug_report
    preview = (bug.body or "")[:150]
    text = f"Title: {bug.title}\nPreview: {preview}"
    if obs.body_visible:
        text += "\n\nFull body visible."
    if obs.comments_visible and bug.comments:
        text += f"\nComments: {len(bug.comments)} visible."
    text += f"\nSteps remaining: {obs.max_steps - obs.steps_taken}"
    return text
# ---------------------------------------------------------------------------
# MODEL CALLS
# ---------------------------------------------------------------------------
def _parse_json_object(raw: str) -> dict:
    """Extract the JSON object contained in a model reply.

    A surrounding Markdown code fence (with an optional ``json`` tag) is
    removed first. If the remaining text is not valid JSON as a whole, the
    span from the first ``{`` to the last ``}`` is tried instead.

    Raises:
        ValueError: If no JSON can be decoded (``json.JSONDecodeError`` is a
            ``ValueError``) or the decoded value is not an object.
    """
    text = raw.strip()
    if text.startswith("```"):
        # Element 0 is the empty text before the opening fence.
        text = text.split("```")[1].strip()
        if text[:4].lower() == "json":
            text = text[4:].strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError(
            f"expected a JSON object, got {type(parsed).__name__}"
        )
    return parsed


def decide_action(client: OpenAI, obs: TriageObservation) -> dict:
    """Ask the LLM whether to investigate or submit.

    Returns the decoded JSON object from the model. Any failure (request
    error, unparsable reply, non-object JSON) is logged and answered with
    ``{"action": "submit"}``, so this function never raises ``Exception``.
    """
    bug_text = format_bug_for_decision(obs)
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": INVESTIGATION_PROMPT},
                {"role": "user", "content": bug_text},
            ],
            temperature=TEMPERATURE,
            max_tokens=200,
            stream=False,
        )
        raw = (completion.choices[0].message.content or "").strip()
        return _parse_json_object(raw)
    except Exception as e:
        # Deliberate best effort: fall back to submitting right away.
        print(f"[DEBUG] Decision model call failed: {e}", flush=True)
        return {"action": "submit"}


def call_model(client: OpenAI, bug_text: str) -> TriageAction:
    """Ask the LLM to triage the bug report.

    An unparsable or non-object reply is logged and replaced by defaults
    (P2 / ``["bug"]`` / backend / backlog / empty reasoning). Fields that are
    missing or ``null`` in the reply get the same defaults, and a single
    string given for ``labels`` is wrapped in a list. Errors raised by the
    request itself are not caught here.
    """
    print("[LLM] Sending triage request to model...", flush=True)
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": bug_text},
        ],
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        stream=False,
    )
    raw = (completion.choices[0].message.content or "").strip()
    print(f"[LLM] Raw response: {raw[:200]}", flush=True)
    try:
        data = _parse_json_object(raw)
    except ValueError as e:
        print(f"[LLM] JSON parse failed: {e}. Using defaults.", flush=True)
        data = {}

    defaults = {
        "priority": "P2",
        "labels": ["bug"],
        "assigned_team": "backend",
        "milestone": "backlog",
        "reasoning": "",
    }
    # An explicit JSON null is treated like a missing key.
    fields = {
        key: default if data.get(key) is None else data[key]
        for key, default in defaults.items()
    }
    if isinstance(fields["labels"], str):
        fields["labels"] = [fields["labels"]]

    action = TriageAction(action_type="submit", **fields)
    print(
        f"[LLM] Parsed: priority={action.priority} "
        f"team={action.assigned_team} milestone={action.milestone}",
        flush=True,
    )
    return action
# ---------------------------------------------------------------------------
# MAIN — multi-step agent with per-task [START]/[STEP]/[END] logging
# ---------------------------------------------------------------------------
# Investigation performed on a given step number while the matching
# observation flag is still false: step -> (observation flag, action_type).
_SCRIPTED_INVESTIGATIONS = {
    1: ("body_visible", "read_body"),
    2: ("comments_visible", "read_comments"),
}


def _clamp_score(value: float) -> float:
    """Limit ``value`` to the range [0.01, 0.99]."""
    return min(max(value, 0.01), 0.99)


def _run_task(client: OpenAI, env: BugTriageClient, task_id: str) -> float:
    """Run one task episode, emit its [START]/[STEP]/[END] lines, return the score."""
    rewards: List[float] = []
    score = 0.0
    success = False
    steps_taken = 0
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
    try:
        obs = env.reset(task_id=task_id)
        for step_num in range(1, MAX_STEPS + 1):
            if obs.done:
                break

            scripted = _SCRIPTED_INVESTIGATIONS.get(step_num)
            if scripted is not None and not getattr(obs, scripted[0]):
                action_type = scripted[1]
                result = env.step(TriageAction(action_type=action_type))
                obs = result.observation
                steps_taken = step_num
                # Investigation steps are always logged with reward 0.0.
                log_step(
                    step=step_num,
                    action=f"investigate:{action_type}",
                    reward=0.0,
                    done=result.done,
                )
                if result.done:
                    rewards.append(result.reward)
                    break
                continue

            # Nothing left to investigate on this step: submit the triage.
            action = call_model(client, format_bug(obs))
            result = env.step(action)
            obs = result.observation
            steps_taken = step_num
            reward = float(result.reward or 0.0)
            if result.done:
                reward = _clamp_score(reward)
            rewards.append(reward)
            log_step(
                step=step_num,
                action=(
                    f"priority={action.priority},"
                    f"team={action.assigned_team},"
                    f"milestone={action.milestone}"
                ),
                reward=reward,
                done=result.done,
            )
            if result.done:
                break

        score = _clamp_score(
            sum(rewards) / MAX_TOTAL_REWARD if rewards else 0.0
        )
        success = score >= SUCCESS_SCORE_THRESHOLD
    except Exception as exc:
        # A failed task is still reported with an [END] line instead of
        # aborting the remaining tasks.
        print(f"[ERROR] {type(exc).__name__}: {exc}", flush=True)
        score = _clamp_score(
            sum(rewards) / MAX_TOTAL_REWARD if rewards else 0.05
        )
        success = False
    log_end(success, steps_taken, score, rewards)
    return score


def main() -> None:
    """Run every task in ``TASK_IDS`` against the environment and print a summary.

    For each task the agent reads the body on step 1 and the comments on
    step 2 when they are not yet visible, then asks the model for a triage
    and submits it. Tasks are separated by a 0.5 second pause.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    all_scores = []
    with BugTriageClient(base_url=ENV_BASE_URL) as env:
        for task_id in TASK_IDS:
            all_scores.append(_run_task(client, env, task_id))
            time.sleep(0.5)

    avg_score = sum(all_scores) / len(all_scores) if all_scores else 0.0
    print(
        f"[SUMMARY] tasks={len(all_scores)} avg_score={avg_score:.2f} "
        f"scores={all_scores}",
        flush=True,
    )


if __name__ == "__main__":
    main()