# Source: Hugging Face Space "Siteshcodes/bug-triage-env" (Space status: Sleeping)
# File size: 16,997 Bytes

"""
inference.py — Bug Triage Env
OpenEnv Hackathon submission inference script.

Required env vars:
    API_BASE_URL   LiteLLM proxy base URL (injected by validator)
    HF_TOKEN       API key (injected by validator)
    ENV_BASE_URL   Bug Triage env URL (optional)
    MODEL_NAME     Model identifier (optional)
"""

import os
import json
import time
import textwrap
import requests
from typing import List, Optional

from openai import OpenAI
from model import TriageAction, TriageObservation, BugReport


# ---------------------------------------------------------------------------
#  CONFIG — uses env vars required by hackathon spec
# ---------------------------------------------------------------------------

# Each setting is read from the environment; the `or` fallback also applies
# when the variable is set but empty.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
# HF_TOKEN takes precedence; API_KEY and OPENAI_API_KEY are tried next.
API_KEY      = os.getenv("HF_TOKEN") or os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY")
MODEL_NAME   = os.getenv("MODEL_NAME") or "meta-llama/Llama-3.3-70B-Instruct"
ENV_BASE_URL = os.getenv("ENV_BASE_URL") or "https://siteshcodes-bug-triage-env.hf.space"

# Fail fast at import time when none of the three credential variables is set.
if not API_KEY:
    raise RuntimeError("HF_TOKEN is not set")

# Tasks are run in this order by main().
TASK_IDS                = ["easy", "medium", "hard"]
# Value reported as `env=` on every [START] line.
BENCHMARK               = "bug-triage-env"
# Sampling temperature passed to both chat-completion calls.
TEMPERATURE             = 0.0
# Completion cap for the triage call; decide_action() uses its own limit of 200.
MAX_TOKENS              = 500
MAX_STEPS               = 4       # Max steps per task (investigate + submit)
# Divisor that turns the summed per-step rewards into a task score.
MAX_TOTAL_REWARD        = 1.0
# A task counts as a success when its clamped score reaches this value.
SUCCESS_SCORE_THRESHOLD = 0.4

# Echo the effective configuration; the key itself is never printed.
print(f"[CONFIG] API_BASE_URL={API_BASE_URL}", flush=True)
print(f"[CONFIG] MODEL_NAME={MODEL_NAME}", flush=True)
print(f"[CONFIG] ENV_BASE_URL={ENV_BASE_URL}", flush=True)
# The guard above guarantees API_KEY is truthy here, so this reports 'set'.
print(f"[CONFIG] API_KEY={'set' if API_KEY else 'MISSING'}", flush=True)


# ---------------------------------------------------------------------------
#  INLINED CLIENT — self-contained, no external dependency
# ---------------------------------------------------------------------------

def _parse_observation(data: dict) -> TriageObservation:
    """Build a ``TriageObservation`` from a decoded observation payload.

    Args:
        data: Mapping that must contain a ``"bug_report"`` entry; every other
            recognised key is optional and falls back to the default listed
            below.

    Returns:
        The populated ``TriageObservation``.

    Raises:
        KeyError: If ``data`` has no ``"bug_report"`` entry.
    """
    try:
        report = BugReport.model_validate(data["bug_report"])
    except Exception:
        # Any failure of model_validate falls back to plain keyword
        # construction (presumably for Pydantic versions that lack
        # model_validate — verify against the `model` module).
        report = BugReport(**data["bug_report"])

    # Fallback value used when the payload omits the corresponding key.
    fallbacks = {
        "task_id": "easy",
        "score": 0.0,
        "feedback": "",
        "done": False,
        "reward": 0.0,
        "body_visible": False,
        "comments_visible": False,
        "logs_visible": False,
        "similar_visible": False,
        "steps_taken": 0,
        "max_steps": 6,
    }
    observed = {key: data.get(key, default) for key, default in fallbacks.items()}
    return TriageObservation(bug_report=report, **observed)


class StepResult:
    """Plain container for the outcome of one environment step.

    Attributes are stored exactly as passed in:
        observation: Observation parsed from the step response.
        reward: Reward reported for the step.
        done: Whether the step response marked the episode as finished.
        info: Extra step metadata.
    """

    def __init__(self, observation: TriageObservation, reward: float,
                 done: bool, info: dict):
        self.observation, self.reward = observation, reward
        self.done, self.info = done, info


class BugTriageClient:
    """Small HTTP client for the Bug Triage environment's /reset and /step.

    The client keeps one ``requests.Session`` and remembers the ``session_id``
    returned by the server so that later calls can send it back. It can be
    used as a context manager; leaving the ``with`` block closes the session.
    """

    def __init__(self, base_url: Optional[str] = None):
        """Create the client.

        Args:
            base_url: Environment root URL; falls back to ``ENV_BASE_URL``
                when omitted or empty. A trailing slash is removed.
        """
        self.base_url = (base_url or ENV_BASE_URL).rstrip("/")
        self.session = requests.Session()
        self.session.headers.update({"Content-Type": "application/json"})
        self._session_id: Optional[str] = None

    def _post(self, path: str, payload: dict) -> dict:
        """POST ``payload`` as JSON to ``path`` and return the decoded body.

        The remembered session id, if any, is added to the payload first.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        if self._session_id:
            payload["session_id"] = self._session_id
        response = self.session.post(
            f"{self.base_url}{path}", json=payload, timeout=30,
        )
        response.raise_for_status()
        return response.json()

    def reset(self, task_id: str = "easy") -> TriageObservation:
        """Start a new episode for ``task_id`` and return its first observation.

        The stored session id is replaced by whatever the response carries
        (``None`` when the response has no ``session_id``).
        """
        print(f"[ENV] Resetting env for task={task_id}", flush=True)
        data = self._post("/reset", {"task_id": task_id})
        self._session_id = data.get("session_id")
        # The observation may be nested under "observation" or be the body itself.
        return _parse_observation(data.get("observation", data))

    def step(self, action: TriageAction) -> StepResult:
        """Send ``action`` to the environment and return the step outcome.

        Returns:
            A ``StepResult`` whose ``reward`` is a float (``0.0`` when the
            server sends none) clamped to ``[0.01, 0.99]`` on terminal steps,
            whose ``done`` is the top-level flag when present (otherwise the
            observation's), and whose ``info`` is the server's ``info``
            object when it is a dict (otherwise ``{}``).

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        print(f"[ENV] Sending step: action_type={action.action_type}", flush=True)
        try:
            action_dict = action.model_dump()
        except AttributeError:
            # Objects without model_dump() are serialised through .dict().
            action_dict = action.dict()

        data = self._post("/step", {"action": action_dict})
        obs = _parse_observation(data.get("observation", data))

        # Resolve the terminal flag once so the clamp below and the returned
        # result agree even when only the top-level body carries "done".
        done = data.get("done", obs.done)

        reward = data.get("reward", obs.reward)
        if reward is None:
            reward = 0.0
        reward = float(reward)
        if done or obs.done:
            reward = max(0.01, min(0.99, reward))

        if "session_id" in data:
            self._session_id = data["session_id"]

        # Forward server metadata instead of discarding it; anything that is
        # not a dict is replaced by an empty one.
        info = data.get("info")
        if not isinstance(info, dict):
            info = {}

        return StepResult(observation=obs, reward=reward, done=done, info=info)

    def close(self):
        """Close the underlying HTTP session."""
        self.session.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()


# ---------------------------------------------------------------------------
#  LLM PROMPTS
# ---------------------------------------------------------------------------

# System message for the triage request sent by call_model(). The JSON keys it
# asks for (priority, labels, assigned_team, milestone, reasoning) are the ones
# call_model() reads back when building the submit action. dedent() + strip()
# remove the literal's indentation and its leading/trailing blank lines.
SYSTEM_PROMPT = textwrap.dedent("""
    You are a senior software engineering manager triaging a bug report.
    You will receive a bug report (possibly with partial information).
    Respond ONLY with valid JSON — no markdown, no explanation, no backticks.

    Return exactly this structure:
    {
      "priority": "P0",
      "labels": ["bug"],
      "assigned_team": "backend",
      "milestone": "hotfix",
      "reasoning": "one sentence explaining your decision"
    }

    Priority guide:
      P0 — production down, data loss, security vulnerability, 100% user impact
      P1 — major feature broken, significant user impact, no workaround
      P2 — degraded experience, workaround exists
      P3 — minor, cosmetic, docs, low impact

    Teams: backend | frontend | infra | security | devx
    Milestones: hotfix | v2.1 | backlog

    Important: Pay attention to security signals (SQL injection, XSS, auth bypass,
    data exposure). Security bugs should almost always be P0 + security team + hotfix.
""").strip()

# System message used by decide_action() to ask the model whether to request
# more information or submit a triage right away.
# NOTE(review): main() in this file never calls decide_action(), so this prompt
# looks unused at runtime — confirm before relying on it.
INVESTIGATION_PROMPT = textwrap.dedent("""
    You are deciding whether to investigate further or submit your triage.
    You have seen the following information about a bug. Based on what you see,
    decide if you need more information or can triage now.

    Respond with ONLY one of these JSON formats:

    To investigate: {"action": "read_body"} or {"action": "read_comments"} or {"action": "check_logs"}
    To submit:
    {
      "action": "submit",
      "priority": "P0",
      "labels": ["bug"],
      "assigned_team": "backend",
      "milestone": "hotfix",
      "reasoning": "explanation"
    }

    Only investigate if the title and preview are genuinely ambiguous.
    If the bug is clearly a typo or clearly critical, submit immediately.
""").strip()


# ---------------------------------------------------------------------------
#  STRUCTURED LOGGING — strict [START]/[STEP]/[END] format
# ---------------------------------------------------------------------------

def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` marker line for one task run and flush stdout."""
    marker = f"[START] task={task} env={env} model={model}"
    print(marker, flush=True)


def log_step(step: int, action: str, reward: float, done: bool,
             error: Optional[str] = None) -> None:
    """Print one ``[STEP]`` line and flush stdout.

    The reward is shown with two decimals, ``done`` as lowercase
    ``true``/``false``, and a missing or empty ``error`` as ``null``.
    """
    fields = [
        f"step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error or 'null'}",
    ]
    print("[STEP] " + " ".join(fields), flush=True)


def log_end(success: bool, steps: int, score: float,
            rewards: List[float]) -> None:
    """Print the ``[END]`` summary line for one task run and flush stdout.

    ``score`` and every entry of ``rewards`` are shown with two decimals;
    rewards are comma-separated with no spaces (empty when there are none).
    """
    formatted_rewards = [f"{value:.2f}" for value in rewards]
    fields = [
        f"success={str(success).lower()}",
        f"steps={steps}",
        f"score={score:.2f}",
        "rewards=" + ",".join(formatted_rewards),
    ]
    print("[END] " + " ".join(fields), flush=True)


# ---------------------------------------------------------------------------
#  BUG FORMATTING
# ---------------------------------------------------------------------------

def format_bug(obs: TriageObservation) -> str:
    """Render an observation as the text shown to the triage model.

    The title and body are always included. Comments, log-derived fields and
    related bugs are included only when the matching ``*_visible`` flag is set
    and the field is non-empty; label hints are included whenever present.
    A ``[Hidden info: ...]`` line lists what is still concealed, and the final
    line reports how many steps have been used.
    """
    report = obs.bug_report
    sections = [
        f"Title: {report.title}",
        f"\nDescription:\n{report.body}",
    ]

    if obs.comments_visible and report.comments:
        bullet_lines = [f"  - {c}" for c in report.comments]
        sections.append("\nComments:\n" + "\n".join(bullet_lines))

    if report.labels_hint:
        sections.append(f"\nExisting labels: {', '.join(report.labels_hint)}")

    if obs.logs_visible:
        # These three fields are only shown once logs are visible.
        if report.stack_trace:
            sections.append(f"\nStack trace: {report.stack_trace}")
        if report.affected_component:
            sections.append(f"\nAffected component: {report.affected_component}")
        if report.severity_signals:
            sections.append(
                f"\nSeverity signals: {', '.join(report.severity_signals)}"
            )

    if obs.similar_visible and report.related_bugs:
        sections.append(f"\nRelated bugs: {', '.join(report.related_bugs)}")

    # Tell the model which parts of the report it has not seen yet.
    concealed = [
        label
        for is_visible, label in (
            (obs.body_visible, "body (truncated)"),
            (obs.comments_visible, "comments (hidden)"),
            (obs.logs_visible, "logs (hidden)"),
        )
        if not is_visible
    ]
    if concealed:
        sections.append(f"\n[Hidden info: {', '.join(concealed)}]")

    sections.append(f"\nSteps used: {obs.steps_taken}/{obs.max_steps}")
    return "\n".join(sections)


def format_bug_for_decision(obs: TriageObservation) -> str:
    """Render a compact summary for the investigate-or-submit decision.

    Contains the title, the first 150 characters of the body, a note when the
    full body is visible, the number of visible comments (if any), and the
    number of steps remaining.
    """
    report = obs.bug_report
    pieces = [f"Title: {report.title}\nPreview: {report.body[:150]}"]

    if obs.body_visible:
        pieces.append("\n\nFull body visible.")
    if obs.comments_visible and report.comments:
        pieces.append(f"\nComments: {len(report.comments)} visible.")

    pieces.append(f"\nSteps remaining: {obs.max_steps - obs.steps_taken}")
    return "".join(pieces)


# ---------------------------------------------------------------------------
#  MODEL CALLS
# ---------------------------------------------------------------------------

def decide_action(client: OpenAI, obs: TriageObservation) -> dict:
    """Ask the LLM whether to investigate further or submit a triage.

    Args:
        client: OpenAI-compatible client used for the chat completion.
        obs: Current observation; only its compact summary is sent.

    Returns:
        The JSON object returned by the model. When the call fails, the reply
        is not valid JSON, or the reply is valid JSON but not an object, the
        fallback ``{"action": "submit"}`` is returned instead.
    """
    bug_text = format_bug_for_decision(obs)

    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": INVESTIGATION_PROMPT},
                {"role": "user", "content": bug_text},
            ],
            temperature=TEMPERATURE,
            max_tokens=200,
            stream=False,
        )
        raw = (completion.choices[0].message.content or "").strip()
        # Models sometimes wrap the JSON in a ``` fence despite the prompt;
        # keep only the fenced content and drop a leading "json" tag.
        if raw.startswith("```"):
            parts = raw.split("```")
            raw = parts[1] if len(parts) > 1 else raw
            if raw.startswith("json"):
                raw = raw[4:].strip()
        decision = json.loads(raw)
    except Exception as e:
        # Best effort: any failure degrades to submitting straight away.
        print(f"[DEBUG] Decision model call failed: {e}", flush=True)
        return {"action": "submit"}

    # json.loads may yield a list, string or number; callers are promised a dict.
    if not isinstance(decision, dict):
        print(
            f"[DEBUG] Decision model returned non-object JSON: {raw[:200]}",
            flush=True,
        )
        return {"action": "submit"}
    return decision


def call_model(client: OpenAI, bug_text: str) -> TriageAction:
    """Ask the LLM to triage the bug report and build the submit action.

    Args:
        client: OpenAI-compatible client used for the chat completion.
        bug_text: Formatted bug report sent as the user message.

    Returns:
        A ``TriageAction`` with ``action_type="submit"``. Any field that the
        model omits or sets to JSON ``null`` takes its default (P2, ["bug"],
        backend, backlog, empty reasoning); all defaults are used when the
        reply is not a JSON object.

    Raises:
        Exception: Errors raised by the chat-completion call itself are not
            caught here and propagate to the caller.
    """
    print("[LLM] Sending triage request to model...", flush=True)

    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": bug_text},
        ],
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        stream=False,
    )

    raw = (completion.choices[0].message.content or "").strip()
    print(f"[LLM] Raw response: {raw[:200]}", flush=True)

    # Models sometimes wrap the JSON in a ``` fence despite the prompt;
    # keep only the fenced content and drop a leading "json" tag.
    if raw.startswith("```"):
        parts = raw.split("```")
        raw = parts[1] if len(parts) > 1 else raw
        if raw.startswith("json"):
            raw = raw[4:].strip()

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        print(f"[LLM] JSON parse failed: {e}. Using defaults.", flush=True)
        data = {}

    # Valid JSON that is not an object (list, string, number) has no fields
    # to read; without this guard data.get() would raise AttributeError.
    if not isinstance(data, dict):
        print("[LLM] Response JSON is not an object. Using defaults.", flush=True)
        data = {}

    # Built per call so the mutable ["bug"] default is never shared.
    defaults = {
        "priority": "P2",
        "labels": ["bug"],
        "assigned_team": "backend",
        "milestone": "backlog",
        "reasoning": "",
    }
    # An explicit JSON null is treated like a missing key.
    fields = {
        key: (default if data.get(key) is None else data[key])
        for key, default in defaults.items()
    }

    action = TriageAction(action_type="submit", **fields)

    print(
        f"[LLM] Parsed: priority={action.priority} "
        f"team={action.assigned_team} milestone={action.milestone}",
        flush=True,
    )
    return action


# ---------------------------------------------------------------------------
#  MAIN — multi-step agent with per-task [START]/[STEP]/[END] logging
# ---------------------------------------------------------------------------

def main() -> None:
    """Run every task in ``TASK_IDS`` once and print the structured logs.

    For each task the agent resets the environment, reads the body on step 1
    and the comments on step 2 when they are not yet visible, then asks the
    model for a triage and submits it. Each task emits one ``[START]`` line,
    one ``[STEP]`` line per environment step and one ``[END]`` line; a final
    ``[SUMMARY]`` line reports the average score. An exception inside a task
    is logged and ends that task only.
    """
    llm = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    all_scores = []

    # Scripted investigation: step number -> (visibility flag that makes the
    # step unnecessary, env action type, action label written to the log).
    scripted_steps = {
        1: ("body_visible", "read_body", "investigate:read_body"),
        2: ("comments_visible", "read_comments", "investigate:read_comments"),
    }

    with BugTriageClient(base_url=ENV_BASE_URL) as env:
        for task_id in TASK_IDS:
            rewards: List[float] = []
            score = 0.0
            success = False
            steps_taken = 0

            log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

            try:
                obs = env.reset(task_id=task_id)

                for step_num in range(1, MAX_STEPS + 1):
                    if obs.done:
                        break

                    plan = scripted_steps.get(step_num)
                    if plan is not None and not getattr(obs, plan[0]):
                        _, env_action, log_label = plan
                        result = env.step(TriageAction(action_type=env_action))
                        obs = result.observation
                        steps_taken = step_num

                        # NOTE(review): investigation steps are always logged
                        # with reward=0.0, even when the env ends the episode
                        # and result.reward is recorded below — confirm.
                        log_step(
                            step=step_num,
                            action=log_label,
                            reward=0.0,
                            done=result.done,
                        )

                        if result.done:
                            rewards.append(result.reward)
                            break
                        continue

                    # Enough context gathered: ask the model and submit.
                    action = call_model(llm, format_bug(obs))
                    result = env.step(action)
                    obs = result.observation
                    steps_taken = step_num

                    reward = float(result.reward or 0.0)
                    if result.done:
                        # Terminal rewards are kept strictly inside (0, 1).
                        reward = max(0.01, min(0.99, reward))
                    rewards.append(reward)

                    log_step(
                        step=step_num,
                        action=(
                            f"priority={action.priority},"
                            f"team={action.assigned_team},"
                            f"milestone={action.milestone}"
                        ),
                        reward=reward,
                        done=result.done,
                    )

                    if result.done:
                        break

                # Normalise the summed rewards and clamp into [0.01, 0.99].
                total = sum(rewards) / MAX_TOTAL_REWARD if rewards else 0.0
                score = min(max(total, 0.01), 0.99)
                success = score >= SUCCESS_SCORE_THRESHOLD

            except Exception as exc:
                # A failed task keeps whatever rewards it collected (0.05 when
                # it collected none) and is always reported as unsuccessful.
                print(f"[ERROR] {type(exc).__name__}: {exc}", flush=True)
                partial = sum(rewards) / MAX_TOTAL_REWARD if rewards else 0.05
                score = min(max(partial, 0.01), 0.99)
                success = False

            log_end(success, steps_taken, score, rewards)
            all_scores.append(score)

            time.sleep(0.5)

    if all_scores:
        avg_score = sum(all_scores) / len(all_scores)
    else:
        avg_score = 0.0
    print(
        f"[SUMMARY] tasks={len(all_scores)} avg_score={avg_score:.2f} "
        f"scores={all_scores}",
        flush=True,
    )


# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()