File size: 16,997 Bytes
9e050fb
 
 
 
 
cc1335d
e8f71a0
25b9a21
 
9e050fb
 
 
 
 
 
20f2ce3
9e050fb
 
 
20f2ce3
9e050fb
25b9a21
703aa57
 
 
 
e8f71a0
 
 
 
cc1335d
ab6f40e
e8f71a0
ab6f40e
25b9a21
 
 
703aa57
 
 
9e050fb
 
cc1335d
 
 
 
 
703aa57
 
 
 
20f2ce3
 
ff292ff
 
 
 
20f2ce3
 
 
703aa57
20f2ce3
 
703aa57
 
 
 
 
 
 
20f2ce3
 
 
 
703aa57
 
20f2ce3
 
 
 
 
 
 
 
 
 
 
703aa57
20f2ce3
 
 
703aa57
 
 
 
20f2ce3
703aa57
20f2ce3
 
 
703aa57
20f2ce3
 
 
703aa57
ff292ff
e8f71a0
ff292ff
e8f71a0
703aa57
 
 
 
 
20f2ce3
703aa57
20f2ce3
 
 
 
703aa57
6174aa3
703aa57
 
6174aa3
703aa57
 
 
 
 
 
20f2ce3
703aa57
 
20f2ce3
 
 
 
 
 
 
 
 
 
 
 
703aa57
 
 
cc1335d
9e050fb
703aa57
 
 
9e050fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
703aa57
 
 
9e050fb
 
703aa57
 
 
 
9e050fb
703aa57
4ba1053
703aa57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25b9a21
e8f71a0
 
9e050fb
 
703aa57
 
9e050fb
e8f71a0
25b9a21
9e050fb
 
 
 
703aa57
 
9e050fb
 
25b9a21
e8f71a0
9e050fb
 
 
 
703aa57
 
 
9e050fb
20f2ce3
703aa57
9e050fb
703aa57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e050fb
 
 
703aa57
 
cc1335d
e48bc80
 
 
 
 
 
 
 
 
 
cc1335d
e48bc80
cc1335d
 
e48bc80
cc1335d
 
e48bc80
cc1335d
 
ff292ff
 
 
e8f71a0
ff292ff
cc1335d
 
703aa57
e48bc80
 
 
 
 
 
9e050fb
cc1335d
e8f71a0
cc1335d
 
 
 
 
9e050fb
703aa57
 
 
9e050fb
 
7eb0325
6174aa3
7eb0325
9e050fb
7eb0325
 
 
 
 
 
703aa57
7eb0325
 
 
 
703aa57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7eb0325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
703aa57
 
 
 
 
25b9a21
9e050fb
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
"""
inference.py — Bug Triage Env
OpenEnv Hackathon submission inference script.

Required env vars:
    API_BASE_URL   LiteLLM proxy base URL (injected by validator)
    HF_TOKEN       API key (injected by validator)
    ENV_BASE_URL   Bug Triage env URL (optional)
    MODEL_NAME     Model identifier (optional)
"""

import os
import json
import time
import textwrap
import requests
from typing import List, Optional

from openai import OpenAI
from model import TriageAction, TriageObservation, BugReport


# ---------------------------------------------------------------------------
#  CONFIG — uses env vars required by hackathon spec
# ---------------------------------------------------------------------------

# Endpoints, credentials and model name. Each value is read from the
# environment and falls back to the literal default when the variable is
# unset or empty.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
# The key is the first non-empty of HF_TOKEN, API_KEY and OPENAI_API_KEY.
API_KEY      = os.getenv("HF_TOKEN") or os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY")
MODEL_NAME   = os.getenv("MODEL_NAME") or "meta-llama/Llama-3.3-70B-Instruct"
ENV_BASE_URL = os.getenv("ENV_BASE_URL") or "https://siteshcodes-bug-triage-env.hf.space"

# Fail fast at import time when no key was found. NOTE: the message names only
# HF_TOKEN although API_KEY and OPENAI_API_KEY are accepted above as well.
if not API_KEY:
    raise RuntimeError("HF_TOKEN is not set")

TASK_IDS                = ["easy", "medium", "hard"]  # tasks run in this order by main()
BENCHMARK               = "bug-triage-env"            # value of env= in the [START] line
TEMPERATURE             = 0.0                         # sampling temperature for every LLM call
MAX_TOKENS              = 500                         # completion budget for the triage call
MAX_STEPS               = 4       # Max steps per task (investigate + submit)
MAX_TOTAL_REWARD        = 1.0                         # divisor applied to the summed rewards
SUCCESS_SCORE_THRESHOLD = 0.4                         # minimum score reported as success=true

# Echo the effective configuration. Only whether a key exists is printed,
# never the key itself; 'MISSING' cannot occur here because of the raise above.
print(f"[CONFIG] API_BASE_URL={API_BASE_URL}", flush=True)
print(f"[CONFIG] MODEL_NAME={MODEL_NAME}", flush=True)
print(f"[CONFIG] ENV_BASE_URL={ENV_BASE_URL}", flush=True)
print(f"[CONFIG] API_KEY={'set' if API_KEY else 'MISSING'}", flush=True)


# ---------------------------------------------------------------------------
#  INLINED CLIENT — self-contained, no external dependency
# ---------------------------------------------------------------------------

def _parse_observation(data: dict) -> TriageObservation:
    """Build a ``TriageObservation`` from a decoded JSON payload.

    Args:
        data: Observation dictionary. ``"bug_report"`` is required; every
            other key falls back to a default when it is absent.

    Returns:
        The populated observation.

    Raises:
        KeyError: If ``data`` has no ``"bug_report"`` entry.
        Exception: Whatever ``BugReport`` raises for an invalid report. The
            error is propagated as-is instead of being retried and masked.
    """
    raw_bug = data["bug_report"]
    try:
        bug = BugReport.model_validate(raw_bug)
    except AttributeError:
        # ``model_validate`` is unavailable on this BugReport class
        # (presumably an older pydantic release -- TODO confirm), so build
        # the report from keyword arguments instead. Only this case falls
        # back; genuine validation failures are no longer swallowed.
        bug = BugReport(**raw_bug)

    return TriageObservation(
        bug_report=bug,
        task_id=data.get("task_id", "easy"),
        score=data.get("score", 0.0),
        feedback=data.get("feedback", ""),
        done=data.get("done", False),
        reward=data.get("reward", 0.0),
        body_visible=data.get("body_visible", False),
        comments_visible=data.get("comments_visible", False),
        logs_visible=data.get("logs_visible", False),
        similar_visible=data.get("similar_visible", False),
        steps_taken=data.get("steps_taken", 0),
        max_steps=data.get("max_steps", 6),
    )


class StepResult:
    """Plain container for the outcome of one environment step.

    Attributes:
        observation: Observation attached to the step.
        reward: Reward attached to the step.
        done: Completion flag attached to the step.
        info: Auxiliary dictionary attached to the step.
    """

    def __init__(self, observation: TriageObservation, reward: float,
                 done: bool, info: dict):
        # Store the four constructor arguments unchanged.
        (self.observation, self.reward,
         self.done, self.info) = (observation, reward, done, info)


class BugTriageClient:
    """HTTP client for the Bug Triage environment's ``/reset`` and ``/step``.

    One ``requests.Session`` is reused for every call. The session id handed
    back by the server is remembered and sent with later requests. The client
    can be used as a context manager, which closes the HTTP session on exit.
    """

    def __init__(self, base_url: Optional[str] = None):
        """Create the client.

        Args:
            base_url: Root URL of the environment; ``ENV_BASE_URL`` is used
                when omitted or empty. Trailing slashes are removed.
        """
        self.base_url = (base_url or ENV_BASE_URL).rstrip("/")
        self.session = requests.Session()
        self.session.headers.update({"Content-Type": "application/json"})
        self._session_id: Optional[str] = None

    def reset(self, task_id: str = "easy") -> TriageObservation:
        """Start a new episode for ``task_id`` and return its first observation.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        print(f"[ENV] Resetting env for task={task_id}", flush=True)
        payload = {"task_id": task_id}
        if self._session_id:
            payload["session_id"] = self._session_id

        response = self.session.post(
            f"{self.base_url}/reset", json=payload, timeout=30,
        )
        response.raise_for_status()
        data = response.json()
        # Keep the id we already hold when the response does not carry one,
        # the same way step() does, instead of silently dropping it.
        self._session_id = data.get("session_id", self._session_id)
        # The observation may be nested under "observation" or be the payload.
        return _parse_observation(data.get("observation", data))

    def step(self, action: TriageAction) -> StepResult:
        """Send ``action`` to the environment and return the step outcome.

        Top-level ``reward`` and ``done`` in the response take precedence over
        the values inside the observation. The reward of a finished episode
        is clamped to ``[0.01, 0.99]``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        print(f"[ENV] Sending step: action_type={action.action_type}", flush=True)
        try:
            action_dict = action.model_dump()
        except AttributeError:
            # No ``model_dump`` on this action object; use ``dict()`` instead.
            action_dict = action.dict()

        payload = {"action": action_dict}
        if self._session_id:
            payload["session_id"] = self._session_id

        response = self.session.post(
            f"{self.base_url}/step", json=payload, timeout=30,
        )
        response.raise_for_status()
        data = response.json()
        obs = _parse_observation(data.get("observation", data))

        done = data.get("done", obs.done)

        reward = data.get("reward", obs.reward)
        reward = 0.0 if reward is None else float(reward)
        # Clamp whenever either source marks the episode finished, so the
        # returned ``done`` flag and the clamping can no longer disagree.
        if done or obs.done:
            reward = max(0.01, min(0.99, reward))

        if "session_id" in data:
            self._session_id = data["session_id"]

        # Forward the server's info mapping when it sent one.
        info = data.get("info")
        if not isinstance(info, dict):
            info = {}

        return StepResult(
            observation=obs, reward=reward,
            done=done, info=info,
        )

    def close(self):
        """Close the underlying HTTP session."""
        self.session.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()


# ---------------------------------------------------------------------------
#  LLM PROMPTS
# ---------------------------------------------------------------------------

# System prompt for the final triage call (see call_model). The model must
# answer with a single JSON object holding priority, labels, assigned_team,
# milestone and reasoning.
SYSTEM_PROMPT = textwrap.dedent("""
    You are a senior software engineering manager triaging a bug report.
    You will receive a bug report (possibly with partial information).
    Respond ONLY with valid JSON — no markdown, no explanation, no backticks.

    Return exactly this structure:
    {
      "priority": "P0",
      "labels": ["bug"],
      "assigned_team": "backend",
      "milestone": "hotfix",
      "reasoning": "one sentence explaining your decision"
    }

    Priority guide:
      P0 — production down, data loss, security vulnerability, 100% user impact
      P1 — major feature broken, significant user impact, no workaround
      P2 — degraded experience, workaround exists
      P3 — minor, cosmetic, docs, low impact

    Teams: backend | frontend | infra | security | devx
    Milestones: hotfix | v2.1 | backlog

    Important: Pay attention to security signals (SQL injection, XSS, auth bypass,
    data exposure). Security bugs should almost always be P0 + security team + hotfix.
""").strip()

# System prompt for the investigate-or-submit decision (see decide_action).
# The model answers either with an investigation action or a full submission.
INVESTIGATION_PROMPT = textwrap.dedent("""
    You are deciding whether to investigate further or submit your triage.
    You have seen the following information about a bug. Based on what you see,
    decide if you need more information or can triage now.

    Respond with ONLY one of these JSON formats:

    To investigate: {"action": "read_body"} or {"action": "read_comments"} or {"action": "check_logs"}
    To submit:
    {
      "action": "submit",
      "priority": "P0",
      "labels": ["bug"],
      "assigned_team": "backend",
      "milestone": "hotfix",
      "reasoning": "explanation"
    }

    Only investigate if the title and preview are genuinely ambiguous.
    If the bug is clearly a typo or clearly critical, submit immediately.
""").strip()


# ---------------------------------------------------------------------------
#  STRUCTURED LOGGING — strict [START]/[STEP]/[END] format
# ---------------------------------------------------------------------------

def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` line that opens a task run.

    Args:
        task: Task identifier.
        env: Benchmark/environment name.
        model: Model identifier.
    """
    line = f"[START] task={task} env={env} model={model}"
    print(line, flush=True)


def log_step(step: int, action: str, reward: float, done: bool,
             error: Optional[str] = None) -> None:
    """Print one ``[STEP]`` line.

    Args:
        step: Step number within the task.
        action: Text describing the action taken.
        reward: Reward for the step, printed with two decimals.
        done: Completion flag, printed in lowercase.
        error: Error text; ``null`` is printed when it is missing or empty.
    """
    done_text = str(done).lower()
    error_text = error if error else "null"
    line = (
        f"[STEP] step={step} action={action} "
        f"reward={reward:.2f} done={done_text} error={error_text}"
    )
    print(line, flush=True)


def log_end(success: bool, steps: int, score: float,
            rewards: List[float]) -> None:
    """Print the ``[END]`` line that closes a task run.

    Args:
        success: Success flag, printed in lowercase.
        steps: Number of steps taken.
        score: Final score, printed with two decimals.
        rewards: Collected rewards, printed comma-separated with two
            decimals each (an empty list prints nothing after ``rewards=``).
    """
    rewards_csv = ",".join(format(value, ".2f") for value in rewards)
    success_text = str(success).lower()
    line = (
        f"[END] success={success_text} steps={steps} "
        f"score={score:.2f} rewards={rewards_csv}"
    )
    print(line, flush=True)


# ---------------------------------------------------------------------------
#  BUG FORMATTING
# ---------------------------------------------------------------------------

def format_bug(obs: TriageObservation) -> str:
    """Render an observation as the plain-text report handed to the model.

    The title and description are always included. Comments, log details and
    related bugs are added only when their visibility flag is set and the
    field is non-empty; label hints are added whenever present. A note naming
    what is still hidden (if anything) and a step counter close the report.
    """
    report = obs.bug_report
    sections = [
        f"Title: {report.title}",
        f"\nDescription:\n{report.body}",
    ]

    if obs.comments_visible and report.comments:
        bullets = "\n".join(f"  - {entry}" for entry in report.comments)
        sections.append(f"\nComments:\n{bullets}")

    if report.labels_hint:
        hints = ", ".join(report.labels_hint)
        sections.append(f"\nExisting labels: {hints}")

    if obs.logs_visible:
        if report.stack_trace:
            sections.append(f"\nStack trace: {report.stack_trace}")
        if report.affected_component:
            sections.append(f"\nAffected component: {report.affected_component}")
        if report.severity_signals:
            signals = ", ".join(report.severity_signals)
            sections.append(f"\nSeverity signals: {signals}")

    if obs.similar_visible and report.related_bugs:
        related = ", ".join(report.related_bugs)
        sections.append(f"\nRelated bugs: {related}")

    # Tell the model which parts of the report it has not been shown yet.
    visibility_flags = (
        (obs.body_visible, "body (truncated)"),
        (obs.comments_visible, "comments (hidden)"),
        (obs.logs_visible, "logs (hidden)"),
    )
    hidden = [label for shown, label in visibility_flags if not shown]
    if hidden:
        hidden_text = ", ".join(hidden)
        sections.append(f"\n[Hidden info: {hidden_text}]")

    sections.append(f"\nSteps used: {obs.steps_taken}/{obs.max_steps}")

    return "\n".join(sections)


def format_bug_for_decision(obs: TriageObservation) -> str:
    """Build the short summary used for the investigate-or-submit decision.

    Contains the title, the first 150 characters of the body, notes on
    whether the full body and any comments are visible, and the number of
    steps remaining.
    """
    report = obs.bug_report
    pieces = [f"Title: {report.title}\nPreview: {report.body[:150]}"]
    if obs.body_visible:
        pieces.append("\n\nFull body visible.")
    if obs.comments_visible and report.comments:
        pieces.append(f"\nComments: {len(report.comments)} visible.")
    remaining = obs.max_steps - obs.steps_taken
    pieces.append(f"\nSteps remaining: {remaining}")
    return "".join(pieces)


# ---------------------------------------------------------------------------
#  MODEL CALLS
# ---------------------------------------------------------------------------

def decide_action(client: OpenAI, obs: TriageObservation) -> dict:
    """Ask the LLM whether to investigate further or submit a triage.

    Args:
        client: OpenAI-compatible client used for the chat completion.
        obs: Current observation; only its short summary is sent.

    Returns:
        The JSON object decoded from the model reply. Falls back to
        ``{"action": "submit"}`` when the call fails, the reply is not valid
        JSON, or the reply is valid JSON but not an object.
    """
    bug_text = format_bug_for_decision(obs)

    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": INVESTIGATION_PROMPT},
                {"role": "user", "content": bug_text},
            ],
            temperature=TEMPERATURE,
            max_tokens=200,
            stream=False,
        )
        raw = (completion.choices[0].message.content or "").strip()
        # Unwrap a Markdown code fence (``` or ```json) around the JSON.
        if raw.startswith("```"):
            parts = raw.split("```")
            raw = parts[1] if len(parts) > 1 else raw
            if raw.startswith("json"):
                raw = raw[4:].strip()
        decision = json.loads(raw)
    except Exception as e:
        # Deliberate best effort: any failure degrades to submitting.
        print(f"[DEBUG] Decision model call failed: {e}", flush=True)
        return {"action": "submit"}

    # json.loads also accepts lists, strings and numbers; callers expect a
    # dict, so anything else is treated like a failed decision.
    if not isinstance(decision, dict):
        print(
            f"[DEBUG] Decision model returned non-object JSON: {decision!r}",
            flush=True,
        )
        return {"action": "submit"}
    return decision


def call_model(client: OpenAI, bug_text: str) -> TriageAction:
    """Ask the LLM to triage the bug report and build a submit action.

    Args:
        client: OpenAI-compatible client used for the chat completion.
        bug_text: Formatted bug report sent as the user message.

    Returns:
        A ``submit`` action. Fields missing from the reply use the defaults
        ``P2`` / ``["bug"]`` / ``backend`` / ``backlog`` / ``""``; a reply
        that is not a JSON object uses the defaults for every field.

    Raises:
        Exception: Errors from the completion request are not caught here
            and propagate to the caller.
    """
    print("[LLM] Sending triage request to model...", flush=True)

    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": bug_text},
        ],
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        stream=False,
    )

    raw = (completion.choices[0].message.content or "").strip()
    print(f"[LLM] Raw response: {raw[:200]}", flush=True)

    # Unwrap a Markdown code fence (``` or ```json) around the JSON.
    if raw.startswith("```"):
        parts = raw.split("```")
        raw = parts[1] if len(parts) > 1 else raw
        if raw.startswith("json"):
            raw = raw[4:].strip()

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        print(f"[LLM] JSON parse failed: {e}. Using defaults.", flush=True)
        data = {}

    # Valid JSON that is not an object (list, string, number, null) has no
    # ``.get``; treat it like a parse failure instead of crashing below.
    if not isinstance(data, dict):
        print("[LLM] Response was not a JSON object. Using defaults.", flush=True)
        data = {}

    action = TriageAction(
        action_type="submit",
        priority=data.get("priority", "P2"),
        labels=data.get("labels", ["bug"]),
        assigned_team=data.get("assigned_team", "backend"),
        milestone=data.get("milestone", "backlog"),
        reasoning=data.get("reasoning", ""),
    )

    print(
        f"[LLM] Parsed: priority={action.priority} "
        f"team={action.assigned_team} milestone={action.milestone}",
        flush=True,
    )
    return action


# ---------------------------------------------------------------------------
#  MAIN — multi-step agent with per-task [START]/[STEP]/[END] logging
# ---------------------------------------------------------------------------

def main() -> None:
    """Run every task in ``TASK_IDS`` once and print the structured logs.

    For each task the environment is reset, then up to ``MAX_STEPS`` steps
    are taken on a fixed schedule: read the body on step 1 (if not already
    visible), read the comments on step 2 (if not already visible), and
    otherwise ask the model for a triage and submit it. A ``[START]`` line,
    one ``[STEP]`` line per step and an ``[END]`` line are printed per task,
    followed by one ``[SUMMARY]`` line for the whole run. An exception inside
    a task is printed and ends that task only.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    # Final score of every task, in TASK_IDS order.
    all_scores = []

    with BugTriageClient(base_url=ENV_BASE_URL) as env:
        for task_id in TASK_IDS:
            # Per-task state; initialised before the try so that log_end
            # below always has values even when reset() fails.
            rewards: List[float] = []
            score = 0.0
            success = False
            steps_taken = 0

            log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

            try:
                obs = env.reset(task_id=task_id)

                for step_num in range(1, MAX_STEPS + 1):
                    if obs.done:
                        break

                    # Fixed investigation schedule (decide_action is not
                    # consulted here): step 1 reads the body, step 2 reads
                    # the comments, any other step submits a triage.
                    if step_num == 1 and not obs.body_visible:
                        # First step: read the full body
                        action = TriageAction(action_type="read_body")
                        result = env.step(action)
                        obs = result.observation
                        steps_taken = step_num

                        # Investigation steps are always logged with reward 0.0.
                        log_step(
                            step=step_num,
                            action="investigate:read_body",
                            reward=0.0,
                            done=result.done,
                        )

                        # If the episode ended here, its reward is still
                        # recorded for the score.
                        if result.done:
                            rewards.append(result.reward)
                            break
                        continue

                    elif step_num == 2 and not obs.comments_visible:
                        # Second step: read comments for extra context
                        action = TriageAction(action_type="read_comments")
                        result = env.step(action)
                        obs = result.observation
                        steps_taken = step_num

                        # Investigation steps are always logged with reward 0.0.
                        log_step(
                            step=step_num,
                            action="investigate:read_comments",
                            reward=0.0,
                            done=result.done,
                        )

                        # If the episode ended here, its reward is still
                        # recorded for the score.
                        if result.done:
                            rewards.append(result.reward)
                            break
                        continue

                    # Now submit the triage decision
                    bug_text = format_bug(obs)
                    action = call_model(client, bug_text)
                    result = env.step(action)
                    obs = result.observation
                    steps_taken = step_num

                    # Terminal rewards are kept inside [0.01, 0.99].
                    reward = float(result.reward or 0.0)
                    if result.done:
                        reward = max(0.01, min(0.99, reward))
                    rewards.append(reward)

                    action_str = (
                        f"priority={action.priority},"
                        f"team={action.assigned_team},"
                        f"milestone={action.milestone}"
                    )

                    log_step(
                        step=step_num,
                        action=action_str,
                        reward=reward,
                        done=result.done,
                    )

                    if result.done:
                        break

                # Score = summed rewards scaled by MAX_TOTAL_REWARD, then
                # clamped to [0.01, 0.99].
                if rewards:
                    score = sum(rewards) / MAX_TOTAL_REWARD
                else:
                    score = 0.0
                score = min(max(score, 0.01), 0.99)
                success = score >= SUCCESS_SCORE_THRESHOLD

            except Exception as exc:
                # Top-level boundary for one task: report the error, score
                # whatever rewards were collected (0.05 when there are none)
                # and mark the task as failed.
                print(f"[ERROR] {type(exc).__name__}: {exc}", flush=True)
                score = sum(rewards) / MAX_TOTAL_REWARD if rewards else 0.05
                score = min(max(score, 0.01), 0.99)
                success = False

            log_end(success, steps_taken, score, rewards)
            all_scores.append(score)

            # Short pause between tasks.
            time.sleep(0.5)

    avg_score = sum(all_scores) / len(all_scores) if all_scores else 0.0
    print(
        f"[SUMMARY] tasks={len(all_scores)} avg_score={avg_score:.2f} "
        f"scores={all_scores}",
        flush=True,
    )


# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()