File size: 12,604 Bytes
af65fe4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ae8479
 
 
 
 
 
 
 
 
 
 
 
 
 
af65fe4
 
 
 
 
 
 
 
 
 
c2ab3bf
af65fe4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#!/usr/bin/env python3
"""
Bug Report Structuring Environment - Inference Script

This script runs the LLM agent against the Bug Report Structuring Environment.
It connects to the deployed environment (HF Space), uses an LLM to structure
messy bug reports, and logs results in the required OpenEnv format.

Required environment variables:
  API_BASE_URL  — Base URL for the LLM API (e.g., vLLM or HF Inference)
  MODEL_NAME    — Model identifier (e.g., meta-llama/Llama-3.1-8B-Instruct)
  HF_TOKEN      — Hugging Face authentication token

Log format (STDOUT):
  [START] task=<task> env=<env> model=<model>
  [STEP]  step=<n> action=<summary> reward=<0.00> done=<bool> error=<msg|null>
  [END]   success=<bool> steps=<n> score=<0.00> rewards=<r1,r2,...>
"""

import os
import sys
import json
import time
import requests
from openai import OpenAI
from pathlib import Path

# ─── Load Environment Variables from .env if it exists ───────────
def load_env_file(path) -> None:
    """Populate ``os.environ`` from a simple ``KEY=VALUE`` file.

    Blank lines and lines starting with ``#`` are skipped. An optional
    leading ``export `` is dropped, and one pair of matching surrounding
    quotes is removed from the value so that ``KEY="value"`` yields
    ``value`` rather than ``"value"``. Variables already present in the
    process environment are never overwritten, and entries with an empty
    key or empty value are ignored.

    Args:
        path: Location of the file to read. Nothing happens when it does
            not point at a regular file.
    """
    path = Path(path)
    if not path.is_file():
        return

    # utf-8-sig decodes plain UTF-8 and also drops a BOM, which would
    # otherwise become part of the first key.
    with open(path, encoding="utf-8-sig") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if not entry or entry.startswith("#"):
                continue
            if entry.startswith("export "):
                entry = entry[len("export "):].lstrip()

            key, _, value = entry.partition("=")
            key = key.strip()
            value = value.strip()
            # Drop one layer of matching quotes around the value.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]

            if key and value:
                # setdefault: real environment variables take precedence.
                os.environ.setdefault(key, value)


env_file = Path(__file__).parent / ".env"
load_env_file(env_file)

# ─── Configuration ────────────────────────────────────────────────

# LLM endpoint, model identifier and auth token; each falls back to an
# empty string when the variable is not set.
API_BASE_URL = os.getenv("API_BASE_URL", "")
MODEL_NAME = os.getenv("MODEL_NAME", "")
HF_TOKEN = os.getenv("HF_TOKEN", "")

# Base URL of the environment server (the deployed HF Space by default).
ENV_URL = os.getenv("ENV_URL", "https://rahul-13-bug-report-structuring-env.hf.space")

# Benchmark label printed in [START] lines, the task ids to run, and a retry budget.
BENCHMARK_NAME = "bug_report_structuring"
TASKS = ["easy", "medium", "hard"]
MAX_RETRIES = 2

# ─── LLM Client Setup ────────────────────────────────────────────

# Shared OpenAI-compatible client, pointed at API_BASE_URL and authenticated with HF_TOKEN.
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)


# ─── Prompt Templates ────────────────────────────────────────────

# System message sent with every LLM request in this script. It names the
# seven JSON fields the model must return; run_task fills any of those
# fields that are missing from the reply with an empty string.
SYSTEM_PROMPT = """You are an expert bug report analyst. Your job is to take messy, unstructured bug reports and convert them into well-organized, structured formats.

You must output a valid JSON object with exactly these fields:
- "title": A clear, concise title summarizing the bug
- "steps_to_reproduce": Numbered step-by-step instructions to reproduce the bug
- "expected_behavior": What should happen (correct behavior)
- "actual_behavior": What actually happens (the bug symptoms)
- "severity": One of "low", "medium", "high", or "critical"
- "environment": OS, browser, version, platform details
- "additional_notes": Any other relevant details

Rules:
1. Extract ALL information from the original report - don't miss details
2. Use professional, clear language
3. Steps should be specific and actionable
4. Include version numbers, error messages, and technical details
5. Severity should reflect the actual impact described
6. Output ONLY the JSON object, no other text or markdown"""

# User-message template for refinement steps. It is filled via str.format
# with four placeholders: raw_report, score, feedback and field_scores.
# `score` is rendered with a `.2f` format spec, so it must be a number.
REFINEMENT_PROMPT = """You previously structured a bug report but the grading feedback indicates room for improvement.

Original messy bug report:
{raw_report}

Your previous submission scored {score:.2f}/1.00.

Feedback:
{feedback}

Previous field scores:
{field_scores}

Please submit an improved version. Focus on the fields with low scores.
Output ONLY a valid JSON object with the same fields: title, steps_to_reproduce, expected_behavior, actual_behavior, severity, environment, additional_notes."""


# ─── Helper Functions ─────────────────────────────────────────────

def call_llm(messages: list) -> str:
    """Send a chat request to the LLM and return the reply text.

    Args:
        messages: Chat messages (dicts with ``role`` and ``content``) passed
            unchanged to the chat-completions endpoint.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or ``""`` when the request fails or the response carries no
        text. This function never raises; every failure is reported on stderr.
    """
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=0.3,
            max_tokens=2048,
        )

        choices = response.choices
        if not choices:
            print("  [LLM ERROR] response contained no choices", file=sys.stderr)
            return ""

        # Content may be None; report that explicitly instead of relying on
        # an AttributeError from calling .strip() on None.
        content = choices[0].message.content
        if content is None:
            print("  [LLM ERROR] response message had no text content", file=sys.stderr)
            return ""

        return content.strip()
    except Exception as e:
        # Best-effort boundary: callers treat "" as "no usable answer".
        print(f"  [LLM ERROR] {e}", file=sys.stderr)
        return ""


def parse_json_response(text: str) -> dict:
    """Extract a JSON object from an LLM reply.

    The reply may be bare JSON, JSON wrapped in a markdown code fence
    (with or without a ``json`` tag), or JSON embedded in surrounding prose.
    Candidates are tried in this order: the fenced segment (if any), then
    the whole text. For each candidate the full string is parsed first, then
    the span from the first ``{`` to the last ``}``.

    Args:
        text: Raw reply text. ``None``, non-strings and blank strings are
            treated as "nothing to parse".

    Returns:
        The first candidate that decodes to a JSON object, as a ``dict``.
        ``{}`` when no candidate does. Valid JSON that is not an object
        (list, string, number, ...) is rejected, so callers can rely on
        getting a dict.
    """
    if not isinstance(text, str) or not text.strip():
        return {}

    candidates = []
    if "```" in text:
        # Prefer the tagged fence; otherwise take the first fenced segment.
        marker = "```json" if "```json" in text else "```"
        candidates.append(text.split(marker, 1)[1].split("```", 1)[0])
    # Fall back to the whole reply in case the fenced part is not the JSON.
    candidates.append(text)

    for candidate in candidates:
        candidate = candidate.strip()
        spans = [candidate]
        start = candidate.find("{")
        end = candidate.rfind("}") + 1
        if 0 <= start < end:
            spans.append(candidate[start:end])

        for span in spans:
            try:
                parsed = json.loads(span)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed

    return {}


def env_reset(task_id: str) -> dict:
    """POST to the environment's ``/reset`` endpoint for the given task.

    Returns the decoded JSON body of the response. Any failure (connection
    problem, HTTP error status, undecodable body) is reported on stderr and
    yields ``{}`` instead of raising.
    """
    try:
        reply = requests.post(f"{ENV_URL}/reset", json={"task_id": task_id}, timeout=30)
        reply.raise_for_status()
        payload = reply.json()
    except Exception as exc:
        print(f"  [ENV ERROR] Reset failed: {exc}", file=sys.stderr)
        return {}
    return payload


def env_step(action: dict) -> dict:
    """POST an action to the environment's ``/step`` endpoint.

    The action is sent wrapped as ``{"action": action}``. Returns the decoded
    JSON body of the response. Any failure is reported on stderr and yields
    ``{}`` instead of raising.
    """
    try:
        reply = requests.post(f"{ENV_URL}/step", json={"action": action}, timeout=30)
        reply.raise_for_status()
        payload = reply.json()
    except Exception as exc:
        print(f"  [ENV ERROR] Step failed: {exc}", file=sys.stderr)
        return {}
    return payload


def make_default_action() -> dict:
    """Build the placeholder action used when the LLM reply is unusable.

    A new dict is created on every call, so callers may mutate the result.
    """
    placeholder = dict(
        title="Bug Report",
        steps_to_reproduce="1. See the bug report",
        expected_behavior="Application works correctly",
        actual_behavior="Application does not work as expected",
        severity="medium",
        environment="Not specified",
        additional_notes="",
    )
    return placeholder


# ─── Main Inference Loop ─────────────────────────────────────────

# Fields every submitted action must carry; missing ones are sent as "".
_ACTION_FIELDS = (
    "title",
    "steps_to_reproduce",
    "expected_behavior",
    "actual_behavior",
    "severity",
    "environment",
    "additional_notes",
)


def _as_float(value, default: float = 0.0) -> float:
    """Coerce *value* to float, returning *default* for None or non-numeric input."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _build_action(messages: list) -> dict:
    """Query the LLM and turn its reply into an action containing every field."""
    action = parse_json_response(call_llm(messages))
    # Unparseable reply, non-object JSON, or no title: use the placeholder.
    if not isinstance(action, dict) or "title" not in action:
        action = make_default_action()
    for field in _ACTION_FIELDS:
        action.setdefault(field, "")
    return action


def _summarize_action(action: dict, fallback: str) -> str:
    """Return a short one-token label for the [STEP] log line."""
    title = action.get("title")
    # The title may be empty, null or a non-string; never let that crash logging.
    text = str(title) if title else fallback
    # Replace every whitespace character (not only spaces) so the log entry
    # stays on one line and the label stays a single token.
    return "".join("_" if ch.isspace() else ch for ch in text[:50])


def _submit_and_log(action: dict, step: int, fallback_summary: str) -> dict:
    """Send *action* to the environment, print the [STEP] line, return the outcome.

    The returned dict always has ``score`` and ``reward`` (floats), ``done``
    (bool), ``feedback`` and ``field_scores``. A failed or malformed step
    response is reported as ``done=True`` with zero score and reward.
    """
    result = env_step(action)
    if isinstance(result, dict) and result:
        outcome = {
            "score": _as_float(result.get("score")),
            "reward": _as_float(result.get("reward")),
            "done": bool(result.get("done", False)),
            "feedback": result.get("feedback", ""),
            "field_scores": result.get("field_scores", {}),
        }
        error = "null"
    else:
        outcome = {
            "score": 0.0,
            "reward": 0.0,
            "done": True,
            "feedback": "",
            "field_scores": {},
        }
        error = "step_request_failed"

    print(
        f"[STEP] step={step} action={_summarize_action(action, fallback_summary)} "
        f"reward={outcome['reward']:.2f} done={str(outcome['done']).lower()} error={error}"
    )
    return outcome


def run_task(task_id: str) -> dict:
    """
    Run the agent on a single task.

    Resets the environment, submits a first structured report, then keeps
    submitting refined reports (built from the environment's feedback) until
    the environment reports ``done`` or ``max_steps`` submissions were made.
    One [START] line, one [STEP] line per submission and one [END] line are
    printed to stdout.

    Args:
        task_id: Task identifier passed to the environment's reset endpoint.

    Returns dict with: success, steps, score, rewards
        ``score`` is the best score seen across all steps and ``success`` is
        ``score >= 0.6``. When the reset fails, a single failed step with
        zero score is reported.
    """
    # ── START ──
    print(f"[START] task={task_id} env={BENCHMARK_NAME} model={MODEL_NAME}")

    # Reset environment
    obs = env_reset(task_id)
    if not obs or not isinstance(obs, dict):
        print("[STEP] step=1 action=reset_failed reward=0.00 done=true error=environment_reset_failed")
        print("[END] success=false steps=1 score=0.00 rewards=0.00")
        return {"success": False, "steps": 1, "score": 0.0, "rewards": [0.0]}

    raw_report = obs.get("raw_report", "")
    # Tolerate a missing, null or non-numeric max_steps by falling back to 3.
    max_steps = int(_as_float(obs.get("max_steps"), default=3))

    rewards = []
    best_score = 0.0
    step_count = 0

    # The first iteration is the initial submission; later ones are refinements.
    user_content = f"Structure this bug report:\n\n{raw_report}"
    fallback_summary = "structured_report"

    while True:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ]
        action = _build_action(messages)

        step_count += 1
        outcome = _submit_and_log(action, step_count, fallback_summary)
        rewards.append(outcome["reward"])
        best_score = max(best_score, outcome["score"])

        if outcome["done"] or step_count >= max_steps:
            break

        # ── Refinement steps ──
        user_content = REFINEMENT_PROMPT.format(
            raw_report=raw_report,
            score=outcome["score"],
            feedback=outcome["feedback"],
            field_scores=json.dumps(outcome["field_scores"], indent=2),
        )
        fallback_summary = "refined_report"

    # ── END ──
    success = best_score >= 0.6
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)

    print(
        f"[END] success={str(success).lower()} steps={step_count} "
        f"score={best_score:.2f} rewards={rewards_str}"
    )

    return {
        "success": success,
        "steps": step_count,
        "score": best_score,
        "rewards": rewards,
    }


def main():
    """Run inference on all tasks.

    Checks that API_BASE_URL, MODEL_NAME and HF_TOKEN are set (exiting with
    status 1 and a hint on stderr otherwise), runs every task in TASKS via
    run_task, and prints a human-readable summary to stderr. The
    machine-readable [START]/[STEP]/[END] lines go to stdout from run_task.
    """
    # Validate environment variables
    required = {
        "API_BASE_URL": API_BASE_URL,
        "MODEL_NAME": MODEL_NAME,
        "HF_TOKEN": HF_TOKEN,
    }
    missing = [name for name, value in required.items() if not value]

    if missing:
        print(f"❌ Missing environment variables: {', '.join(missing)}", file=sys.stderr)
        print("Set them before running:", file=sys.stderr)
        print("  export API_BASE_URL=https://...", file=sys.stderr)
        print("  export MODEL_NAME=meta-llama/...", file=sys.stderr)
        print("  export HF_TOKEN=hf_...", file=sys.stderr)
        sys.exit(1)

    print("═══ Bug Report Structuring - Inference ═══", file=sys.stderr)
    print(f"  Model: {MODEL_NAME}", file=sys.stderr)
    print(f"  Env:   {ENV_URL}", file=sys.stderr)
    print(f"  Tasks: {TASKS}", file=sys.stderr)
    print("═══════════════════════════════════════════", file=sys.stderr)

    results = {}
    total_score = 0.0
    start_time = time.time()

    for task_id in TASKS:
        print(f"\n--- Task: {task_id} ---", file=sys.stderr)
        result = run_task(task_id)
        results[task_id] = result
        total_score += result["score"]
        print(f"  Score: {result['score']:.2f}", file=sys.stderr)

    elapsed = time.time() - start_time
    # Guard against an empty task list instead of dividing by zero.
    avg_score = total_score / len(TASKS) if TASKS else 0.0

    print("\n═══ Summary ═══", file=sys.stderr)
    print(f"  Average Score: {avg_score:.2f}", file=sys.stderr)
    print(f"  Time Elapsed:  {elapsed:.1f}s", file=sys.stderr)
    for task_id, result in results.items():
        # The success marker was a mis-encoded check mark; restored here.
        status = "✅" if result["success"] else "❌"
        print(
            f"  {status} {task_id}: {result['score']:.2f} "
            f"({result['steps']} steps)",
            file=sys.stderr,
        )
    print("═══════════════", file=sys.stderr)


# Run all tasks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()