RAHUL-13 committed on
Commit
af65fe4
·
verified ·
1 Parent(s): afedf8c

Upload inference.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. inference.py +359 -0
inference.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Bug Report Structuring Environment - Inference Script
4
+
5
+ This script runs the LLM agent against the Bug Report Structuring Environment.
6
+ It connects to the deployed environment (HF Space), uses an LLM to structure
7
+ messy bug reports, and logs results in the required OpenEnv format.
8
+
9
+ Required environment variables:
10
+ API_BASE_URL — Base URL for the LLM API (e.g., vLLM or HF Inference)
11
+ MODEL_NAME — Model identifier (e.g., meta-llama/Llama-3.1-8B-Instruct)
12
+ HF_TOKEN — Hugging Face authentication token
13
+
14
+ Log format (STDOUT):
15
+ [START] task=<task> env=<env> model=<model>
16
+ [STEP] step=<n> action=<summary> reward=<0.00> done=<bool> error=<msg|null>
17
+ [END] success=<bool> steps=<n> score=<0.00> rewards=<r1,r2,...>
18
+ """
19
+
20
+ import os
21
+ import sys
22
+ import json
23
+ import time
24
+ import requests
25
+ from openai import OpenAI
26
+
27
+ # ─── Configuration ────────────────────────────────────────────────
28
+
29
# Connection settings for the LLM endpoint, read once at import time. Each
# defaults to "" so that main() can report all missing variables together.
API_BASE_URL = os.environ.get("API_BASE_URL", "")
MODEL_NAME = os.environ.get("MODEL_NAME", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Environment URL (the deployed HF Space)
ENV_URL = os.environ.get(
    "ENV_URL",
    "https://SAI-RAHUL-ROKKAM-bug-report-structuring-env.hf.space",
)

BENCHMARK_NAME = "bug_report_structuring"  # printed as env= in the [START] line
TASKS = ["easy", "medium", "hard"]  # task ids, run in this order by main()
MAX_RETRIES = 2  # intended retry budget for failing calls
42
+
43
+ # ─── LLM Client Setup ────────────────────────────────────────────
44
+
45
# Module-level client used by call_llm(); HF_TOKEN is passed as the API key
# and API_BASE_URL selects the endpoint.
client = OpenAI(
    base_url=API_BASE_URL,
    api_key=HF_TOKEN,
)
49
+
50
+
51
+ # ─── Prompt Templates ────────────────────────────────────────────
52
+
53
# System message sent with every LLM request: names the seven output fields
# and requires a bare JSON object as the reply.
SYSTEM_PROMPT = """You are an expert bug report analyst. Your job is to take messy, unstructured bug reports and convert them into well-organized, structured formats.

You must output a valid JSON object with exactly these fields:
- "title": A clear, concise title summarizing the bug
- "steps_to_reproduce": Numbered step-by-step instructions to reproduce the bug
- "expected_behavior": What should happen (correct behavior)
- "actual_behavior": What actually happens (the bug symptoms)
- "severity": One of "low", "medium", "high", or "critical"
- "environment": OS, browser, version, platform details
- "additional_notes": Any other relevant details

Rules:
1. Extract ALL information from the original report - don't miss details
2. Use professional, clear language
3. Steps should be specific and actionable
4. Include version numbers, error messages, and technical details
5. Severity should reflect the actual impact described
6. Output ONLY the JSON object, no other text or markdown"""

# User message template for refinement steps. Filled with str.format();
# placeholders: raw_report, score (formatted as a float), feedback, field_scores.
REFINEMENT_PROMPT = """You previously structured a bug report but the grading feedback indicates room for improvement.

Original messy bug report:
{raw_report}

Your previous submission scored {score:.2f}/1.00.

Feedback:
{feedback}

Previous field scores:
{field_scores}

Please submit an improved version. Focus on the fields with low scores.
Output ONLY a valid JSON object with the same fields: title, steps_to_reproduce, expected_behavior, actual_behavior, severity, environment, additional_notes."""
87
+
88
+
89
+ # ─── Helper Functions ─────────────────────────────────────────────
90
+
91
def call_llm(messages: list) -> str:
    """Call the LLM and return the stripped response text.

    Makes one attempt plus up to ``MAX_RETRIES`` retries, pausing briefly
    between attempts. Errors are reported on stderr and never raised.

    Args:
        messages: Chat messages in the OpenAI ``chat.completions`` format.

    Returns:
        The model's reply with surrounding whitespace removed, or ``""``
        when every attempt fails or the reply carries no text.
    """
    attempts = MAX_RETRIES + 1
    for attempt in range(1, attempts + 1):
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                temperature=0.3,
                max_tokens=2048,
            )
            # The message content may be None when the model returns no text;
            # treat that as an empty reply rather than an error.
            return (response.choices[0].message.content or "").strip()
        except Exception as e:  # best-effort boundary: callers fall back on ""
            print(f" [LLM ERROR] {e}", file=sys.stderr)
            if attempt < attempts:
                time.sleep(attempt)  # short linear backoff before retrying
    return ""
104
+
105
+
106
def parse_json_response(text: str) -> dict:
    """Parse a JSON object out of an LLM response.

    Handles replies wrapped in a markdown code fence and replies that
    surround the object with extra prose.

    Args:
        text: Raw LLM reply; may be empty.

    Returns:
        The decoded JSON object, or ``{}`` when no object can be decoded.
        Valid JSON that is not an object (list, string, number, ...) also
        yields ``{}`` so callers can rely on getting a dict.
    """
    if not text:
        return {}

    # Strip markdown code blocks if present
    if "```json" in text:
        text = text.split("```json")[1].split("```")[0].strip()
    elif "```" in text:
        text = text.split("```")[1].split("```")[0].strip()

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span embedded in the text.
        start = text.find("{")
        end = text.rfind("}") + 1
        if start < 0 or end <= start:
            return {}
        try:
            parsed = json.loads(text[start:end])
        except json.JSONDecodeError:
            return {}

    return parsed if isinstance(parsed, dict) else {}
126
+
127
+
128
def env_reset(task_id: str) -> dict:
    """Call the environment's reset endpoint.

    POSTs ``{"task_id": task_id}`` to ``{ENV_URL}/reset`` with a 30 s timeout.

    Args:
        task_id: Identifier of the task to start.

    Returns:
        The decoded JSON object from the response, or ``{}`` on any failure
        (network error, non-2xx status, invalid or non-object JSON). Failures
        are reported on stderr.
    """
    try:
        resp = requests.post(
            f"{ENV_URL}/reset",
            json={"task_id": task_id},
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
        # Callers use .get() on the result, so anything but an object is an error.
        if not isinstance(payload, dict):
            raise ValueError(f"expected a JSON object, got {type(payload).__name__}")
        return payload
    except Exception as e:  # best-effort boundary: callers check for {}
        print(f" [ENV ERROR] Reset failed: {e}", file=sys.stderr)
        return {}
141
+
142
+
143
def env_step(action: dict) -> dict:
    """Call the environment's step endpoint.

    POSTs ``{"action": action}`` to ``{ENV_URL}/step`` with a 30 s timeout.

    Args:
        action: The structured bug report to submit.

    Returns:
        The decoded JSON object from the response, or ``{}`` on any failure
        (network error, non-2xx status, invalid or non-object JSON). Failures
        are reported on stderr.
    """
    try:
        resp = requests.post(
            f"{ENV_URL}/step",
            json={"action": action},
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
        # Callers use .get() on the result, so anything but an object is an error.
        if not isinstance(payload, dict):
            raise ValueError(f"expected a JSON object, got {type(payload).__name__}")
        return payload
    except Exception as e:  # best-effort boundary: callers check for {}
        print(f" [ENV ERROR] Step failed: {e}", file=sys.stderr)
        return {}
156
+
157
+
158
def make_default_action() -> dict:
    """Return a minimal valid action as fallback.

    Used when the LLM reply cannot be turned into a usable action. A new
    dict is built on every call, so callers may mutate the result freely.
    """
    return {
        "title": "Bug Report",
        "steps_to_reproduce": "1. See the bug report",
        "expected_behavior": "Application works correctly",
        "actual_behavior": "Application does not work as expected",
        "severity": "medium",
        "environment": "Not specified",
        "additional_notes": "",
    }
169
+
170
+
171
+ # ─── Main Inference Loop ─────────────────────────────────────────
172
+
173
def _to_float(value, default: float = 0.0) -> float:
    """Coerce an environment-supplied number to float, else return *default*."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _build_action(messages: list) -> dict:
    """Query the LLM and normalise its reply into an action with all fields."""
    action = parse_json_response(call_llm(messages))
    # Anything that is not an object carrying a title is treated as unusable.
    if not isinstance(action, dict) or "title" not in action:
        action = make_default_action()
    for field in (
        "title",
        "steps_to_reproduce",
        "expected_behavior",
        "actual_behavior",
        "severity",
        "environment",
        "additional_notes",
    ):
        action.setdefault(field, "")
    return action


def _submit_action(step: int, action: dict, placeholder: str) -> tuple:
    """Submit *action*, print its [STEP] line, return (result, score, reward, done)."""
    result = env_step(action)
    if result and isinstance(result, dict):
        score = _to_float(result.get("score"))
        reward = _to_float(result.get("reward"))
        done = bool(result.get("done", False))
        error = "null"
    else:
        # A failed request ends the episode; there is no feedback to refine on.
        result = {}
        score = 0.0
        reward = 0.0
        done = True
        error = "step_request_failed"

    # The summary is one token of a single-line log record: force it to a
    # string, cap it at 50 characters and replace every whitespace character.
    title = str(action.get("title") or placeholder)[:50]
    action_summary = "".join("_" if ch.isspace() else ch for ch in title)

    print(
        f"[STEP] step={step} action={action_summary} "
        f"reward={reward:.2f} done={str(done).lower()} error={error}"
    )
    return result, score, reward, done


def run_task(task_id: str) -> dict:
    """
    Run the agent on a single task.

    Resets the environment, submits a first structured report, then keeps
    submitting refinements built from the environment's feedback until the
    environment reports done or the step budget is used up. Prints the
    [START], [STEP] and [END] log lines to stdout.

    Args:
        task_id: Identifier of the task to run.

    Returns dict with: success, steps, score, rewards
    (success is True when the best score reached is at least 0.6).
    """
    # ── START ──
    print(f"[START] task={task_id} env={BENCHMARK_NAME} model={MODEL_NAME}")

    # Reset environment
    obs = env_reset(task_id)
    if not obs:
        print("[STEP] step=1 action=reset_failed reward=0.00 done=true error=environment_reset_failed")
        print("[END] success=false steps=1 score=0.00 rewards=0.00")
        return {"success": False, "steps": 1, "score": 0.0, "rewards": [0.0]}

    raw_report = obs.get("raw_report", "")
    try:
        max_steps = int(obs.get("max_steps", 3))
    except (TypeError, ValueError):
        max_steps = 3

    # ── First submission ──
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Structure this bug report:\n\n{raw_report}"},
    ]
    step_count = 1
    result, score, reward, done = _submit_action(
        step_count, _build_action(messages), "structured_report"
    )
    rewards = [reward]
    best_score = max(0.0, score)

    # ── Refinement steps ──
    while not done and step_count < max_steps:
        refinement_content = REFINEMENT_PROMPT.format(
            raw_report=raw_report,
            score=score,
            feedback=result.get("feedback", ""),
            field_scores=json.dumps(result.get("field_scores", {}), indent=2),
        )
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": refinement_content},
        ]

        step_count += 1
        result, score, reward, done = _submit_action(
            step_count, _build_action(messages), "refined_report"
        )
        rewards.append(reward)
        best_score = max(best_score, score)

    # ── END ──
    success = best_score >= 0.6
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)

    print(
        f"[END] success={str(success).lower()} steps={step_count} "
        f"score={best_score:.2f} rewards={rewards_str}"
    )

    return {
        "success": success,
        "steps": step_count,
        "score": best_score,
        "rewards": rewards,
    }
304
+
305
+
306
def main():
    """Run inference on all tasks.

    Exits with status 1 when API_BASE_URL, MODEL_NAME or HF_TOKEN is unset.
    Otherwise runs every task in TASKS through run_task() and writes a
    banner, per-task scores and a summary to stderr, leaving stdout to the
    [START]/[STEP]/[END] lines printed by run_task().
    """
    # Validate environment variables
    required = {
        "API_BASE_URL": API_BASE_URL,
        "MODEL_NAME": MODEL_NAME,
        "HF_TOKEN": HF_TOKEN,
    }
    missing = [name for name, value in required.items() if not value]

    if missing:
        print(f"❌ Missing environment variables: {', '.join(missing)}", file=sys.stderr)
        print("Set them before running:", file=sys.stderr)
        print(" export API_BASE_URL=https://...", file=sys.stderr)
        print(" export MODEL_NAME=meta-llama/...", file=sys.stderr)
        print(" export HF_TOKEN=hf_...", file=sys.stderr)
        sys.exit(1)

    print("═══ Bug Report Structuring - Inference ═══", file=sys.stderr)
    print(f" Model: {MODEL_NAME}", file=sys.stderr)
    print(f" Env: {ENV_URL}", file=sys.stderr)
    print(f" Tasks: {TASKS}", file=sys.stderr)
    print("═══════════════════════════════════════════", file=sys.stderr)

    results = {}
    total_score = 0.0
    start_time = time.time()

    for task_id in TASKS:
        print(f"\n--- Task: {task_id} ---", file=sys.stderr)
        result = run_task(task_id)
        results[task_id] = result
        total_score += result["score"]
        print(f" Score: {result['score']:.2f}", file=sys.stderr)

    elapsed = time.time() - start_time
    # Guard the average so an empty TASKS list reports 0.00 instead of raising.
    avg_score = total_score / len(TASKS) if TASKS else 0.0

    print("\n═══ Summary ═══", file=sys.stderr)
    print(f" Average Score: {avg_score:.2f}", file=sys.stderr)
    print(f" Time Elapsed: {elapsed:.1f}s", file=sys.stderr)
    for task_id, result in results.items():
        status = "✅" if result["success"] else "❌"
        print(
            f" {status} {task_id}: {result['score']:.2f} "
            f"({result['steps']} steps)",
            file=sys.stderr,
        )
    print("═══════════════", file=sys.stderr)
356
+
357
+
358
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()