#!/usr/bin/env python3
"""
Bug Report Structuring Environment - Inference Script
This script runs the LLM agent against the Bug Report Structuring Environment.
It connects to the deployed environment (HF Space), uses an LLM to structure
messy bug reports, and logs results in the required OpenEnv format.
Required environment variables:
API_BASE_URL — Base URL for the LLM API (e.g., vLLM or HF Inference)
MODEL_NAME — Model identifier (e.g., meta-llama/Llama-3.1-8B-Instruct)
HF_TOKEN — Hugging Face authentication token
Log format (STDOUT):
[START] task=<task> env=<env> model=<model>
[STEP] step=<n> action=<summary> reward=<0.00> done=<bool> error=<msg|null>
[END] success=<bool> steps=<n> score=<0.00> rewards=<r1,r2,...>
"""
import os
import sys
import json
import time
import requests
from openai import OpenAI
from pathlib import Path
# ─── Load environment variables from .env if it exists ───
def load_env_file():
    """Populate ``os.environ`` from the first readable ``.env`` file found.

    Candidate locations are tried in order: the script's own directory, the
    current working directory, then the user's home directory. Only the first
    file that can be read is applied; later candidates are ignored.

    Each non-blank, non-comment line is parsed as ``KEY=VALUE``. An optional
    leading ``export `` and one pair of matching surrounding quotes around the
    value are removed, so ``export HF_TOKEN="hf_x"`` yields ``hf_x``. Entries
    with an empty key or value are skipped, and variables already present in
    the process environment are never overwritten.
    """
    candidates = (
        Path(__file__).parent / ".env",  # Same dir as script
        Path.cwd() / ".env",             # Current working directory
        Path.home() / ".env",            # Home directory
    )
    for env_path in candidates:
        if not env_path.is_file():
            continue
        try:
            # utf-8-sig transparently drops a BOM written by some editors.
            with open(env_path, encoding="utf-8-sig") as f:
                lines = f.read().splitlines()
        except (OSError, UnicodeError):
            # Unreadable or undecodable file: fall through to the next one.
            continue

        for raw_line in lines:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("export "):
                line = line[len("export "):].lstrip()
            key, _, value = line.partition("=")
            key = key.strip()
            value = value.strip()
            # Strip one pair of matching quotes so quoted secrets are usable.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            if key and value:
                os.environ.setdefault(key, value)
        return
# Apply any .env file before the configuration values are read.
load_env_file()

# ─── Configuration ───
# Surrounding whitespace is stripped so a value consisting only of spaces
# counts as missing.
API_BASE_URL = os.environ.get("API_BASE_URL", "").strip()
MODEL_NAME = os.environ.get("MODEL_NAME", "").strip()
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()

# ─── Validate required environment variables ───
required_vars = {
    "API_BASE_URL": API_BASE_URL,
    "MODEL_NAME": MODEL_NAME,
    "HF_TOKEN": HF_TOKEN,
}
missing_vars = [var for var, value in required_vars.items() if not value]
if missing_vars:
    # Fail fast at import time: nothing below can work without these values.
    print(
        f"❌ Missing environment variables: {', '.join(missing_vars)}",
        file=sys.stderr,
    )
    print("\nSet them before running:", file=sys.stderr)
    print("export API_BASE_URL=https://...", file=sys.stderr)
    print("export MODEL_NAME=meta-llama/...", file=sys.stderr)
    print("export HF_TOKEN=hf_...", file=sys.stderr)
    sys.exit(1)
# Environment URL (the deployed HF Space)
# Base URL of the environment server; the ENV_URL variable overrides the
# default deployed HF Space.
ENV_URL = os.environ.get("ENV_URL", "https://rahul-13-bug-report-structuring-env.hf.space")

# Benchmark label emitted in the [START] log line.
BENCHMARK_NAME = "bug_report_structuring"

# Task identifiers, run in this order.
TASKS = [
    "easy",
    "medium",
    "hard",
]

MAX_RETRIES = 2
# ─── LLM client setup ───
# OpenAI-compatible client pointed at the configured endpoint; the Hugging
# Face token is passed as the API key.
client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
# ─── Prompt templates ───
# System message sent with every LLM request: it fixes the analyst role and
# names the seven JSON fields the model must return.
SYSTEM_PROMPT = """You are an expert bug report analyst. Your job is to take messy, unstructured bug reports and convert them into well-organized, structured formats.
You must output a valid JSON object with exactly these fields:
- "title": A clear, concise title summarizing the bug
- "steps_to_reproduce": Numbered step-by-step instructions to reproduce the bug
- "expected_behavior": What should happen (correct behavior)
- "actual_behavior": What actually happens (the bug symptoms)
- "severity": One of "low", "medium", "high", or "critical"
- "environment": OS, browser, version, platform details
- "additional_notes": Any other relevant details
Rules:
1. Extract ALL information from the original report - don't miss details
2. Use professional, clear language
3. Steps should be specific and actionable
4. Include version numbers, error messages, and technical details
5. Severity should reflect the actual impact described
6. Output ONLY the JSON object, no other text or markdown"""
# User message template for refinement steps. It is filled with str.format and
# expects the placeholders raw_report, score (formatted with :.2f, so it must
# be numeric), feedback and field_scores.
REFINEMENT_PROMPT = """You previously structured a bug report but the grading feedback indicates room for improvement.
Original messy bug report:
{raw_report}
Your previous submission scored {score:.2f}/1.00.
Feedback:
{feedback}
Previous field scores:
{field_scores}
Please submit an improved version. Focus on the fields with low scores.
Output ONLY a valid JSON object with the same fields: title, steps_to_reproduce, expected_behavior, actual_behavior, severity, environment, additional_notes."""
# ─── Helper functions ───
def call_llm(messages: list) -> str:
    """Send *messages* to the chat-completions endpoint and return the reply.

    The text of the first choice is returned with surrounding whitespace
    removed. Any failure (request error, malformed or empty response) is
    reported on stderr and an empty string is returned instead of raising.
    """
    try:
        completion = client.chat.completions.create(
            max_tokens=2048,
            temperature=0.3,
            messages=messages,
            model=MODEL_NAME,
        )
        first_choice = completion.choices[0]
        reply_text = first_choice.message.content
        return reply_text.strip()
    except Exception as exc:
        print(f" [LLM ERROR] {exc}", file=sys.stderr)
        return ""
def parse_json_response(text: str) -> dict:
    """Extract a JSON object from an LLM reply.

    Handles replies wrapped in a markdown code fence (with or without a
    ``json`` tag) and replies that surround the object with extra prose.

    Args:
        text: Raw reply text; may be empty.

    Returns:
        The decoded JSON object, or ``{}`` when no object can be recovered.
        The result is always a ``dict``: valid JSON that decodes to another
        type (string, number, list, ``null``) is rejected, because callers
        index the result by field name.
    """
    if not text:
        return {}

    # Strip markdown code blocks if present.
    if "```json" in text:
        text = text.split("```json", 1)[1].split("```", 1)[0].strip()
    elif "```" in text:
        text = text.split("```", 1)[1].split("```", 1)[0].strip()

    # Try the whole text first, then the outermost {...} span inside it.
    candidates = [text]
    start = text.find("{")
    end = text.rfind("}") + 1
    if start >= 0 and end > start:
        candidates.append(text[start:end])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return {}
def env_reset(task_id: str) -> dict:
    """POST to the environment's ``/reset`` endpoint for *task_id*.

    Returns the decoded JSON body. On any failure (connection error, HTTP
    error status, undecodable body) the error is printed to stderr and an
    empty dict is returned.
    """
    try:
        reply = requests.post(f"{ENV_URL}/reset", json={"task_id": task_id}, timeout=30)
        reply.raise_for_status()
        payload = reply.json()
    except Exception as exc:
        print(f" [ENV ERROR] Reset failed: {exc}", file=sys.stderr)
        return {}
    return payload
def env_step(action: dict) -> dict:
    """POST *action* to the environment's ``/step`` endpoint.

    The action is sent wrapped as ``{"action": action}``. Returns the decoded
    JSON body. On any failure (connection error, HTTP error status,
    undecodable body) the error is printed to stderr and an empty dict is
    returned.
    """
    try:
        reply = requests.post(f"{ENV_URL}/step", json={"action": action}, timeout=30)
        reply.raise_for_status()
        payload = reply.json()
    except Exception as exc:
        print(f" [ENV ERROR] Step failed: {exc}", file=sys.stderr)
        return {}
    return payload
def make_default_action() -> dict:
    """Build the placeholder action used when the LLM reply is unusable.

    A fresh dict is created on every call, so callers may mutate the result.
    """
    fallback = dict(
        title="Bug Report",
        steps_to_reproduce="1. See the bug report",
        expected_behavior="Application works correctly",
        actual_behavior="Application does not work as expected",
        severity="medium",
        environment="Not specified",
        additional_notes="",
    )
    return fallback
# ─── Main inference loop ───
# Fields every submitted action must carry; missing ones are filled with "".
_ACTION_FIELDS = (
    "title",
    "steps_to_reproduce",
    "expected_behavior",
    "actual_behavior",
    "severity",
    "environment",
    "additional_notes",
)


def _as_float(value, default: float = 0.0) -> float:
    """Coerce a grader value to float, using *default* when it is unusable."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _build_action(messages: list) -> dict:
    """Query the LLM and normalise its reply into a complete action dict."""
    action = parse_json_response(call_llm(messages))
    # Anything that is not a dict with a title is replaced by the fallback.
    if not isinstance(action, dict) or "title" not in action:
        action = make_default_action()
    for field in _ACTION_FIELDS:
        action.setdefault(field, "")
    return action


def _summarize_action(action: dict, fallback: str) -> str:
    """Return a single-token summary of the action title for the log line."""
    title = action.get("title")
    # Collapse all whitespace (including newlines) so the summary can never
    # break the space-separated key=value log format.
    summary = "_".join(str(title).split())[:50] if title else ""
    return summary or fallback


def _submit_action(action: dict, step: int, fallback_label: str) -> tuple:
    """Send one action to the environment and print its [STEP] log line.

    Returns ``(result, score, reward, done)``. A failed step request is
    reported as a terminal step with zero score and reward.
    """
    result = env_step(action)
    if isinstance(result, dict) and result:
        score = _as_float(result.get("score", 0.0))
        reward = _as_float(result.get("reward", 0.0))
        done = bool(result.get("done", False))
        error = "null"
    else:
        result = {}
        score = 0.0
        reward = 0.0
        done = True
        error = "step_request_failed"

    action_summary = _summarize_action(action, fallback_label)
    print(
        f"[STEP] step={step} action={action_summary} "
        f"reward={reward:.2f} done={str(done).lower()} error={error}"
    )
    return result, score, reward, done


def run_task(task_id: str) -> dict:
    """Run the agent on a single task and emit its log lines on stdout.

    The environment is reset for *task_id*, a first structured report is
    submitted, and the report is then refined using the grader's feedback
    until the environment reports ``done`` or the step budget from the reset
    observation (``max_steps``, default 3) is used up.

    Args:
        task_id: Task identifier passed to the environment's reset endpoint.

    Returns:
        A dict with ``success`` (best score >= 0.6), ``steps`` (number of
        submissions), ``score`` (best score seen) and ``rewards`` (the reward
        of every step, in order). If the reset fails, a single failed step is
        reported.
    """
    # ── START ──
    print(f"[START] task={task_id} env={BENCHMARK_NAME} model={MODEL_NAME}")

    obs = env_reset(task_id)
    if not obs or not isinstance(obs, dict):
        print("[STEP] step=1 action=reset_failed reward=0.00 done=true error=environment_reset_failed")
        print("[END] success=false steps=1 score=0.00 rewards=0.00")
        return {"success": False, "steps": 1, "score": 0.0, "rewards": [0.0]}

    raw_report = obs.get("raw_report", "")
    try:
        max_steps = int(obs.get("max_steps", 3))
    except (TypeError, ValueError):
        # A null or malformed budget falls back to the default.
        max_steps = 3

    rewards = []
    best_score = 0.0
    step_count = 0

    # The first step structures the raw report; later steps use the
    # refinement prompt built from the previous result.
    user_content = f"Structure this bug report:\n\n{raw_report}"
    fallback_label = "structured_report"

    while True:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ]
        action = _build_action(messages)

        step_count += 1
        result, score, reward, done = _submit_action(action, step_count, fallback_label)
        rewards.append(reward)
        best_score = max(best_score, score)

        # At least one submission is always made, even if max_steps < 1.
        if done or step_count >= max_steps:
            break

        user_content = REFINEMENT_PROMPT.format(
            raw_report=raw_report,
            score=score,
            feedback=result.get("feedback", ""),
            field_scores=json.dumps(result.get("field_scores", {}), indent=2),
        )
        fallback_label = "refined_report"

    # ── END ──
    success = best_score >= 0.6
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(
        f"[END] success={str(success).lower()} steps={step_count} "
        f"score={best_score:.2f} rewards={rewards_str}"
    )
    return {
        "success": success,
        "steps": step_count,
        "score": best_score,
        "rewards": rewards,
    }
def main():
    """Run inference on all tasks and print a summary to stderr.

    Each task in ``TASKS`` is run once via ``run_task``. Protocol log lines go
    to stdout (written by ``run_task``); the banner, per-task progress and
    final summary go to stderr. Exits with status 1 if a required environment
    variable is missing.
    """
    # Defensive re-check: the module already validates these at import time.
    missing = []
    if not API_BASE_URL:
        missing.append("API_BASE_URL")
    if not MODEL_NAME:
        missing.append("MODEL_NAME")
    if not HF_TOKEN:
        missing.append("HF_TOKEN")
    if missing:
        print(f"❌ Missing environment variables: {', '.join(missing)}", file=sys.stderr)
        print("Set them before running:", file=sys.stderr)
        print(" export API_BASE_URL=https://...", file=sys.stderr)
        print(" export MODEL_NAME=meta-llama/...", file=sys.stderr)
        print(" export HF_TOKEN=hf_...", file=sys.stderr)
        sys.exit(1)

    banner = "═══ Bug Report Structuring - Inference ═══"
    print(banner, file=sys.stderr)
    print(f" Model: {MODEL_NAME}", file=sys.stderr)
    print(f" Env: {ENV_URL}", file=sys.stderr)
    print(f" Tasks: {TASKS}", file=sys.stderr)
    print("═" * len(banner), file=sys.stderr)

    results = {}
    total_score = 0.0
    start_time = time.time()
    for task_id in TASKS:
        print(f"\n--- Task: {task_id} ---", file=sys.stderr)
        result = run_task(task_id)
        results[task_id] = result
        total_score += result["score"]
        print(f" Score: {result['score']:.2f}", file=sys.stderr)
    elapsed = time.time() - start_time

    # Guard the average so an empty task list cannot divide by zero.
    avg_score = total_score / len(TASKS) if TASKS else 0.0

    summary_header = "═══ Summary ═══"
    print(f"\n{summary_header}", file=sys.stderr)
    print(f" Average Score: {avg_score:.2f}", file=sys.stderr)
    print(f" Time Elapsed: {elapsed:.1f}s", file=sys.stderr)
    for task_id, result in results.items():
        status = "✅" if result["success"] else "❌"
        print(
            f" {status} {task_id}: {result['score']:.2f} "
            f"({result['steps']} steps)",
            file=sys.stderr,
        )
    print("═" * len(summary_header), file=sys.stderr)
# Script entry point: run all tasks only when executed directly.
if __name__ == "__main__":
    main()