Spaces:
Sleeping
Sleeping
File size: 7,905 Bytes
ad432a3 d145b94 ad432a3 d145b94 cf4ce1e ad432a3 cf4ce1e ad432a3 d145b94 cf4ce1e d145b94 cf4ce1e d145b94 9e31281 ad432a3 d145b94 69cf4ae ad432a3 d145b94 ad432a3 69cf4ae d145b94 ad432a3 d145b94 ad432a3 d145b94 a3f3034 d145b94 ad432a3 d145b94 cf4ce1e a3f3034 d145b94 a3f3034 cf4ce1e d145b94 a3f3034 d145b94 cf4ce1e 9e31281 cf4ce1e d145b94 cf4ce1e ad432a3 cf4ce1e ad432a3 d145b94 a3f3034 d145b94 ad432a3 d145b94 ad432a3 d145b94 69cf4ae d145b94 ad432a3 d145b94 ad432a3 d145b94 a3f3034 d145b94 3782916 69cf4ae 3782916 d145b94 69cf4ae d145b94 69cf4ae d145b94 69cf4ae d145b94 ad432a3 cf4ce1e ad432a3 d145b94 ad432a3 d145b94 ad432a3 d145b94 cf4ce1e ad432a3 cf4ce1e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 | #!/usr/bin/env python3
"""Hackathon baseline inference for coding_env.
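MANDATORY environment variables handled here:
- API_BASE_URL (defaulted)
- MODEL_NAME (defaulted)
- HF_TOKEN (no default; when unset, no LLM client is created and a static
  fallback action is submitted instead)
- LOCAL_IMAGE_NAME (optional, for local Docker workflows)

Additional optional overrides read by this script:
- ENV_BASE_URL (default: http://localhost:8000)
- BENCHMARK (default: coding_env)
- MAX_STEPS (default: 1)
- SUCCESS_SCORE_THRESHOLD (default: 0.60)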
"""
from __future__ import annotations
import json
import os
from typing import Any, Dict, List
import requests
from openai import OpenAI
# --- LLM endpoint configuration (overridable through the environment) -------
API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
# No default: stays None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Optional; read here for local Docker workflows.
LOCAL_IMAGE_NAME = os.environ.get("LOCAL_IMAGE_NAME")

# --- Environment server and episode settings ---------------------------------
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
BENCHMARK = os.environ.get("BENCHMARK", "coding_env")
MAX_STEPS = int(os.environ.get("MAX_STEPS", "1"))
SUCCESS_SCORE_THRESHOLD = float(os.environ.get("SUCCESS_SCORE_THRESHOLD", "0.60"))

# --- Bounds every reported score/reward is clamped into ----------------------
MIN_STRICT_SCORE, MAX_STRICT_SCORE = 0.01, 0.99
def _bool_text(value: bool) -> str:
    """Render *value* as the lowercase literal ``true`` or ``false``.

    Any truthy value maps to ``"true"``; any falsy value maps to ``"false"``.
    """
    if value:
        return "true"
    return "false"
def _strict_score(value: float) -> float:
    """Clamp *value* into ``[MIN_STRICT_SCORE, MAX_STRICT_SCORE]``.

    The value is converted with ``float()`` and rounded to four decimal
    places before clamping.

    Args:
        value: Any object accepted by ``float()``.

    Returns:
        The rounded value limited to the strict score bounds. NaN maps to
        ``MIN_STRICT_SCORE``.

    Raises:
        TypeError, ValueError: If ``float(value)`` fails.
    """
    import math  # local import keeps this fix independent of the file header

    number = float(value)
    # NaN compares false against everything, so min(MAX, nan) would return
    # MAX and an undefined score would be reported as the best possible one.
    if math.isnan(number):
        return MIN_STRICT_SCORE
    return max(MIN_STRICT_SCORE, min(MAX_STRICT_SCORE, round(number, 4)))
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record for one task and flush stdout.

    Args:
        task: Task identifier placed in the ``task=`` field.
        env: Benchmark/environment name placed in the ``env=`` field.
        model: Model name placed in the ``model=`` field.
    """
    record = f"[START] task={task} env={env} model={model}"
    print(record, flush=True)
def log_step(
    step: int, action: str, reward: float, done: bool, error: str | None
) -> None:
    """Print one ``[STEP]`` record and flush stdout.

    Args:
        step: Step number placed in the ``step=`` field.
        action: Pre-formatted action summary placed in the ``action=`` field.
        reward: Step reward, printed with two decimal places.
        done: Whether the episode finished; printed as ``true``/``false``.
        error: Error text, or a falsy value to print the literal ``null``.
    """
    error_value = error or "null"
    head = f"[STEP] step={step} action={action} reward={reward:.2f} "
    tail = f"done={_bool_text(done)} error={error_value}"
    print(head + tail, flush=True)
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the closing ``[END]`` record for a task and flush stdout.

    Args:
        success: Overall outcome; printed as ``true``/``false``.
        steps: Number of steps reported in the ``steps=`` field.
        score: Final score, printed with two decimal places.
        rewards: Per-step rewards, each printed with two decimal places and
            joined with commas.
    """
    formatted = [f"{r:.2f}" for r in rewards]
    rewards_str = ",".join(formatted)
    summary = f"[END] success={_bool_text(success)} steps={steps} "
    details = f"score={score:.2f} rewards={rewards_str}"
    print(summary + details, flush=True)
def _safe_json(method: str, url: str, **kwargs: Any) -> Dict[str, Any]:
    """Perform an HTTP request and return its JSON object, never raising.

    Args:
        method: HTTP verb handed to ``requests.request``.
        url: Target URL.
        **kwargs: Extra keyword arguments forwarded to ``requests.request``
            (a 30-second timeout is always supplied by this function).

    Returns:
        The decoded JSON body when it is a dict. An empty dict is returned on
        any exception (connection failure, non-2xx status, invalid JSON, ...)
        or when the body decodes to something other than a dict.
    """
    try:
        resp = requests.request(method, url, timeout=30, **kwargs)
        resp.raise_for_status()
        payload = resp.json()
    except Exception:
        # Deliberately best-effort: callers treat {} as "no data".
        return {}
    return payload if isinstance(payload, dict) else {}
def _task_list() -> List[str]:
    """Return the task ids advertised by the environment server.

    Queries ``{ENV_BASE_URL}/tasks`` and collects the ``task_id`` of every
    dict entry in the ``tasks`` list that has a truthy ``task_id``.

    Returns:
        The collected ids as strings, or the three built-in default ids when
        the response has no usable entries.
    """
    defaults = ["task_easy_1", "task_medium_1", "task_hard_1"]
    payload = _safe_json("GET", f"{ENV_BASE_URL}/tasks")
    listed = payload.get("tasks", [])
    if not isinstance(listed, list):
        return defaults
    task_ids: List[str] = [
        str(entry["task_id"])
        for entry in listed
        if isinstance(entry, dict) and entry.get("task_id")
    ]
    return task_ids if task_ids else defaults
def _build_action(client: OpenAI | None, task_description: str, code_snippet: str) -> Dict[str, Any]:
    """Ask the model for a review action, falling back to a static one.

    Args:
        client: OpenAI-compatible client, or ``None`` to skip the model call.
        task_description: Task text inserted into the prompt.
        code_snippet: PR context inserted into the prompt.

    Returns:
        A dict with the keys ``review``, ``file_path``, ``issue_type``,
        ``severity``, ``bug_type``, ``line_number`` and ``confidence``.
        The static fallback action is returned when ``client`` is ``None``,
        when the model call fails, or when no JSON object can be extracted
        from the reply. Otherwise each field is taken from the reply, and any
        field that is missing, null, or not convertible keeps its fallback
        value. ``confidence`` is limited to the range 0.0-1.0.
    """
    fallback: Dict[str, Any] = {
        "review": "Likely logic issue in this PR change; please review line-level semantics.",
        "file_path": "services/metrics/aggregation.py",
        "issue_type": "logic",
        "severity": "medium",
        "bug_type": "logic",
        "line_number": 1,
        "confidence": 0.20,
    }
    if client is None:
        return fallback
    prompt = f"""You are reviewing a production pull request.
Task: {task_description}
PR context:
{code_snippet}
Return ONLY valid JSON with keys:
review (string),
file_path (string from changed files),
issue_type (one of logic|security|performance|maintainability),
severity (one of low|medium|high|critical),
bug_type (one of syntax|logic|security|none),
line_number (integer),
confidence (0.0-1.0 float)
"""
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            temperature=0.0,
            messages=[{"role": "user", "content": prompt}],
        )
        raw = (response.choices[0].message.content or "").strip()
        # Strip Markdown code fences the model may wrap around the JSON.
        raw = raw.replace("```json", "").replace("```", "").strip()
        try:
            parsed = json.loads(raw)
        except ValueError:
            # The reply may surround the object with prose; retry on the
            # outermost {...} span before giving up.
            start, end = raw.find("{"), raw.rfind("}")
            if start == -1 or end <= start:
                return fallback
            parsed = json.loads(raw[start : end + 1])
    except Exception:
        # Best-effort boundary: any API or parsing failure yields the fallback.
        return fallback
    if not isinstance(parsed, dict):
        return fallback

    # Start from the fallback so one bad field does not discard the others.
    action: Dict[str, Any] = dict(fallback)
    for key in ("review", "file_path", "issue_type", "severity", "bug_type"):
        value = parsed.get(key)
        if value is not None:
            action[key] = str(value)
    try:
        action["line_number"] = int(parsed["line_number"])
    except (KeyError, TypeError, ValueError, OverflowError):
        pass
    try:
        confidence = float(parsed["confidence"])
    except (KeyError, TypeError, ValueError, OverflowError):
        pass
    else:
        if confidence == confidence:  # skip NaN, which never equals itself
            action["confidence"] = min(1.0, max(0.0, confidence))
    return action
def run_task(task_id: str, client: OpenAI | None) -> float:
    """Run one episode for *task_id* and return its final score.

    The episode is reset on the environment server, up to ``MAX_STEPS``
    actions are submitted, and the grader endpoint is queried. ``[START]``,
    ``[STEP]`` and ``[END]`` records are printed along the way.

    Args:
        task_id: Identifier of the task to run.
        client: OpenAI-compatible client used to build actions, or ``None``
            to submit the static fallback action.

    Returns:
        The larger of the grader score and the last step reward, clamped by
        ``_strict_score``. ``MIN_STRICT_SCORE`` is returned when an exception
        occurs during the episode.
    """
    episode_id = f"baseline-{task_id}"
    rewards: List[float] = []
    score = MIN_STRICT_SCORE
    success = False
    last_error: str | None = None
    steps_taken = 0
    log_start(task_id, BENCHMARK, MODEL_NAME)
    try:
        reset_data = _safe_json(
            "POST",
            f"{ENV_BASE_URL}/reset",
            json={"task_id": task_id, "episode_id": episode_id},
        )
        # _safe_json always returns a dict; it is the nested observation that
        # may be missing or null, so that is the value to validate.
        obs = reset_data.get("observation")
        if not isinstance(obs, dict):
            obs = {}
        task_description = str(obs.get("task_description", "Review code quality and bugs."))
        code_snippet = str(obs.get("code_snippet", ""))
        for step_num in range(1, MAX_STEPS + 1):
            action = _build_action(client, task_description, code_snippet)
            action_str = (
                f"file={action['file_path']};"
                f"issue={action['issue_type']};"
                f"sev={action['severity']};"
                f"bug_type={action['bug_type']};"
                f"line={action['line_number']};"
                f"confidence={float(action['confidence']):.2f}"
            )
            step_data = _safe_json(
                "POST",
                f"{ENV_BASE_URL}/step",
                json={
                    "action": action,
                    "task_id": task_id,
                    "episode_id": episode_id,
                },
            )
            reward = _strict_score(float(step_data.get("reward", MIN_STRICT_SCORE) or MIN_STRICT_SCORE))
            # An empty response (failed request) counts as a finished episode.
            done = bool(step_data.get("done", not bool(step_data)))
            obs_after = step_data.get("observation")
            # A null/non-dict observation must not abort an otherwise valid step.
            raw_error = obs_after.get("last_action_error") if isinstance(obs_after, dict) else None
            last_error = str(raw_error) if raw_error else None
            rewards.append(reward)
            steps_taken = step_num
            log_step(step_num, action_str, reward, done, last_error)
            if done:
                break
        # Pass identifiers via params so requests URL-encodes them.
        grader_data = _safe_json(
            "GET",
            f"{ENV_BASE_URL}/grader",
            params={"task_id": task_id, "episode_id": episode_id},
        )
        grader_score = _strict_score(float(grader_data.get("score", MIN_STRICT_SCORE) or MIN_STRICT_SCORE))
        step_score = _strict_score(rewards[-1] if rewards else MIN_STRICT_SCORE)
        score = _strict_score(max(grader_score, step_score))
        success = score >= SUCCESS_SCORE_THRESHOLD
    except Exception as exc:
        last_error = str(exc)
        if steps_taken == 0:
            # Emit a placeholder step so the log always has at least one
            # [STEP] record between [START] and [END].
            log_step(
                1,
                "bug_type=none;line=-1;confidence=0.00",
                MIN_STRICT_SCORE,
                True,
                last_error,
            )
            rewards.append(MIN_STRICT_SCORE)
            steps_taken = 1
        score = MIN_STRICT_SCORE
        success = False
    finally:
        log_end(success, max(1, steps_taken), score, rewards or [0.0])
    return score
def main() -> Dict[str, float]:
    """Run every task once and return the per-task scores plus their mean.

    An OpenAI client is created only when ``HF_TOKEN`` is set; otherwise
    ``None`` is passed to ``run_task``.

    Returns:
        A mapping of task id to score, with an extra ``"average"`` key holding
        the mean of the task scores rounded to four decimal places.
    """
    client = None
    if HF_TOKEN:
        client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    scores: Dict[str, float] = {
        task_id: run_task(task_id, client) for task_id in _task_list()
    }
    if scores:
        mean_score = sum(scores.values()) / len(scores)
    else:
        mean_score = 0.0
    scores["average"] = round(mean_score, 4)
    return scores
# Script entry point: run all tasks when executed directly. The dict returned
# by main() is discarded here; results are reported through the printed
# [START]/[STEP]/[END] records.
if __name__ == "__main__":
    main()
|