Spaces:
Sleeping
Sleeping
Commit ·
2fbe4d0
1
Parent(s): 926a06f
fix: no exact 0.0 or 1.0 anywhere in rewards
Browse files- server/app.py +3 -4
- server/task.py +15 -49
server/app.py
CHANGED
|
@@ -63,18 +63,17 @@ def task_hard():
|
|
| 63 |
@app.post("/tasks/easy/reset")
|
| 64 |
def reset_easy():
|
| 65 |
bug = sample_bug("easy")
|
| 66 |
-
return {"task_id": "easy", "bug_report": bug.dict(), "done": False, "reward": 0.0}
|
| 67 |
|
| 68 |
@app.post("/tasks/medium/reset")
|
| 69 |
def reset_medium():
|
| 70 |
bug = sample_bug("medium")
|
| 71 |
-
return {"task_id": "medium", "bug_report": bug.dict(), "done": False, "reward": 0.0}
|
| 72 |
|
| 73 |
@app.post("/tasks/hard/reset")
|
| 74 |
def reset_hard():
|
| 75 |
bug = sample_bug("hard")
|
| 76 |
-
return {"task_id": "hard", "bug_report": bug.dict(), "done": False, "reward": 0.0}
|
| 77 |
-
|
| 78 |
def main():
|
| 79 |
import uvicorn
|
| 80 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
| 63 |
@app.post("/tasks/easy/reset")
|
| 64 |
def reset_easy():
|
| 65 |
bug = sample_bug("easy")
|
| 66 |
+
return {"task_id": "easy", "bug_report": bug.dict(), "done": False, "reward": 0.05}
|
| 67 |
|
| 68 |
@app.post("/tasks/medium/reset")
|
| 69 |
def reset_medium():
|
| 70 |
bug = sample_bug("medium")
|
| 71 |
+
return {"task_id": "medium", "bug_report": bug.dict(), "done": False, "reward": 0.05}
|
| 72 |
|
| 73 |
@app.post("/tasks/hard/reset")
|
| 74 |
def reset_hard():
|
| 75 |
bug = sample_bug("hard")
|
| 76 |
+
return {"task_id": "hard", "bug_report": bug.dict(), "done": False, "reward": 0.05}
|
|
|
|
| 77 |
def main():
|
| 78 |
import uvicorn
|
| 79 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
server/task.py
CHANGED
|
@@ -245,94 +245,60 @@ PRIORITY_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3}
|
|
| 245 |
|
| 246 |
|
| 247 |
def _priority_score(predicted: str, correct: str) -> float:
|
| 248 |
-
"""Exact match = 1.0, one level off = 0.5, two+ off = 0.0"""
|
| 249 |
if predicted == correct:
|
| 250 |
-
return 1.0
|
| 251 |
diff = abs(PRIORITY_ORDER.get(predicted, 99) - PRIORITY_ORDER.get(correct, 99))
|
| 252 |
-
return 0.5 if diff == 1 else 0.0
|
| 253 |
|
| 254 |
|
| 255 |
def _label_score(predicted: List[str], correct: List[str]) -> float:
|
| 256 |
-
"""Jaccard similarity between predicted and correct label sets."""
|
| 257 |
pred_set = set(l.lower() for l in predicted)
|
| 258 |
corr_set = set(l.lower() for l in correct)
|
| 259 |
if not corr_set:
|
| 260 |
-
return 1.0
|
| 261 |
intersection = pred_set & corr_set
|
| 262 |
union = pred_set | corr_set
|
| 263 |
-
|
|
|
|
| 264 |
|
| 265 |
|
| 266 |
-
def grade_action(
|
| 267 |
-
task_key: str, bug: BugReport, action: TriageAction
|
| 268 |
-
) -> Tuple[float, str]:
|
| 269 |
-
"""
|
| 270 |
-
Returns (score: 0.0–1.0, feedback: str)
|
| 271 |
-
|
| 272 |
-
Easy — priority only (100%)
|
| 273 |
-
Medium — priority (45%) + labels (40%) + team routing (15%)
|
| 274 |
-
Hard — priority (35%) + labels (30%) + team (20%) + milestone (15%)
|
| 275 |
-
with -0.15 penalty for missing security escalation
|
| 276 |
-
"""
|
| 277 |
answer = TASKS[task_key]["answers"][bug.id]
|
| 278 |
feedback_parts = []
|
| 279 |
|
| 280 |
if task_key == "easy":
|
| 281 |
-
# Only grade priority
|
| 282 |
score = _priority_score(action.priority, answer["priority"])
|
| 283 |
-
symbol = "β" if score =
|
| 284 |
-
feedback_parts.append(
|
| 285 |
-
|
| 286 |
-
)
|
| 287 |
return round(score, 3), " | ".join(feedback_parts)
|
| 288 |
|
| 289 |
elif task_key == "medium":
|
| 290 |
-
# Priority (45%) + labels (40%) + team routing (15%)
|
| 291 |
p_score = _priority_score(action.priority, answer["priority"])
|
| 292 |
l_score = _label_score(action.labels, answer["labels"])
|
| 293 |
-
|
| 294 |
expected_team = answer.get("assigned_team", "")
|
| 295 |
-
t_score = (
|
| 296 |
-
1.0
|
| 297 |
-
if expected_team and action.assigned_team.lower() == expected_team.lower()
|
| 298 |
-
else 0.0
|
| 299 |
-
)
|
| 300 |
-
|
| 301 |
score = 0.45 * p_score + 0.40 * l_score + 0.15 * t_score
|
| 302 |
-
|
| 303 |
feedback_parts.append(f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
|
| 304 |
feedback_parts.append(f"Labels: {l_score:.2f}")
|
| 305 |
feedback_parts.append(f"Team: {t_score:.2f} (got {action.assigned_team}, expected {expected_team})")
|
| 306 |
-
|
| 307 |
return round(score, 3), " | ".join(feedback_parts)
|
| 308 |
|
| 309 |
else: # hard
|
| 310 |
-
# Priority (35%) + labels (30%) + team (20%) + milestone (15%)
|
| 311 |
p_score = _priority_score(action.priority, answer["priority"])
|
| 312 |
l_score = _label_score(action.labels, answer["labels"])
|
| 313 |
-
t_score = (
|
| 314 |
-
|
| 315 |
-
if action.assigned_team.lower() == answer["assigned_team"].lower()
|
| 316 |
-
else 0.0
|
| 317 |
-
)
|
| 318 |
-
m_score = (
|
| 319 |
-
1.0
|
| 320 |
-
if action.milestone.lower() == answer["milestone"].lower()
|
| 321 |
-
else 0.0
|
| 322 |
-
)
|
| 323 |
-
|
| 324 |
score = 0.35 * p_score + 0.30 * l_score + 0.20 * t_score + 0.15 * m_score
|
| 325 |
-
|
| 326 |
feedback_parts.append(f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
|
| 327 |
feedback_parts.append(f"Labels: {l_score:.2f}")
|
| 328 |
feedback_parts.append(f"Team: {t_score:.2f} (got {action.assigned_team}, expected {answer['assigned_team']})")
|
| 329 |
feedback_parts.append(f"Milestone: {m_score:.2f} (got {action.milestone}, expected {answer['milestone']})")
|
| 330 |
-
|
| 331 |
-
# Penalty: missing security escalation on security bugs
|
| 332 |
if answer.get("assigned_team") == "security" and action.assigned_team.lower() != "security":
|
| 333 |
-
score = max(0.0, score - 0.15)
|
| 334 |
feedback_parts.append("⚠ Security escalation missed (-0.15)")
|
| 335 |
-
|
| 336 |
return round(score, 3), " | ".join(feedback_parts)
|
| 337 |
|
| 338 |
def priority_match(*args, **kwargs):
|
|
|
|
| 245 |
|
| 246 |
|
| 247 |
def _priority_score(predicted: str, correct: str) -> float:
|
|
|
|
| 248 |
if predicted == correct:
|
| 249 |
+
return 0.95
|
| 250 |
diff = abs(PRIORITY_ORDER.get(predicted, 99) - PRIORITY_ORDER.get(correct, 99))
|
| 251 |
+
return 0.5 if diff == 1 else 0.05
|
| 252 |
|
| 253 |
|
| 254 |
def _label_score(predicted: List[str], correct: List[str]) -> float:
|
|
|
|
| 255 |
pred_set = set(l.lower() for l in predicted)
|
| 256 |
corr_set = set(l.lower() for l in correct)
|
| 257 |
if not corr_set:
|
| 258 |
+
return 0.95
|
| 259 |
intersection = pred_set & corr_set
|
| 260 |
union = pred_set | corr_set
|
| 261 |
+
raw = len(intersection) / len(union)
|
| 262 |
+
return max(0.05, min(0.95, raw))
|
| 263 |
|
| 264 |
|
| 265 |
+
def grade_action(task_key, bug, action):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
answer = TASKS[task_key]["answers"][bug.id]
|
| 267 |
feedback_parts = []
|
| 268 |
|
| 269 |
if task_key == "easy":
|
|
|
|
| 270 |
score = _priority_score(action.priority, answer["priority"])
|
| 271 |
+
symbol = "✓" if score >= 0.9 else "~" if score >= 0.4 else "✗"
|
| 272 |
+
feedback_parts.append(f"Priority: {symbol} (got {action.priority}, expected {answer['priority']})")
|
| 273 |
+
score = max(0.05, min(0.95, score))
|
|
|
|
| 274 |
return round(score, 3), " | ".join(feedback_parts)
|
| 275 |
|
| 276 |
elif task_key == "medium":
|
|
|
|
| 277 |
p_score = _priority_score(action.priority, answer["priority"])
|
| 278 |
l_score = _label_score(action.labels, answer["labels"])
|
|
|
|
| 279 |
expected_team = answer.get("assigned_team", "")
|
| 280 |
+
t_score = 0.95 if expected_team and action.assigned_team.lower() == expected_team.lower() else 0.05
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
score = 0.45 * p_score + 0.40 * l_score + 0.15 * t_score
|
|
|
|
| 282 |
feedback_parts.append(f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
|
| 283 |
feedback_parts.append(f"Labels: {l_score:.2f}")
|
| 284 |
feedback_parts.append(f"Team: {t_score:.2f} (got {action.assigned_team}, expected {expected_team})")
|
| 285 |
+
score = max(0.05, min(0.95, score))
|
| 286 |
return round(score, 3), " | ".join(feedback_parts)
|
| 287 |
|
| 288 |
else: # hard
|
|
|
|
| 289 |
p_score = _priority_score(action.priority, answer["priority"])
|
| 290 |
l_score = _label_score(action.labels, answer["labels"])
|
| 291 |
+
t_score = 0.95 if action.assigned_team.lower() == answer["assigned_team"].lower() else 0.05
|
| 292 |
+
m_score = 0.95 if action.milestone.lower() == answer["milestone"].lower() else 0.05
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
score = 0.35 * p_score + 0.30 * l_score + 0.20 * t_score + 0.15 * m_score
|
|
|
|
| 294 |
feedback_parts.append(f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
|
| 295 |
feedback_parts.append(f"Labels: {l_score:.2f}")
|
| 296 |
feedback_parts.append(f"Team: {t_score:.2f} (got {action.assigned_team}, expected {answer['assigned_team']})")
|
| 297 |
feedback_parts.append(f"Milestone: {m_score:.2f} (got {action.milestone}, expected {answer['milestone']})")
|
|
|
|
|
|
|
| 298 |
if answer.get("assigned_team") == "security" and action.assigned_team.lower() != "security":
|
| 299 |
+
score = max(0.05, score - 0.15)
|
| 300 |
feedback_parts.append("⚠ Security escalation missed (-0.15)")
|
| 301 |
+
score = max(0.05, min(0.95, score))
|
| 302 |
return round(score, 3), " | ".join(feedback_parts)
|
| 303 |
|
| 304 |
def priority_match(*args, **kwargs):
|