Siteshcodes committed on
Commit
2fbe4d0
·
1 Parent(s): 926a06f

fix: no exact 0.0 or 1.0 anywhere in rewards

Browse files
Files changed (2) hide show
  1. server/app.py +3 -4
  2. server/task.py +15 -49
server/app.py CHANGED
@@ -63,18 +63,17 @@ def task_hard():
63
  @app.post("/tasks/easy/reset")
64
  def reset_easy():
65
  bug = sample_bug("easy")
66
- return {"task_id": "easy", "bug_report": bug.dict(), "done": False, "reward": 0.0}
67
 
68
  @app.post("/tasks/medium/reset")
69
  def reset_medium():
70
  bug = sample_bug("medium")
71
- return {"task_id": "medium", "bug_report": bug.dict(), "done": False, "reward": 0.0}
72
 
73
  @app.post("/tasks/hard/reset")
74
  def reset_hard():
75
  bug = sample_bug("hard")
76
- return {"task_id": "hard", "bug_report": bug.dict(), "done": False, "reward": 0.0}
77
-
78
  def main():
79
  import uvicorn
80
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
63
  @app.post("/tasks/easy/reset")
64
  def reset_easy():
65
  bug = sample_bug("easy")
66
+ return {"task_id": "easy", "bug_report": bug.dict(), "done": False, "reward": 0.05}
67
 
68
  @app.post("/tasks/medium/reset")
69
  def reset_medium():
70
  bug = sample_bug("medium")
71
+ return {"task_id": "medium", "bug_report": bug.dict(), "done": False, "reward": 0.05}
72
 
73
  @app.post("/tasks/hard/reset")
74
  def reset_hard():
75
  bug = sample_bug("hard")
76
+ return {"task_id": "hard", "bug_report": bug.dict(), "done": False, "reward": 0.05}
 
77
  def main():
78
  import uvicorn
79
  uvicorn.run(app, host="0.0.0.0", port=7860)
server/task.py CHANGED
@@ -245,94 +245,60 @@ PRIORITY_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3}
245
 
246
 
247
  def _priority_score(predicted: str, correct: str) -> float:
248
- """Exact match = 1.0, one level off = 0.5, two+ off = 0.0"""
249
  if predicted == correct:
250
- return 1.0
251
  diff = abs(PRIORITY_ORDER.get(predicted, 99) - PRIORITY_ORDER.get(correct, 99))
252
- return 0.5 if diff == 1 else 0.0
253
 
254
 
255
  def _label_score(predicted: List[str], correct: List[str]) -> float:
256
- """Jaccard similarity between predicted and correct label sets."""
257
  pred_set = set(l.lower() for l in predicted)
258
  corr_set = set(l.lower() for l in correct)
259
  if not corr_set:
260
- return 1.0
261
  intersection = pred_set & corr_set
262
  union = pred_set | corr_set
263
- return len(intersection) / len(union)
 
264
 
265
 
266
- def grade_action(
267
- task_key: str, bug: BugReport, action: TriageAction
268
- ) -> Tuple[float, str]:
269
- """
270
- Returns (score: 0.0–1.0, feedback: str)
271
-
272
- Easy — priority only (100%)
273
- Medium — priority (45%) + labels (40%) + team routing (15%)
274
- Hard — priority (35%) + labels (30%) + team (20%) + milestone (15%)
275
- with -0.15 penalty for missing security escalation
276
- """
277
  answer = TASKS[task_key]["answers"][bug.id]
278
  feedback_parts = []
279
 
280
  if task_key == "easy":
281
- # Only grade priority
282
  score = _priority_score(action.priority, answer["priority"])
283
- symbol = "✓" if score == 1.0 else "~" if score == 0.5 else "✗"
284
- feedback_parts.append(
285
- f"Priority: {symbol} (got {action.priority}, expected {answer['priority']})"
286
- )
287
  return round(score, 3), " | ".join(feedback_parts)
288
 
289
  elif task_key == "medium":
290
- # Priority (45%) + labels (40%) + team routing (15%)
291
  p_score = _priority_score(action.priority, answer["priority"])
292
  l_score = _label_score(action.labels, answer["labels"])
293
-
294
  expected_team = answer.get("assigned_team", "")
295
- t_score = (
296
- 1.0
297
- if expected_team and action.assigned_team.lower() == expected_team.lower()
298
- else 0.0
299
- )
300
-
301
  score = 0.45 * p_score + 0.40 * l_score + 0.15 * t_score
302
-
303
  feedback_parts.append(f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
304
  feedback_parts.append(f"Labels: {l_score:.2f}")
305
  feedback_parts.append(f"Team: {t_score:.2f} (got {action.assigned_team}, expected {expected_team})")
306
-
307
  return round(score, 3), " | ".join(feedback_parts)
308
 
309
  else: # hard
310
- # Priority (35%) + labels (30%) + team (20%) + milestone (15%)
311
  p_score = _priority_score(action.priority, answer["priority"])
312
  l_score = _label_score(action.labels, answer["labels"])
313
- t_score = (
314
- 1.0
315
- if action.assigned_team.lower() == answer["assigned_team"].lower()
316
- else 0.0
317
- )
318
- m_score = (
319
- 1.0
320
- if action.milestone.lower() == answer["milestone"].lower()
321
- else 0.0
322
- )
323
-
324
  score = 0.35 * p_score + 0.30 * l_score + 0.20 * t_score + 0.15 * m_score
325
-
326
  feedback_parts.append(f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
327
  feedback_parts.append(f"Labels: {l_score:.2f}")
328
  feedback_parts.append(f"Team: {t_score:.2f} (got {action.assigned_team}, expected {answer['assigned_team']})")
329
  feedback_parts.append(f"Milestone: {m_score:.2f} (got {action.milestone}, expected {answer['milestone']})")
330
-
331
- # Penalty: missing security escalation on security bugs
332
  if answer.get("assigned_team") == "security" and action.assigned_team.lower() != "security":
333
- score = max(0.0, score - 0.15)
334
  feedback_parts.append("⚠ Security escalation missed (-0.15)")
335
-
336
  return round(score, 3), " | ".join(feedback_parts)
337
 
338
  def priority_match(*args, **kwargs):
 
245
 
246
 
247
  def _priority_score(predicted: str, correct: str) -> float:
 
248
  if predicted == correct:
249
+ return 0.95
250
  diff = abs(PRIORITY_ORDER.get(predicted, 99) - PRIORITY_ORDER.get(correct, 99))
251
+ return 0.5 if diff == 1 else 0.05
252
 
253
 
254
  def _label_score(predicted: List[str], correct: List[str]) -> float:
 
255
  pred_set = set(l.lower() for l in predicted)
256
  corr_set = set(l.lower() for l in correct)
257
  if not corr_set:
258
+ return 0.95
259
  intersection = pred_set & corr_set
260
  union = pred_set | corr_set
261
+ raw = len(intersection) / len(union)
262
+ return max(0.05, min(0.95, raw))
263
 
264
 
265
+ def grade_action(task_key, bug, action):
 
 
 
 
 
 
 
 
 
 
266
  answer = TASKS[task_key]["answers"][bug.id]
267
  feedback_parts = []
268
 
269
  if task_key == "easy":
 
270
  score = _priority_score(action.priority, answer["priority"])
271
+ symbol = "✓" if score >= 0.9 else "~" if score >= 0.4 else "✗"
272
+ feedback_parts.append(f"Priority: {symbol} (got {action.priority}, expected {answer['priority']})")
273
+ score = max(0.05, min(0.95, score))
 
274
  return round(score, 3), " | ".join(feedback_parts)
275
 
276
  elif task_key == "medium":
 
277
  p_score = _priority_score(action.priority, answer["priority"])
278
  l_score = _label_score(action.labels, answer["labels"])
 
279
  expected_team = answer.get("assigned_team", "")
280
+ t_score = 0.95 if expected_team and action.assigned_team.lower() == expected_team.lower() else 0.05
 
 
 
 
 
281
  score = 0.45 * p_score + 0.40 * l_score + 0.15 * t_score
 
282
  feedback_parts.append(f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
283
  feedback_parts.append(f"Labels: {l_score:.2f}")
284
  feedback_parts.append(f"Team: {t_score:.2f} (got {action.assigned_team}, expected {expected_team})")
285
+ score = max(0.05, min(0.95, score))
286
  return round(score, 3), " | ".join(feedback_parts)
287
 
288
  else: # hard
 
289
  p_score = _priority_score(action.priority, answer["priority"])
290
  l_score = _label_score(action.labels, answer["labels"])
291
+ t_score = 0.95 if action.assigned_team.lower() == answer["assigned_team"].lower() else 0.05
292
+ m_score = 0.95 if action.milestone.lower() == answer["milestone"].lower() else 0.05
 
 
 
 
 
 
 
 
 
293
  score = 0.35 * p_score + 0.30 * l_score + 0.20 * t_score + 0.15 * m_score
 
294
  feedback_parts.append(f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
295
  feedback_parts.append(f"Labels: {l_score:.2f}")
296
  feedback_parts.append(f"Team: {t_score:.2f} (got {action.assigned_team}, expected {answer['assigned_team']})")
297
  feedback_parts.append(f"Milestone: {m_score:.2f} (got {action.milestone}, expected {answer['milestone']})")
 
 
298
  if answer.get("assigned_team") == "security" and action.assigned_team.lower() != "security":
299
+ score = max(0.05, score - 0.15)
300
  feedback_parts.append("⚠ Security escalation missed (-0.15)")
301
+ score = max(0.05, min(0.95, score))
302
  return round(score, 3), " | ".join(feedback_parts)
303
 
304
  def priority_match(*args, **kwargs):