Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
bug_triage_env/graders/task1_grader.py
CHANGED
|
@@ -19,13 +19,13 @@ def grade(episode_log: List[Dict[str, Any]], ground_truth: Dict[str, Any]) -> float:
|
|
| 19 |
Score in [0.0, 1.0].
|
| 20 |
"""
|
| 21 |
if not episode_log:
|
| 22 |
-
return 0.
|
| 23 |
|
| 24 |
last_action = episode_log[-1]
|
| 25 |
predicted = (last_action.get("bug_type") or "").strip().lower()
|
| 26 |
expected = (ground_truth.get("bug_type") or "").strip().lower()
|
| 27 |
|
| 28 |
if not predicted:
|
| 29 |
-
return 0.
|
| 30 |
|
| 31 |
-
return
|
|
|
|
| 19 |
Score in [0.0, 1.0].
|
| 20 |
"""
|
| 21 |
if not episode_log:
|
| 22 |
+
return 0.0
|
| 23 |
|
| 24 |
last_action = episode_log[-1]
|
| 25 |
predicted = (last_action.get("bug_type") or "").strip().lower()
|
| 26 |
expected = (ground_truth.get("bug_type") or "").strip().lower()
|
| 27 |
|
| 28 |
if not predicted:
|
| 29 |
+
return 0.0
|
| 30 |
|
| 31 |
+
return 1.0 if predicted == expected else 0.01
|
bug_triage_env/graders/task2_grader.py
CHANGED
|
@@ -27,17 +27,17 @@ def grade(episode_log: List[Dict[str, Any]], ground_truth: Dict[str, Any]) -> float:
|
|
| 27 |
float score in [0.0, 1.0].
|
| 28 |
"""
|
| 29 |
if not episode_log:
|
| 30 |
-
return 0.
|
| 31 |
|
| 32 |
last_action = episode_log[-1]
|
| 33 |
predicted = (last_action.get("priority") or "").strip().lower()
|
| 34 |
expected = (ground_truth.get("priority") or "").strip().lower()
|
| 35 |
|
| 36 |
if not predicted or predicted not in PRIORITY_ORDER:
|
| 37 |
-
return 0.
|
| 38 |
if expected not in PRIORITY_ORDER:
|
| 39 |
-
return 0.
|
| 40 |
|
| 41 |
diff = abs(PRIORITY_ORDER[predicted] - PRIORITY_ORDER[expected])
|
| 42 |
score = 1.0 - diff * (1.0 / 3.0)
|
| 43 |
-
return max(0.
|
|
|
|
| 27 |
float score in [0.0, 1.0].
|
| 28 |
"""
|
| 29 |
if not episode_log:
|
| 30 |
+
return 0.0
|
| 31 |
|
| 32 |
last_action = episode_log[-1]
|
| 33 |
predicted = (last_action.get("priority") or "").strip().lower()
|
| 34 |
expected = (ground_truth.get("priority") or "").strip().lower()
|
| 35 |
|
| 36 |
if not predicted or predicted not in PRIORITY_ORDER:
|
| 37 |
+
return 0.0
|
| 38 |
if expected not in PRIORITY_ORDER:
|
| 39 |
+
return 0.0
|
| 40 |
|
| 41 |
diff = abs(PRIORITY_ORDER[predicted] - PRIORITY_ORDER[expected])
|
| 42 |
score = 1.0 - diff * (1.0 / 3.0)
|
| 43 |
+
return max(0.0, min(1.0, score))
|
bug_triage_env/graders/task3_grader.py
CHANGED
|
@@ -85,7 +85,7 @@ def grade(episode_log: List[Dict[str, Any]], ground_truth: Dict[str, Any]) -> float:
|
|
| 85 |
float score in [0.0, 1.0].
|
| 86 |
"""
|
| 87 |
if not episode_log:
|
| 88 |
-
return 0.
|
| 89 |
|
| 90 |
action = episode_log[-1]
|
| 91 |
|
|
@@ -111,4 +111,4 @@ def grade(episode_log: List[Dict[str, Any]], ground_truth: Dict[str, Any]) -> float:
|
|
| 111 |
+ WEIGHTS["action"] * s_act
|
| 112 |
)
|
| 113 |
|
| 114 |
-
return max(0.
|
|
|
|
| 85 |
float score in [0.0, 1.0].
|
| 86 |
"""
|
| 87 |
if not episode_log:
|
| 88 |
+
return 0.0
|
| 89 |
|
| 90 |
action = episode_log[-1]
|
| 91 |
|
|
|
|
| 111 |
+ WEIGHTS["action"] * s_act
|
| 112 |
)
|
| 113 |
|
| 114 |
+
return max(0.0, min(1.0, round(score, 4)))
|
bug_triage_env/server/environment.py
CHANGED
|
@@ -117,15 +117,12 @@ class BugTriageEnvironment(_OpenEnvBase):
|
|
| 117 |
grader_fn = GRADERS.get(ep["task_id"], GRADERS["task_1"])
|
| 118 |
grader_score = grader_fn(ep["actions"], ep["ground_truth"])
|
| 119 |
|
| 120 |
-
#
|
| 121 |
-
reward = (grader_score * 1.5) - 0.5
|
| 122 |
-
|
| 123 |
-
# Confidence calibration bonus/penalty
|
| 124 |
calibration_bonus = self._compute_calibration_bonus(
|
| 125 |
action.confidence, grader_score
|
| 126 |
)
|
| 127 |
-
reward
|
| 128 |
-
reward = max(
|
| 129 |
|
| 130 |
# Build feedback string
|
| 131 |
feedback = self._build_feedback(
|
|
|
|
| 117 |
grader_fn = GRADERS.get(ep["task_id"], GRADERS["task_1"])
|
| 118 |
grader_score = grader_fn(ep["actions"], ep["ground_truth"])
|
| 119 |
|
| 120 |
+
# Bound the reward strictly to [0.0, 1.0] to pass Hackathon Phase 2
|
|
|
|
|
|
|
|
|
|
| 121 |
calibration_bonus = self._compute_calibration_bonus(
|
| 122 |
action.confidence, grader_score
|
| 123 |
)
|
| 124 |
+
reward = grader_score + calibration_bonus
|
| 125 |
+
reward = max(0.0, min(1.0, reward))
|
| 126 |
|
| 127 |
# Build feedback string
|
| 128 |
feedback = self._build_feedback(
|