Spaces:
Sleeping
Sleeping
fix: clamp all task scores to the strict (0, 1) range — never 0.0 or 1.0. Changes: grader.py: add _clamp() helper, apply to all sub-scores and total; models.py: update Pydantic fields to gt/lt strict bounds; inference.py: clamp avg_reward and final_score. Fixes Phase 2 'task scores out of range' validation error.
Browse files
- grader.py +23 -13
- inference.py +4 -2
- models.py +16 -16
grader.py
CHANGED
|
@@ -6,7 +6,7 @@ Evaluates agent responses on three axes:
|
|
| 6 |
- Tone (positive vs. negative signal detection)
|
| 7 |
- Completeness (checklist of required response elements)
|
| 8 |
|
| 9 |
-
Returns a RewardBreakdown with a total score in
|
| 10 |
"""
|
| 11 |
|
| 12 |
import re
|
|
@@ -15,6 +15,16 @@ from typing import Any, Dict, List
|
|
| 15 |
from models import RewardBreakdown
|
| 16 |
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
def _normalise(text: str) -> str:
|
| 19 |
"""Lower-case and strip extra whitespace for matching."""
|
| 20 |
return re.sub(r"\s+", " ", text.strip().lower())
|
|
@@ -293,23 +303,23 @@ def grade_response(
|
|
| 293 |
conversation_history: Previous messages
|
| 294 |
|
| 295 |
Returns:
|
| 296 |
-
RewardBreakdown with scores in
|
| 297 |
"""
|
| 298 |
-
# Score each axis
|
| 299 |
-
correctness_raw = _score_correctness(
|
| 300 |
response,
|
| 301 |
grading_rubric.get("correctness", {}),
|
| 302 |
-
)
|
| 303 |
-
tone_raw = _score_tone(
|
| 304 |
response,
|
| 305 |
grading_rubric.get("tone", {}),
|
| 306 |
-
)
|
| 307 |
-
completeness_raw = _score_completeness(
|
| 308 |
response,
|
| 309 |
grading_rubric.get("completeness", {}),
|
| 310 |
ticket_info,
|
| 311 |
conversation_history,
|
| 312 |
-
)
|
| 313 |
|
| 314 |
# Get weights
|
| 315 |
w_correctness = grading_rubric.get("correctness", {}).get("weight", 0.33)
|
|
@@ -319,15 +329,15 @@ def grade_response(
|
|
| 319 |
# Compute penalties
|
| 320 |
penalties = _compute_penalties(response, conversation_history)
|
| 321 |
|
| 322 |
-
# Weighted total (before penalties)
|
| 323 |
-
weighted = (
|
| 324 |
correctness_raw * w_correctness
|
| 325 |
+ tone_raw * w_tone
|
| 326 |
+ completeness_raw * w_completeness
|
| 327 |
)
|
| 328 |
|
| 329 |
-
# Apply penalties
|
| 330 |
-
total =
|
| 331 |
|
| 332 |
# Build explanation
|
| 333 |
parts = []
|
|
|
|
| 6 |
- Tone (positive vs. negative signal detection)
|
| 7 |
- Completeness (checklist of required response elements)
|
| 8 |
|
| 9 |
+
Returns a RewardBreakdown with a total score in (0.0, 1.0) — strict open interval.
|
| 10 |
"""
|
| 11 |
|
| 12 |
import re
|
|
|
|
| 15 |
from models import RewardBreakdown
|
| 16 |
|
| 17 |
|
| 18 |
+
# Strict open-interval clamp: scores must never be exactly 0.0 or 1.0
|
| 19 |
+
_SCORE_MIN = 0.01
|
| 20 |
+
_SCORE_MAX = 0.99
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _clamp(value: float, lo: float = _SCORE_MIN, hi: float = _SCORE_MAX) -> float:
|
| 24 |
+
"""Clamp *value* into the strict open interval (0, 1)."""
|
| 25 |
+
return max(lo, min(hi, float(value)))
|
| 26 |
+
|
| 27 |
+
|
| 28 |
def _normalise(text: str) -> str:
|
| 29 |
"""Lower-case and strip extra whitespace for matching."""
|
| 30 |
return re.sub(r"\s+", " ", text.strip().lower())
|
|
|
|
| 303 |
conversation_history: Previous messages
|
| 304 |
|
| 305 |
Returns:
|
| 306 |
+
RewardBreakdown with scores in strict (0.0, 1.0) open interval
|
| 307 |
"""
|
| 308 |
+
# Score each axis and clamp to strict (0, 1)
|
| 309 |
+
correctness_raw = _clamp(_score_correctness(
|
| 310 |
response,
|
| 311 |
grading_rubric.get("correctness", {}),
|
| 312 |
+
))
|
| 313 |
+
tone_raw = _clamp(_score_tone(
|
| 314 |
response,
|
| 315 |
grading_rubric.get("tone", {}),
|
| 316 |
+
))
|
| 317 |
+
completeness_raw = _clamp(_score_completeness(
|
| 318 |
response,
|
| 319 |
grading_rubric.get("completeness", {}),
|
| 320 |
ticket_info,
|
| 321 |
conversation_history,
|
| 322 |
+
))
|
| 323 |
|
| 324 |
# Get weights
|
| 325 |
w_correctness = grading_rubric.get("correctness", {}).get("weight", 0.33)
|
|
|
|
| 329 |
# Compute penalties
|
| 330 |
penalties = _compute_penalties(response, conversation_history)
|
| 331 |
|
| 332 |
+
# Weighted total (before penalties) — clamped
|
| 333 |
+
weighted = _clamp(
|
| 334 |
correctness_raw * w_correctness
|
| 335 |
+ tone_raw * w_tone
|
| 336 |
+ completeness_raw * w_completeness
|
| 337 |
)
|
| 338 |
|
| 339 |
+
# Apply penalties — clamped to strict (0, 1)
|
| 340 |
+
total = _clamp(weighted + penalties)
|
| 341 |
|
| 342 |
# Build explanation
|
| 343 |
parts = []
|
inference.py
CHANGED
|
@@ -323,8 +323,9 @@ def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
|
|
| 323 |
f"done={done}"
|
| 324 |
)
|
| 325 |
|
| 326 |
-
# Compute average reward for this task
|
| 327 |
avg_reward = total_reward / max(step_count, 1)
|
|
|
|
| 328 |
elapsed = time.time() - start_time
|
| 329 |
|
| 330 |
logger.info(
|
|
@@ -411,7 +412,8 @@ def main():
|
|
| 411 |
)
|
| 412 |
total_avg += r.get("avg_reward", 0)
|
| 413 |
|
| 414 |
-
final_score = total_avg / len(results) if results else 0.
|
|
|
|
| 415 |
logger.info("-" * 60)
|
| 416 |
logger.info(f" FINAL SCORE: {final_score:.4f} (0.0 -- 1.0)")
|
| 417 |
logger.info("=" * 60)
|
|
|
|
| 323 |
f"done={done}"
|
| 324 |
)
|
| 325 |
|
| 326 |
+
# Compute average reward for this task — clamped to strict (0, 1)
|
| 327 |
avg_reward = total_reward / max(step_count, 1)
|
| 328 |
+
avg_reward = max(0.01, min(0.99, avg_reward))
|
| 329 |
elapsed = time.time() - start_time
|
| 330 |
|
| 331 |
logger.info(
|
|
|
|
| 412 |
)
|
| 413 |
total_avg += r.get("avg_reward", 0)
|
| 414 |
|
| 415 |
+
final_score = total_avg / len(results) if results else 0.01
|
| 416 |
+
final_score = max(0.01, min(0.99, final_score)) # strict (0, 1)
|
| 417 |
logger.info("-" * 60)
|
| 418 |
logger.info(f" FINAL SCORE: {final_score:.4f} (0.0 -- 1.0)")
|
| 419 |
logger.info("=" * 60)
|
models.py
CHANGED
|
@@ -132,24 +132,24 @@ class SupportObservation(BaseModel):
|
|
| 132 |
class RewardBreakdown(BaseModel):
|
| 133 |
"""Detailed breakdown of the reward score."""
|
| 134 |
correctness: float = Field(
|
| 135 |
-
default=0.
|
| 136 |
-
|
| 137 |
-
description="Score for factual correctness (0
|
| 138 |
)
|
| 139 |
tone: float = Field(
|
| 140 |
-
default=0.
|
| 141 |
-
|
| 142 |
-
description="Score for professional tone (0
|
| 143 |
)
|
| 144 |
completeness: float = Field(
|
| 145 |
-
default=0.
|
| 146 |
-
|
| 147 |
-
description="Score for response completeness (0
|
| 148 |
)
|
| 149 |
efficiency: float = Field(
|
| 150 |
-
default=0.
|
| 151 |
-
|
| 152 |
-
description="Score for resolution efficiency (0
|
| 153 |
)
|
| 154 |
penalties: float = Field(
|
| 155 |
default=0.0,
|
|
@@ -157,9 +157,9 @@ class RewardBreakdown(BaseModel):
|
|
| 157 |
description="Penalty deductions (negative value)",
|
| 158 |
)
|
| 159 |
total: float = Field(
|
| 160 |
-
default=0.
|
| 161 |
-
|
| 162 |
-
description="Overall weighted score (0
|
| 163 |
)
|
| 164 |
explanation: str = Field(
|
| 165 |
default="",
|
|
@@ -200,6 +200,6 @@ class SupportState(BaseModel):
|
|
| 200 |
class StepResult(BaseModel):
|
| 201 |
"""Result returned from step(), matching OpenEnv convention."""
|
| 202 |
observation: SupportObservation
|
| 203 |
-
reward: float = Field(
|
| 204 |
done: bool
|
| 205 |
info: Dict[str, Any] = Field(default_factory=dict)
|
|
|
|
| 132 |
class RewardBreakdown(BaseModel):
|
| 133 |
"""Detailed breakdown of the reward score."""
|
| 134 |
correctness: float = Field(
|
| 135 |
+
default=0.01,
|
| 136 |
+
gt=0.0, lt=1.0,
|
| 137 |
+
description="Score for factual correctness — strict (0, 1)",
|
| 138 |
)
|
| 139 |
tone: float = Field(
|
| 140 |
+
default=0.01,
|
| 141 |
+
gt=0.0, lt=1.0,
|
| 142 |
+
description="Score for professional tone — strict (0, 1)",
|
| 143 |
)
|
| 144 |
completeness: float = Field(
|
| 145 |
+
default=0.01,
|
| 146 |
+
gt=0.0, lt=1.0,
|
| 147 |
+
description="Score for response completeness — strict (0, 1)",
|
| 148 |
)
|
| 149 |
efficiency: float = Field(
|
| 150 |
+
default=0.01,
|
| 151 |
+
gt=0.0, lt=1.0,
|
| 152 |
+
description="Score for resolution efficiency — strict (0, 1)",
|
| 153 |
)
|
| 154 |
penalties: float = Field(
|
| 155 |
default=0.0,
|
|
|
|
| 157 |
description="Penalty deductions (negative value)",
|
| 158 |
)
|
| 159 |
total: float = Field(
|
| 160 |
+
default=0.01,
|
| 161 |
+
gt=0.0, lt=1.0,
|
| 162 |
+
description="Overall weighted score — strict (0, 1)",
|
| 163 |
)
|
| 164 |
explanation: str = Field(
|
| 165 |
default="",
|
|
|
|
| 200 |
class StepResult(BaseModel):
|
| 201 |
"""Result returned from step(), matching OpenEnv convention."""
|
| 202 |
observation: SupportObservation
|
| 203 |
+
reward: float = Field(gt=0.0, lt=1.0)
|
| 204 |
done: bool
|
| 205 |
info: Dict[str, Any] = Field(default_factory=dict)
|