Spaces:

mathi3046
/

customer-support-env

Sleeping

App Files Files Community

mathi3046 committed on Apr 7

Commit

11e6068

1 Parent(s): 1a235af

fix: clamp all task scores to strict (0,1) range - never 0.0 or 1.0
- grader.py: add _clamp() helper, apply to all sub-scores and total
- models.py: update Pydantic fields to gt/lt strict bounds
- inference.py: clamp avg_reward and final_score
Fixes Phase 2 'task scores out of range' validation error

Browse files

Files changed (3) hide show

grader.py +23 -13
inference.py +4 -2
models.py +16 -16

grader.py CHANGED Viewed

@@ -6,7 +6,7 @@ Evaluates agent responses on three axes:
   - Tone         (positive vs. negative signal detection)
   - Completeness (checklist of required response elements)
-Returns a RewardBreakdown with a total score in [0.0, 1.0].
 """
 import re
@@ -15,6 +15,16 @@ from typing import Any, Dict, List
 from models import RewardBreakdown
 def _normalise(text: str) -> str:
     """Lower-case and strip extra whitespace for matching."""
     return re.sub(r"\s+", " ", text.strip().lower())
@@ -293,23 +303,23 @@ def grade_response(
         conversation_history: Previous messages
     Returns:
-        RewardBreakdown with scores in [0.0, 1.0] and explanation
     """
-    # Score each axis
-    correctness_raw = _score_correctness(
         response,
         grading_rubric.get("correctness", {}),
-    )
-    tone_raw = _score_tone(
         response,
         grading_rubric.get("tone", {}),
-    )
-    completeness_raw = _score_completeness(
         response,
         grading_rubric.get("completeness", {}),
         ticket_info,
         conversation_history,
-    )
     # Get weights
     w_correctness = grading_rubric.get("correctness", {}).get("weight", 0.33)
@@ -319,15 +329,15 @@ def grade_response(
     # Compute penalties
     penalties = _compute_penalties(response, conversation_history)
-    # Weighted total (before penalties)
-    weighted = (
         correctness_raw * w_correctness
         + tone_raw * w_tone
         + completeness_raw * w_completeness
     )
-    # Apply penalties
-    total = max(0.01, min(0.99, weighted + penalties))
     # Build explanation
     parts = []

   - Tone         (positive vs. negative signal detection)
   - Completeness (checklist of required response elements)
+Returns a RewardBreakdown with a total score in (0.0, 1.0) — strict open interval.
 """
 import re
 from models import RewardBreakdown
+# Strict open-interval clamp: scores must never be exactly 0.0 or 1.0
+_SCORE_MIN = 0.01
+_SCORE_MAX = 0.99
+def _clamp(value: float, lo: float = _SCORE_MIN, hi: float = _SCORE_MAX) -> float:
+    """Clamp *value* into the strict open interval (0, 1)."""
+    return max(lo, min(hi, float(value)))
 def _normalise(text: str) -> str:
     """Lower-case and strip extra whitespace for matching."""
     return re.sub(r"\s+", " ", text.strip().lower())
         conversation_history: Previous messages
     Returns:
+        RewardBreakdown with scores in strict (0.0, 1.0) open interval
     """
+    # Score each axis and clamp to strict (0, 1)
+    correctness_raw = _clamp(_score_correctness(
         response,
         grading_rubric.get("correctness", {}),
+    ))
+    tone_raw = _clamp(_score_tone(
         response,
         grading_rubric.get("tone", {}),
+    ))
+    completeness_raw = _clamp(_score_completeness(
         response,
         grading_rubric.get("completeness", {}),
         ticket_info,
         conversation_history,
+    ))
     # Get weights
     w_correctness = grading_rubric.get("correctness", {}).get("weight", 0.33)
     # Compute penalties
     penalties = _compute_penalties(response, conversation_history)
+    # Weighted total (before penalties) — clamped
+    weighted = _clamp(
         correctness_raw * w_correctness
         + tone_raw * w_tone
         + completeness_raw * w_completeness
     )
+    # Apply penalties — clamped to strict (0, 1)
+    total = _clamp(weighted + penalties)
     # Build explanation
     parts = []

inference.py CHANGED Viewed

@@ -323,8 +323,9 @@ def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
             f"done={done}"
         )
-    # Compute average reward for this task
     avg_reward = total_reward / max(step_count, 1)
     elapsed = time.time() - start_time
     logger.info(
@@ -411,7 +412,8 @@ def main():
         )
         total_avg += r.get("avg_reward", 0)
-    final_score = total_avg / len(results) if results else 0.0
     logger.info("-" * 60)
     logger.info(f"  FINAL SCORE: {final_score:.4f} (0.0 -- 1.0)")
     logger.info("=" * 60)

             f"done={done}"
         )
+    # Compute average reward for this task — clamped to strict (0, 1)
     avg_reward = total_reward / max(step_count, 1)
+    avg_reward = max(0.01, min(0.99, avg_reward))
     elapsed = time.time() - start_time
     logger.info(
         )
         total_avg += r.get("avg_reward", 0)
+    final_score = total_avg / len(results) if results else 0.01
+    final_score = max(0.01, min(0.99, final_score))  # strict (0, 1)
     logger.info("-" * 60)
     logger.info(f"  FINAL SCORE: {final_score:.4f} (0.0 -- 1.0)")
     logger.info("=" * 60)

models.py CHANGED Viewed

@@ -132,24 +132,24 @@ class SupportObservation(BaseModel):
 class RewardBreakdown(BaseModel):
     """Detailed breakdown of the reward score."""
     correctness: float = Field(
-        default=0.0,
-        ge=0.0, le=1.0,
-        description="Score for factual correctness (0.0-1.0)",
     )
     tone: float = Field(
-        default=0.0,
-        ge=0.0, le=1.0,
-        description="Score for professional tone (0.0-1.0)",
     )
     completeness: float = Field(
-        default=0.0,
-        ge=0.0, le=1.0,
-        description="Score for response completeness (0.0-1.0)",
     )
     efficiency: float = Field(
-        default=0.0,
-        ge=0.0, le=1.0,
-        description="Score for resolution efficiency (0.0-1.0)",
     )
     penalties: float = Field(
         default=0.0,
@@ -157,9 +157,9 @@ class RewardBreakdown(BaseModel):
         description="Penalty deductions (negative value)",
     )
     total: float = Field(
-        default=0.0,
-        ge=0.0, le=1.0,
-        description="Overall weighted score (0.0-1.0)",
     )
     explanation: str = Field(
         default="",
@@ -200,6 +200,6 @@ class SupportState(BaseModel):
 class StepResult(BaseModel):
     """Result returned from step(), matching OpenEnv convention."""
     observation: SupportObservation
-    reward: float = Field(ge=0.0, le=1.0)
     done: bool
     info: Dict[str, Any] = Field(default_factory=dict)

 class RewardBreakdown(BaseModel):
     """Detailed breakdown of the reward score."""
     correctness: float = Field(
+        default=0.01,
+        gt=0.0, lt=1.0,
+        description="Score for factual correctness — strict (0, 1)",
     )
     tone: float = Field(
+        default=0.01,
+        gt=0.0, lt=1.0,
+        description="Score for professional tone — strict (0, 1)",
     )
     completeness: float = Field(
+        default=0.01,
+        gt=0.0, lt=1.0,
+        description="Score for response completeness — strict (0, 1)",
     )
     efficiency: float = Field(
+        default=0.01,
+        gt=0.0, lt=1.0,
+        description="Score for resolution efficiency — strict (0, 1)",
     )
     penalties: float = Field(
         default=0.0,
         description="Penalty deductions (negative value)",
     )
     total: float = Field(
+        default=0.01,
+        gt=0.0, lt=1.0,
+        description="Overall weighted score — strict (0, 1)",
     )
     explanation: str = Field(
         default="",
 class StepResult(BaseModel):
     """Result returned from step(), matching OpenEnv convention."""
     observation: SupportObservation
+    reward: float = Field(gt=0.0, lt=1.0)
     done: bool
     info: Dict[str, Any] = Field(default_factory=dict)