mathi3046 committed on
Commit
11e6068
·
1 Parent(s): 1a235af

fix: clamp all task scores to the strict (0, 1) range, never exactly 0.0 or 1.0
- grader.py: add _clamp() helper, apply to all sub-scores and total
- models.py: update Pydantic fields to gt/lt strict bounds
- inference.py: clamp avg_reward and final_score
Fixes Phase 2 'task scores out of range' validation error

Browse files
Files changed (3) hide show
  1. grader.py +23 -13
  2. inference.py +4 -2
  3. models.py +16 -16
grader.py CHANGED
@@ -6,7 +6,7 @@ Evaluates agent responses on three axes:
6
  - Tone (positive vs. negative signal detection)
7
  - Completeness (checklist of required response elements)
8
 
9
- Returns a RewardBreakdown with a total score in [0.0, 1.0].
10
  """
11
 
12
  import re
@@ -15,6 +15,16 @@ from typing import Any, Dict, List
15
  from models import RewardBreakdown
16
 
17
 
 
 
 
 
 
 
 
 
 
 
18
  def _normalise(text: str) -> str:
19
  """Lower-case and strip extra whitespace for matching."""
20
  return re.sub(r"\s+", " ", text.strip().lower())
@@ -293,23 +303,23 @@ def grade_response(
293
  conversation_history: Previous messages
294
 
295
  Returns:
296
- RewardBreakdown with scores in [0.0, 1.0] and explanation
297
  """
298
- # Score each axis
299
- correctness_raw = _score_correctness(
300
  response,
301
  grading_rubric.get("correctness", {}),
302
- )
303
- tone_raw = _score_tone(
304
  response,
305
  grading_rubric.get("tone", {}),
306
- )
307
- completeness_raw = _score_completeness(
308
  response,
309
  grading_rubric.get("completeness", {}),
310
  ticket_info,
311
  conversation_history,
312
- )
313
 
314
  # Get weights
315
  w_correctness = grading_rubric.get("correctness", {}).get("weight", 0.33)
@@ -319,15 +329,15 @@ def grade_response(
319
  # Compute penalties
320
  penalties = _compute_penalties(response, conversation_history)
321
 
322
- # Weighted total (before penalties)
323
- weighted = (
324
  correctness_raw * w_correctness
325
  + tone_raw * w_tone
326
  + completeness_raw * w_completeness
327
  )
328
 
329
- # Apply penalties
330
- total = max(0.01, min(0.99, weighted + penalties))
331
 
332
  # Build explanation
333
  parts = []
 
6
  - Tone (positive vs. negative signal detection)
7
  - Completeness (checklist of required response elements)
8
 
9
+ Returns a RewardBreakdown with a total score in (0.0, 1.0) — strict open interval.
10
  """
11
 
12
  import re
 
15
  from models import RewardBreakdown
16
 
17
 
18
+ # Strict open-interval clamp: scores must never be exactly 0.0 or 1.0
19
+ _SCORE_MIN = 0.01
20
+ _SCORE_MAX = 0.99
21
+
22
+
23
+ def _clamp(value: float, lo: float = _SCORE_MIN, hi: float = _SCORE_MAX) -> float:
24
+ """Clamp *value* into the strict open interval (0, 1)."""
25
+ return max(lo, min(hi, float(value)))
26
+
27
+
28
  def _normalise(text: str) -> str:
29
  """Lower-case and strip extra whitespace for matching."""
30
  return re.sub(r"\s+", " ", text.strip().lower())
 
303
  conversation_history: Previous messages
304
 
305
  Returns:
306
+ RewardBreakdown with scores in strict (0.0, 1.0) open interval
307
  """
308
+ # Score each axis and clamp to strict (0, 1)
309
+ correctness_raw = _clamp(_score_correctness(
310
  response,
311
  grading_rubric.get("correctness", {}),
312
+ ))
313
+ tone_raw = _clamp(_score_tone(
314
  response,
315
  grading_rubric.get("tone", {}),
316
+ ))
317
+ completeness_raw = _clamp(_score_completeness(
318
  response,
319
  grading_rubric.get("completeness", {}),
320
  ticket_info,
321
  conversation_history,
322
+ ))
323
 
324
  # Get weights
325
  w_correctness = grading_rubric.get("correctness", {}).get("weight", 0.33)
 
329
  # Compute penalties
330
  penalties = _compute_penalties(response, conversation_history)
331
 
332
+ # Weighted total (before penalties) — clamped
333
+ weighted = _clamp(
334
  correctness_raw * w_correctness
335
  + tone_raw * w_tone
336
  + completeness_raw * w_completeness
337
  )
338
 
339
+ # Apply penalties — clamped to strict (0, 1)
340
+ total = _clamp(weighted + penalties)
341
 
342
  # Build explanation
343
  parts = []
inference.py CHANGED
@@ -323,8 +323,9 @@ def run_task(env_client: EnvClient, task_id: str) -> Dict[str, Any]:
323
  f"done={done}"
324
  )
325
 
326
- # Compute average reward for this task
327
  avg_reward = total_reward / max(step_count, 1)
 
328
  elapsed = time.time() - start_time
329
 
330
  logger.info(
@@ -411,7 +412,8 @@ def main():
411
  )
412
  total_avg += r.get("avg_reward", 0)
413
 
414
- final_score = total_avg / len(results) if results else 0.0
 
415
  logger.info("-" * 60)
416
  logger.info(f" FINAL SCORE: {final_score:.4f} (0.0 -- 1.0)")
417
  logger.info("=" * 60)
 
323
  f"done={done}"
324
  )
325
 
326
+ # Compute average reward for this task — clamped to strict (0, 1)
327
  avg_reward = total_reward / max(step_count, 1)
328
+ avg_reward = max(0.01, min(0.99, avg_reward))
329
  elapsed = time.time() - start_time
330
 
331
  logger.info(
 
412
  )
413
  total_avg += r.get("avg_reward", 0)
414
 
415
+ final_score = total_avg / len(results) if results else 0.01
416
+ final_score = max(0.01, min(0.99, final_score)) # strict (0, 1)
417
  logger.info("-" * 60)
418
  logger.info(f" FINAL SCORE: {final_score:.4f} (0.0 -- 1.0)")
419
  logger.info("=" * 60)
models.py CHANGED
@@ -132,24 +132,24 @@ class SupportObservation(BaseModel):
132
  class RewardBreakdown(BaseModel):
133
  """Detailed breakdown of the reward score."""
134
  correctness: float = Field(
135
- default=0.0,
136
- ge=0.0, le=1.0,
137
- description="Score for factual correctness (0.0-1.0)",
138
  )
139
  tone: float = Field(
140
- default=0.0,
141
- ge=0.0, le=1.0,
142
- description="Score for professional tone (0.0-1.0)",
143
  )
144
  completeness: float = Field(
145
- default=0.0,
146
- ge=0.0, le=1.0,
147
- description="Score for response completeness (0.0-1.0)",
148
  )
149
  efficiency: float = Field(
150
- default=0.0,
151
- ge=0.0, le=1.0,
152
- description="Score for resolution efficiency (0.0-1.0)",
153
  )
154
  penalties: float = Field(
155
  default=0.0,
@@ -157,9 +157,9 @@ class RewardBreakdown(BaseModel):
157
  description="Penalty deductions (negative value)",
158
  )
159
  total: float = Field(
160
- default=0.0,
161
- ge=0.0, le=1.0,
162
- description="Overall weighted score (0.0-1.0)",
163
  )
164
  explanation: str = Field(
165
  default="",
@@ -200,6 +200,6 @@ class SupportState(BaseModel):
200
  class StepResult(BaseModel):
201
  """Result returned from step(), matching OpenEnv convention."""
202
  observation: SupportObservation
203
- reward: float = Field(ge=0.0, le=1.0)
204
  done: bool
205
  info: Dict[str, Any] = Field(default_factory=dict)
 
132
  class RewardBreakdown(BaseModel):
133
  """Detailed breakdown of the reward score."""
134
  correctness: float = Field(
135
+ default=0.01,
136
+ gt=0.0, lt=1.0,
137
+ description="Score for factual correctness — strict (0, 1)",
138
  )
139
  tone: float = Field(
140
+ default=0.01,
141
+ gt=0.0, lt=1.0,
142
+ description="Score for professional tone — strict (0, 1)",
143
  )
144
  completeness: float = Field(
145
+ default=0.01,
146
+ gt=0.0, lt=1.0,
147
+ description="Score for response completeness — strict (0, 1)",
148
  )
149
  efficiency: float = Field(
150
+ default=0.01,
151
+ gt=0.0, lt=1.0,
152
+ description="Score for resolution efficiency — strict (0, 1)",
153
  )
154
  penalties: float = Field(
155
  default=0.0,
 
157
  description="Penalty deductions (negative value)",
158
  )
159
  total: float = Field(
160
+ default=0.01,
161
+ gt=0.0, lt=1.0,
162
+ description="Overall weighted score — strict (0, 1)",
163
  )
164
  explanation: str = Field(
165
  default="",
 
200
  class StepResult(BaseModel):
201
  """Result returned from step(), matching OpenEnv convention."""
202
  observation: SupportObservation
203
+ reward: float = Field(gt=0.0, lt=1.0)
204
  done: bool
205
  info: Dict[str, Any] = Field(default_factory=dict)