junaid0600 committed on
Commit
ea504bf
·
1 Parent(s): dcaf698
Files changed (2) hide show
  1. baseline.py +7 -7
  2. env/models.py +5 -0
baseline.py CHANGED
@@ -187,13 +187,10 @@ def run_baseline() -> BaselineResponse:
187
  Returns BaselineResponse with scores for all 3 tasks.
188
  Must complete within 60 seconds.
189
  """
190
- # Check API key exists (even if rule-based agent doesn't use it,
191
- # the spec requires it to be validated)
192
  try:
193
  _check_api_key()
194
  except ValueError as e:
195
  print(f"Warning: {e}")
196
- # Continue with rule-based agent anyway for demo
197
 
198
  results = []
199
  difficulties = [
@@ -212,25 +209,28 @@ def run_baseline() -> BaselineResponse:
212
  score, steps, feedback = _rule_based_agent(env, task_context)
213
  elapsed = time.time() - start
214
 
 
 
 
215
  results.append(BaselineResult(
216
  task_id = task_id,
217
  difficulty = difficulty,
218
- score = round(score, 4),
219
  steps = steps,
220
  feedback = f"{feedback} (elapsed: {elapsed:.2f}s)"
221
  ))
222
- print(f"Baseline {difficulty.value}: score={round(score,4)}, steps={steps}")
223
 
224
  except Exception as e:
225
  results.append(BaselineResult(
226
  task_id = task_id,
227
  difficulty = difficulty,
228
- score = 0.0,
229
  steps = 0,
230
  feedback = f"Error: {str(e)}"
231
  ))
232
 
233
- avg = round(sum(r.score for r in results) / len(results), 4) if results else 0.0
234
  print(f"Baseline average score: {avg}")
235
 
236
  return BaselineResponse(results=results, average_score=avg)
 
187
  Returns BaselineResponse with scores for all 3 tasks.
188
  Must complete within 60 seconds.
189
  """
 
 
190
  try:
191
  _check_api_key()
192
  except ValueError as e:
193
  print(f"Warning: {e}")
 
194
 
195
  results = []
196
  difficulties = [
 
209
  score, steps, feedback = _rule_based_agent(env, task_context)
210
  elapsed = time.time() - start
211
 
212
+ # FIX 1: clamp score strictly between 0 and 1 exclusive
213
+ safe_score = round(max(0.001, min(0.999, float(score))), 4)
214
+
215
  results.append(BaselineResult(
216
  task_id = task_id,
217
  difficulty = difficulty,
218
+ score = safe_score,
219
  steps = steps,
220
  feedback = f"{feedback} (elapsed: {elapsed:.2f}s)"
221
  ))
222
+ print(f"Baseline {difficulty.value}: score={safe_score}, steps={steps}")
223
 
224
  except Exception as e:
225
  results.append(BaselineResult(
226
  task_id = task_id,
227
  difficulty = difficulty,
228
+ score = 0.001, # FIX 2: was 0.0, which is an invalid boundary value
229
  steps = 0,
230
  feedback = f"Error: {str(e)}"
231
  ))
232
 
233
+ avg = round(sum(r.score for r in results) / len(results), 4) if results else 0.5
234
  print(f"Baseline average score: {avg}")
235
 
236
  return BaselineResponse(results=results, average_score=avg)
env/models.py CHANGED
@@ -174,6 +174,11 @@ class BaselineResult(BaseModel):
174
  steps: int
175
  feedback: str
176
 
 
 
 
 
 
177
  class BaselineResponse(BaseModel):
178
  results: list[BaselineResult]
179
  average_score: float
 
174
  steps: int
175
  feedback: str
176
 
177
+ @field_validator("score")
178
+ @classmethod
179
+ def clamp_score(cls, v):
180
+ return max(0.001, min(0.999, round(float(v), 4)))
181
+
182
  class BaselineResponse(BaseModel):
183
  results: list[BaselineResult]
184
  average_score: float