Spaces:
Sleeping
Sleeping
Commit ·
ea504bf
1
Parent(s): dcaf698
changed
Browse files
- baseline.py +7 -7
- env/models.py +5 -0
baseline.py
CHANGED
|
@@ -187,13 +187,10 @@ def run_baseline() -> BaselineResponse:
|
|
| 187 |
Returns BaselineResponse with scores for all 3 tasks.
|
| 188 |
Must complete within 60 seconds.
|
| 189 |
"""
|
| 190 |
-
# Check API key exists (even if rule-based agent doesn't use it,
|
| 191 |
-
# the spec requires it to be validated)
|
| 192 |
try:
|
| 193 |
_check_api_key()
|
| 194 |
except ValueError as e:
|
| 195 |
print(f"Warning: {e}")
|
| 196 |
-
# Continue with rule-based agent anyway for demo
|
| 197 |
|
| 198 |
results = []
|
| 199 |
difficulties = [
|
|
@@ -212,25 +209,28 @@ def run_baseline() -> BaselineResponse:
|
|
| 212 |
score, steps, feedback = _rule_based_agent(env, task_context)
|
| 213 |
elapsed = time.time() - start
|
| 214 |
|
|
|
|
|
|
|
|
|
|
| 215 |
results.append(BaselineResult(
|
| 216 |
task_id = task_id,
|
| 217 |
difficulty = difficulty,
|
| 218 |
-
score = score,
|
| 219 |
steps = steps,
|
| 220 |
feedback = f"{feedback} (elapsed: {elapsed:.2f}s)"
|
| 221 |
))
|
| 222 |
-
print(f"Baseline {difficulty.value}: score={score}, steps={steps}")
|
| 223 |
|
| 224 |
except Exception as e:
|
| 225 |
results.append(BaselineResult(
|
| 226 |
task_id = task_id,
|
| 227 |
difficulty = difficulty,
|
| 228 |
-
score = 0.0,
|
| 229 |
steps = 0,
|
| 230 |
feedback = f"Error: {str(e)}"
|
| 231 |
))
|
| 232 |
|
| 233 |
-
avg = round(sum(r.score for r in results) / len(results), 4) if results else 0.0
|
| 234 |
print(f"Baseline average score: {avg}")
|
| 235 |
|
| 236 |
return BaselineResponse(results=results, average_score=avg)
|
|
|
|
| 187 |
Returns BaselineResponse with scores for all 3 tasks.
|
| 188 |
Must complete within 60 seconds.
|
| 189 |
"""
|
|
|
|
|
|
|
| 190 |
try:
|
| 191 |
_check_api_key()
|
| 192 |
except ValueError as e:
|
| 193 |
print(f"Warning: {e}")
|
|
|
|
| 194 |
|
| 195 |
results = []
|
| 196 |
difficulties = [
|
|
|
|
| 209 |
score, steps, feedback = _rule_based_agent(env, task_context)
|
| 210 |
elapsed = time.time() - start
|
| 211 |
|
| 212 |
+
# FIX 1: clamp score strictly between 0 and 1 exclusive
|
| 213 |
+
safe_score = round(max(0.001, min(0.999, float(score))), 4)
|
| 214 |
+
|
| 215 |
results.append(BaselineResult(
|
| 216 |
task_id = task_id,
|
| 217 |
difficulty = difficulty,
|
| 218 |
+
score = safe_score,
|
| 219 |
steps = steps,
|
| 220 |
feedback = f"{feedback} (elapsed: {elapsed:.2f}s)"
|
| 221 |
))
|
| 222 |
+
print(f"Baseline {difficulty.value}: score={safe_score}, steps={steps}")
|
| 223 |
|
| 224 |
except Exception as e:
|
| 225 |
results.append(BaselineResult(
|
| 226 |
task_id = task_id,
|
| 227 |
difficulty = difficulty,
|
| 228 |
+
score = 0.001, # FIX 2: was 0.0, which is an invalid boundary value
|
| 229 |
steps = 0,
|
| 230 |
feedback = f"Error: {str(e)}"
|
| 231 |
))
|
| 232 |
|
| 233 |
+
avg = round(sum(r.score for r in results) / len(results), 4) if results else 0.5
|
| 234 |
print(f"Baseline average score: {avg}")
|
| 235 |
|
| 236 |
return BaselineResponse(results=results, average_score=avg)
|
env/models.py
CHANGED
|
@@ -174,6 +174,11 @@ class BaselineResult(BaseModel):
|
|
| 174 |
steps: int
|
| 175 |
feedback: str
|
| 176 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
class BaselineResponse(BaseModel):
|
| 178 |
results: list[BaselineResult]
|
| 179 |
average_score: float
|
|
|
|
| 174 |
steps: int
|
| 175 |
feedback: str
|
| 176 |
|
| 177 |
+
@field_validator("score")
|
| 178 |
+
@classmethod
|
| 179 |
+
def clamp_score(cls, v):
|
| 180 |
+
return max(0.001, min(0.999, round(float(v), 4)))
|
| 181 |
+
|
| 182 |
class BaselineResponse(BaseModel):
|
| 183 |
results: list[BaselineResult]
|
| 184 |
average_score: float
|