Spaces:
Sleeping
Sleeping
Commit ·
38ee4ab
1
Parent(s): d7b24b0
fixed scoring and logging for final submission
Browse files
- helpers/logging.py +23 -9
- inference.py +8 -7
- openenv.yaml +13 -0
- tasks/task_easy.py +2 -1
- tasks/task_hard.py +1 -1
- tasks/task_hard_2.py +1 -1
- tasks/task_hard_3.py +1 -1
- tasks/task_medium.py +1 -1
- tasks/task_medium_2.py +1 -1
helpers/logging.py
CHANGED
|
@@ -1,19 +1,33 @@
|
|
| 1 |
-
from typing import List, Optional
|
| 2 |
|
| 3 |
|
| 4 |
def log_start(task: str, env: str, model: str) -> None:
|
|
|
|
| 5 |
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 6 |
|
| 7 |
|
| 8 |
-
def log_step(step: int, action:
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
done_val = str(done).lower()
|
| 11 |
-
print(
|
| 12 |
-
|
| 13 |
-
flush=True,
|
| 14 |
-
)
|
| 15 |
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 19 |
-
print(f"[END] success={str(success).lower()} steps={steps}
|
|
|
|
| 1 |
+
from typing import List, Optional
|
| 2 |
|
| 3 |
|
| 4 |
def log_start(task: str, env: str, model: str) -> None:
|
| 5 |
+
"""Emit the [START] line at episode begin."""
|
| 6 |
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 7 |
|
| 8 |
|
| 9 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
|
| 10 |
+
"""Emit one [STEP] line immediately after env.step() returns.
|
| 11 |
+
|
| 12 |
+
Args:
|
| 13 |
+
step: 1-based step number.
|
| 14 |
+
action: Compact single-line action label (e.g. 'execute_code').
|
| 15 |
+
reward: Step reward, formatted to 2 decimal places.
|
| 16 |
+
done: Whether the episode ended after this step.
|
| 17 |
+
error: Raw error string from the env, or None.
|
| 18 |
+
"""
|
| 19 |
+
error_val = error.replace("\n", " ") if error else "null"
|
| 20 |
done_val = str(done).lower()
|
| 21 |
+
print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}", flush=True)
|
| 22 |
+
|
|
|
|
|
|
|
| 23 |
|
| 24 |
+
def log_end(success: bool, steps: int, rewards: List[float]) -> None:
|
| 25 |
+
"""Emit the [END] line after env.close(), always emitted even on exception.
|
| 26 |
|
| 27 |
+
Args:
|
| 28 |
+
success: Whether the episode was successful.
|
| 29 |
+
steps: Total number of steps taken.
|
| 30 |
+
rewards: List of per-step rewards, each formatted to 2 decimal places.
|
| 31 |
+
"""
|
| 32 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 33 |
+
print(f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}", flush=True)
|
inference.py
CHANGED
|
@@ -76,14 +76,14 @@ def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
|
|
| 76 |
done = exec_result.done
|
| 77 |
except Exception as exc:
|
| 78 |
print(f"[DEBUG] env step failed: {exc}", flush=True)
|
| 79 |
-
log_step(step=step + 1, action=
|
| 80 |
rewards.append(0.0)
|
| 81 |
continue
|
| 82 |
|
| 83 |
rewards.append(reward)
|
| 84 |
error = exec_obs.error if not exec_obs.success else None
|
| 85 |
result_text = f"Output: {exec_obs.output}" if not exec_obs.error else f"Error: {exec_obs.error}"
|
| 86 |
-
log_step(step=step + 1, action=
|
| 87 |
|
| 88 |
messages.append({"role": "assistant", "content": response_text})
|
| 89 |
messages.append({"role": "user", "content": [{"type": "text", "text": result_text}]})
|
|
@@ -97,13 +97,14 @@ def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
|
|
| 97 |
score = float(submit_obs.metadata.get("score", 0.0) if submit_obs.metadata else submit_result.reward)
|
| 98 |
except Exception as exc:
|
| 99 |
print(f"[DEBUG] env step failed: {exc}", flush=True)
|
| 100 |
-
log_step(step=step + 1, action=
|
| 101 |
-
log_end(success=False, steps=step + 1,
|
| 102 |
return 0.0
|
| 103 |
|
|
|
|
| 104 |
rewards.append(score)
|
| 105 |
-
log_step(step=step + 1, action=
|
| 106 |
-
log_end(success=score > 0.
|
| 107 |
return score
|
| 108 |
|
| 109 |
else:
|
|
@@ -127,7 +128,7 @@ def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
|
|
| 127 |
}
|
| 128 |
)
|
| 129 |
|
| 130 |
-
log_end(success=False, steps=MAX_STEPS,
|
| 131 |
return 0.0
|
| 132 |
|
| 133 |
|
|
|
|
| 76 |
done = exec_result.done
|
| 77 |
except Exception as exc:
|
| 78 |
print(f"[DEBUG] env step failed: {exc}", flush=True)
|
| 79 |
+
log_step(step=step + 1, action=action_type, reward=0.0, done=False, error=str(exc))
|
| 80 |
rewards.append(0.0)
|
| 81 |
continue
|
| 82 |
|
| 83 |
rewards.append(reward)
|
| 84 |
error = exec_obs.error if not exec_obs.success else None
|
| 85 |
result_text = f"Output: {exec_obs.output}" if not exec_obs.error else f"Error: {exec_obs.error}"
|
| 86 |
+
log_step(step=step + 1, action=action_type, reward=reward, done=done, error=error)
|
| 87 |
|
| 88 |
messages.append({"role": "assistant", "content": response_text})
|
| 89 |
messages.append({"role": "user", "content": [{"type": "text", "text": result_text}]})
|
|
|
|
| 97 |
score = float(submit_obs.metadata.get("score", 0.0) if submit_obs.metadata else submit_result.reward)
|
| 98 |
except Exception as exc:
|
| 99 |
print(f"[DEBUG] env step failed: {exc}", flush=True)
|
| 100 |
+
log_step(step=step + 1, action=action_type, reward=0.0, done=True, error=str(exc))
|
| 101 |
+
log_end(success=False, steps=step + 1, rewards=rewards)
|
| 102 |
return 0.0
|
| 103 |
|
| 104 |
+
score = max(0.01, min(0.99, score))
|
| 105 |
rewards.append(score)
|
| 106 |
+
log_step(step=step + 1, action=action_type, reward=score, done=True, error=None)
|
| 107 |
+
log_end(success=score > 0.01, steps=step + 1, rewards=rewards)
|
| 108 |
return score
|
| 109 |
|
| 110 |
else:
|
|
|
|
| 128 |
}
|
| 129 |
)
|
| 130 |
|
| 131 |
+
log_end(success=False, steps=MAX_STEPS, rewards=rewards)
|
| 132 |
return 0.0
|
| 133 |
|
| 134 |
|
openenv.yaml
CHANGED
|
@@ -6,3 +6,16 @@ type: space
|
|
| 6 |
runtime: fastapi
|
| 7 |
app: server.app:app
|
| 8 |
port: 8000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
runtime: fastapi
|
| 7 |
app: server.app:app
|
| 8 |
port: 8000
|
| 9 |
+
tasks:
|
| 10 |
+
- id: task_easy
|
| 11 |
+
grader: tasks.task_easy:TopRevenueCategoryTask
|
| 12 |
+
- id: task_medium
|
| 13 |
+
grader: tasks.task_medium:CityRevenueShareTask
|
| 14 |
+
- id: task_medium_2
|
| 15 |
+
grader: tasks.task_medium_2:MonthlyRevenueRatioTask
|
| 16 |
+
- id: task_hard
|
| 17 |
+
grader: tasks.task_hard:RepeatCustomerCohortTask
|
| 18 |
+
- id: task_hard_2
|
| 19 |
+
grader: tasks.task_hard_2:CustomerLoyaltyRevenueTask
|
| 20 |
+
- id: task_hard_3
|
| 21 |
+
grader: tasks.task_hard_3:SupplierProfitabilityTask
|
tasks/task_easy.py
CHANGED
|
@@ -49,4 +49,5 @@ class TopRevenueCategoryTask(BaseTask):
|
|
| 49 |
"""
|
| 50 |
expected = self.expected_answer().strip().lower()
|
| 51 |
submitted = answer.strip().lower()
|
| 52 |
-
|
|
|
|
|
|
| 49 |
"""
|
| 50 |
expected = self.expected_answer().strip().lower()
|
| 51 |
submitted = answer.strip().lower()
|
| 52 |
+
raw = 1.0 if expected in submitted else 0.0
|
| 53 |
+
return max(0.01, min(0.99, raw))
|
tasks/task_hard.py
CHANGED
|
@@ -100,4 +100,4 @@ class RepeatCustomerCohortTask(BaseTask):
|
|
| 100 |
except ValueError:
|
| 101 |
pass
|
| 102 |
|
| 103 |
-
return score
|
|
|
|
| 100 |
except ValueError:
|
| 101 |
pass
|
| 102 |
|
| 103 |
+
return max(0.01, min(0.99, score))
|
tasks/task_hard_2.py
CHANGED
|
@@ -100,4 +100,4 @@ class CustomerLoyaltyRevenueTask(BaseTask):
|
|
| 100 |
except ValueError:
|
| 101 |
pass
|
| 102 |
|
| 103 |
-
return score
|
|
|
|
| 100 |
except ValueError:
|
| 101 |
pass
|
| 102 |
|
| 103 |
+
return max(0.01, min(0.99, score))
|
tasks/task_hard_3.py
CHANGED
|
@@ -104,4 +104,4 @@ class SupplierProfitabilityTask(BaseTask):
|
|
| 104 |
except ValueError:
|
| 105 |
pass
|
| 106 |
|
| 107 |
-
return score
|
|
|
|
| 104 |
except ValueError:
|
| 105 |
pass
|
| 106 |
|
| 107 |
+
return max(0.01, min(0.99, score))
|
tasks/task_medium.py
CHANGED
|
@@ -74,4 +74,4 @@ class CityRevenueShareTask(BaseTask):
|
|
| 74 |
except ValueError:
|
| 75 |
pass
|
| 76 |
|
| 77 |
-
return score
|
|
|
|
| 74 |
except ValueError:
|
| 75 |
pass
|
| 76 |
|
| 77 |
+
return max(0.01, min(0.99, score))
|
tasks/task_medium_2.py
CHANGED
|
@@ -85,4 +85,4 @@ class MonthlyRevenueRatioTask(BaseTask):
|
|
| 85 |
except ValueError:
|
| 86 |
pass
|
| 87 |
|
| 88 |
-
return score
|
|
|
|
| 85 |
except ValueError:
|
| 86 |
pass
|
| 87 |
|
| 88 |
+
return max(0.01, min(0.99, score))
|