Spaces:
Running
Running
"""
Per-step shaping rewards + plan parsing helpers.

The composable rubric in `server.rubrics` only fires on the terminal step
(`propose_plan`). GRPO benefits from a denser signal during exploration, so
the env emits small per-step shaping rewards on `ask_question` calls.
This module is the thin glue between raw tool-call results and reward floats.
"""
| from __future__ import annotations | |
| import json | |
| from typing import Any, Optional | |
# Shaping values handed back by `ask_question_reward`, one per outcome.

# Negative outcomes.
PENALTY_OVER_CAP: float = -0.05  # returned when `over_cap` is set
PENALTY_DUPLICATE_QUESTION: float = -0.02  # returned when `is_duplicate_field` is set

# Non-negative outcomes.
REWARD_REVEAL_NEW_FIELD: float = 0.05  # returned when `revealed_new_field` is set
REWARD_NO_USEFUL_INFO: float = 0.02  # fallback when none of the flags is set
def parse_plan(plan_str: Any) -> tuple[Optional[dict[str, Any]], Optional[str]]:
    """Turn a raw plan payload into a dict, reporting problems as text.

    Failures are returned as an error string rather than raised, so callers
    can branch on the second element of the result.

    Args:
        plan_str: The plan as received. A ``dict`` is passed through
            unchanged, a ``str`` is stripped and decoded as JSON, and any
            other type is rejected.

    Returns:
        A ``(plan, error)`` pair. On success ``plan`` is the dict and
        ``error`` is ``None``; on failure ``plan`` is ``None`` and ``error``
        is a short human-readable reason.
    """
    if plan_str is None:
        return None, "plan is null"
    if isinstance(plan_str, dict):
        # Already a mapping: handed back as-is (same object, not a copy).
        return plan_str, None
    if not isinstance(plan_str, str):
        return None, f"plan must be a JSON string, got {type(plan_str).__name__}"

    text = plan_str.strip()
    if not text:
        return None, "plan is empty"

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as exc:
        # Only the decoder's message is surfaced, not the line/column detail.
        return None, f"invalid JSON: {exc.msg}"
    except RecursionError:
        # Extremely deep nesting (e.g. thousands of "[") exhausts the
        # decoder's recursion budget. Report it like any other malformed
        # plan instead of letting the exception escape to the caller.
        return None, "invalid JSON: nesting too deep"

    # Valid JSON that is not an object (list, number, string, ...) is not a plan.
    if not isinstance(parsed, dict):
        return None, f"plan must be a JSON object, got {type(parsed).__name__}"
    return parsed, None
def ask_question_reward(
    *,
    over_cap: bool,
    is_duplicate_field: bool,
    revealed_new_field: bool,
) -> float:
    """Map the outcome flags of one `ask_question` call to a shaping value.

    All arguments are keyword-only. The flags are checked in a fixed order
    and the first one that is set decides the result:

    1. ``over_cap`` -> ``PENALTY_OVER_CAP``
    2. ``is_duplicate_field`` -> ``PENALTY_DUPLICATE_QUESTION``
    3. ``revealed_new_field`` -> ``REWARD_REVEAL_NEW_FIELD``

    If none is set, ``REWARD_NO_USEFUL_INFO`` is returned.
    """
    # Ordered (flag, value) pairs; earlier entries take precedence.
    outcomes = (
        (over_cap, PENALTY_OVER_CAP),
        (is_duplicate_field, PENALTY_DUPLICATE_QUESTION),
        (revealed_new_field, REWARD_REVEAL_NEW_FIELD),
    )
    for triggered, value in outcomes:
        if triggered:
            return value
    return REWARD_NO_USEFUL_INFO
# Names exported by `from <module> import *`.
__all__ = [
    # Shaping reward / penalty values.
    "REWARD_REVEAL_NEW_FIELD",
    "REWARD_NO_USEFUL_INFO",
    "PENALTY_DUPLICATE_QUESTION",
    "PENALTY_OVER_CAP",
    # Helper functions.
    "parse_plan",
    "ask_question_reward",
]