Spaces:

agarwalanu3103
/

clarify-rl

Sleeping

File size: 1,794 Bytes

2414d31

"""
Per-step shaping rewards + plan parsing helpers.

The composable rubric in `server.rubrics` only fires on the terminal step
(`propose_plan`). GRPO benefits from a denser signal during exploration, so
the env emits small per-step shaping rewards on `ask_question` calls.

This module is the thin glue between raw tool-call results and reward floats.
"""

from __future__ import annotations

import json
from typing import Any, Optional


# Per-step shaping values returned by `ask_question_reward`. Magnitudes are
# small; the module docstring describes them as a denser signal alongside the
# terminal rubric.

# Returned when the question revealed a new field (and was neither over the
# cap nor a duplicate).
REWARD_REVEAL_NEW_FIELD: float = 0.05
# Fallback returned when none of the three flags is set. Note it is positive.
REWARD_NO_USEFUL_INFO: float = 0.02
# Returned when the question targets an already-asked field (and is not over
# the cap).
PENALTY_DUPLICATE_QUESTION: float = -0.02
# Returned whenever the call is over the cap; takes precedence over all other
# outcomes.
PENALTY_OVER_CAP: float = -0.05


def parse_plan(plan_str: Any) -> tuple[Optional[dict[str, Any]], Optional[str]]:
    """Coerce a raw plan payload into a dict without ever raising.

    Args:
        plan_str: The plan as received. A ``dict`` is passed through
            unchanged (same object, no copy). A ``str`` is stripped of
            surrounding whitespace and decoded as JSON. Anything else is
            rejected.

    Returns:
        A ``(plan, error)`` pair. On success ``plan`` is the dict and
        ``error`` is ``None``; on failure ``plan`` is ``None`` and ``error``
        is a short human-readable reason.
    """
    if plan_str is None:
        return None, "plan is null"
    if isinstance(plan_str, dict):
        return plan_str, None
    if not isinstance(plan_str, str):
        return None, f"plan must be a JSON string, got {type(plan_str).__name__}"
    text = plan_str.strip()
    if not text:
        return None, "plan is empty"
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as exc:
        return None, f"invalid JSON: {exc.msg}"
    except RecursionError:
        # Pathologically nested input (e.g. thousands of "[") overflows the
        # decoder's recursion limit. That is not a JSONDecodeError, so without
        # this clause it would escape and break the "never raises" contract.
        return None, "invalid JSON: nesting too deep"
    if not isinstance(parsed, dict):
        # Valid JSON, but a list/number/string/etc. rather than an object.
        return None, f"plan must be a JSON object, got {type(parsed).__name__}"
    return parsed, None


def ask_question_reward(
    *,
    over_cap: bool,
    is_duplicate_field: bool,
    revealed_new_field: bool,
) -> float:
    """Map the outcome flags of one `ask_question` call to a shaping reward.

    The flags are checked in a fixed precedence order and the first truthy
    one decides the result: over-cap, then duplicate field, then newly
    revealed field. If none is set, the "no useful info" value is returned.

    Args:
        over_cap: Whether the call exceeded the question cap.
        is_duplicate_field: Whether the question targeted an already-asked
            field.
        revealed_new_field: Whether the answer revealed a new field.

    Returns:
        One of the module-level reward/penalty constants.
    """
    # Highest-priority outcome first; order matters when several flags are set.
    precedence = (
        (over_cap, PENALTY_OVER_CAP),
        (is_duplicate_field, PENALTY_DUPLICATE_QUESTION),
        (revealed_new_field, REWARD_REVEAL_NEW_FIELD),
    )
    for flag, value in precedence:
        if flag:
            return value
    return REWARD_NO_USEFUL_INFO


# Public API of this module: the four shaping constants and the two helpers.
__all__ = [
    "REWARD_REVEAL_NEW_FIELD",
    "REWARD_NO_USEFUL_INFO",
    "PENALTY_DUPLICATE_QUESTION",
    "PENALTY_OVER_CAP",
    "parse_plan",
    "ask_question_reward",
]