# clarify-rl / server / grader.py
# Anurag Agarwal
# ClarifyRL: initial HF Space deploy
# 2414d31
"""
Per-step shaping rewards + plan parsing helpers.
The composable rubric in `server.rubrics` only fires on the terminal step
(`propose_plan`). GRPO benefits from a denser signal during exploration, so
the env emits small per-step shaping rewards on `ask_question` calls.
This module is the thin glue between raw tool-call results and reward floats.
"""
from __future__ import annotations
import json
from typing import Any, Optional
# Per-step shaping values returned by `ask_question_reward` (defined later in
# this module). Exactly one of them is returned per call.

# Returned when the question revealed a new field.
REWARD_REVEAL_NEW_FIELD: float = 0.05
# Fallback: returned when none of the three flags is set.
REWARD_NO_USEFUL_INFO: float = 0.02
# Returned when the question targets a duplicate field.
PENALTY_DUPLICATE_QUESTION: float = -0.02
# Returned when the call is flagged as over the cap; takes precedence over
# every other outcome.
PENALTY_OVER_CAP: float = -0.05
def parse_plan(plan_str: Any) -> tuple[Optional[dict[str, Any]], Optional[str]]:
    """Decode a proposed plan into a dict, reporting failures as a message.

    Problems with the input are returned as an error string rather than
    raised, so the caller can turn a malformed plan into a reward signal.

    Args:
        plan_str: The raw plan. A ``dict`` is passed through unchanged, a
            ``str`` is stripped and decoded as JSON, and anything else
            (including ``None``) is rejected.

    Returns:
        ``(plan, None)`` when a JSON object was obtained, otherwise
        ``(None, reason)`` where ``reason`` is a short human-readable
        description of what was wrong.
    """
    if plan_str is None:
        return None, "plan is null"
    if isinstance(plan_str, dict):
        # Already decoded by the caller; nothing to parse.
        return plan_str, None
    if not isinstance(plan_str, str):
        return None, f"plan must be a JSON string, got {type(plan_str).__name__}"

    text = plan_str.strip()
    if not text:
        return None, "plan is empty"

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as exc:
        return None, f"invalid JSON: {exc.msg}"
    except RecursionError:
        # Pathologically nested arrays/objects exhaust the decoder's recursion
        # budget; treat that as a malformed plan instead of crashing.
        return None, "invalid JSON: nesting too deep"
    except ValueError as exc:
        # json.loads can raise a plain ValueError that is not a
        # JSONDecodeError, e.g. for an integer literal longer than the
        # interpreter's int/str conversion limit (Python 3.11+).
        return None, f"invalid JSON: {exc}"

    if not isinstance(parsed, dict):
        return None, f"plan must be a JSON object, got {type(parsed).__name__}"
    return parsed, None
def ask_question_reward(
    *,
    over_cap: bool,
    is_duplicate_field: bool,
    revealed_new_field: bool,
) -> float:
    """Map the outcome flags of one question to its shaping reward.

    The flags are checked in a fixed order and the first truthy one decides
    the result: ``over_cap``, then ``is_duplicate_field``, then
    ``revealed_new_field``.

    Args:
        over_cap: When truthy, ``PENALTY_OVER_CAP`` is returned.
        is_duplicate_field: When truthy (and ``over_cap`` is not),
            ``PENALTY_DUPLICATE_QUESTION`` is returned.
        revealed_new_field: When truthy (and neither flag above is),
            ``REWARD_REVEAL_NEW_FIELD`` is returned.

    Returns:
        The matching reward constant, or ``REWARD_NO_USEFUL_INFO`` when no
        flag is set.
    """
    # Highest-precedence outcome first.
    precedence = (
        (over_cap, PENALTY_OVER_CAP),
        (is_duplicate_field, PENALTY_DUPLICATE_QUESTION),
        (revealed_new_field, REWARD_REVEAL_NEW_FIELD),
    )
    return next(
        (value for flag, value in precedence if flag),
        REWARD_NO_USEFUL_INFO,
    )
# Names exported by `from ... import *`: the four shaping-reward constants
# followed by the two helper functions defined in this module.
__all__ = [
"REWARD_REVEAL_NEW_FIELD",
"REWARD_NO_USEFUL_INFO",
"PENALTY_DUPLICATE_QUESTION",
"PENALTY_OVER_CAP",
"parse_plan",
"ask_question_reward",
]