# Source: sql-query-optimizer/env/reward.py
# Commit e4c32ce (verified), "Upload folder using huggingface_hub", by jaivardhan2409.
from typing import Dict
def compute_reward(grader_score: float, previous_score: float, step_number: int, max_steps: int, is_done: bool, action_valid: bool) -> float:
    """Compute the shaped reward for a single step.

    The reward is assembled from the following components:

    - Invalid action: if ``action_valid`` is false, ``-0.1`` is returned
      immediately. This value is NOT clamped and no other component is
      evaluated.
    - Partial credit: half of the positive improvement of ``grader_score``
      over ``previous_score``; a regression contributes nothing.
    - Completion bonus: ``+0.5`` when ``is_done`` is true and
      ``grader_score >= 0.8``.
    - Step penalty: ``-0.02`` for every step by which ``step_number``
      exceeds ``max_steps``; no penalty at or below ``max_steps``.

    For valid actions the sum is clamped to ``[0.0, 1.0]``.

    Args:
        grader_score: Score assigned to the current step.
        previous_score: Score the current one is compared against.
        step_number: Index of the current step.
        max_steps: Step count beyond which the per-step penalty applies.
        is_done: Whether the episode has finished on this step.
        action_valid: Whether the submitted action was usable.

    Returns:
        ``-0.1`` for an invalid action, otherwise a value in ``[0.0, 1.0]``.
    """
    # Invalid actions short-circuit: the penalty bypasses the final clamp.
    if not action_valid:
        return -0.1

    # Only forward progress earns credit; regressions count as zero.
    improvement = max(0.0, grader_score - previous_score)
    reward = improvement * 0.5

    if is_done and grader_score >= 0.8:
        reward += 0.5

    if step_number > max_steps:
        reward -= 0.02 * (step_number - max_steps)

    return max(0.0, min(1.0, reward))