from environment.models import Issue, Action, Reward
from environment.graders import compute_f1
from typing import List


def compute_reward(
    action: Action, ground_truth: List[Issue], step_count: int, max_steps: int
) -> Reward:
    """Compute the dense shaped reward for an issue-finding agent step.

    Reward shaping:
    - +0.2 per correctly identified issue (true positive)
    - -0.1 per false positive
    - -0.05 * step_count step penalty (encourage efficiency)
    - +0.5 bonus if all issues found and ``action.final`` is True
    - If a final answer is submitted without all issues found, the
      reward collapses to the F1 score instead of the shaped sum.

    Args:
        action: The agent's action; ``action.issues`` holds the issues
            reported so far, ``action.final`` marks a final submission.
        ground_truth: Reference list of true issues.
        step_count: Steps taken so far; scales the step penalty and
            appears in the progress message.
        max_steps: Currently unused; kept for interface compatibility.
            TODO(review): either enforce the step limit here or remove
            the parameter from callers.

    Returns:
        A ``Reward`` whose ``value`` is clipped to [-1.0, 1.0] and whose
        ``reason`` is a human-readable explanation of the components.
    """
    # F1 of everything reported so far vs. ground truth.
    current_f1 = compute_f1(action.issues, ground_truth)

    # NOTE(review): this penalty grows linearly with step_count, so the
    # cumulative penalty over an episode is quadratic; a literal reading
    # of "-0.05 per step" would be a flat -0.05 here. Confirm which
    # schedule is intended before changing the magnitude.
    step_penalty = -0.05 * step_count  # always <= 0 (or -0.0 at step 0)

    # Issues match on (line, category); duplicates collapse in the sets.
    truth_set = {(i.line, i.category) for i in ground_truth}
    agent_set = {(i.line, i.category) for i in action.issues}
    tp_count = len(truth_set & agent_set)
    fp_count = len(agent_set - truth_set)

    tp_reward = tp_count * 0.2
    fp_penalty = fp_count * 0.1  # positive magnitude; subtracted below
    reward_value = tp_reward - fp_penalty + step_penalty

    # BUG FIX: compare against the deduplicated truth_set, not the raw
    # list — tp_count is derived from truth_set, so with duplicate
    # (line, category) entries in ground_truth the original
    # `tp_count == len(ground_truth)` could never be True and the
    # completion bonus was unreachable. (Empty ground truth is trivially
    # "all found", so a final answer still earns the bonus.)
    all_found = tp_count == len(truth_set)
    if action.final and all_found:
        reward_value += 0.5
        reason = f"Final answer correct! F1={current_f1}"
    elif action.final:
        reason = f"Final answer submitted with F1={current_f1}"
        # Final but incomplete: per the reward spec, the episode reward
        # is the F1 score itself, overriding the shaped components.
        reward_value = current_f1
    else:
        # BUG FIX: step_penalty is already negative, so the original
        # "-{step_penalty:.2f}" rendered a double minus (e.g. "--0.15").
        # Format it directly; its own sign is the minus.
        reason = (
            f"Step {step_count}: {tp_count}/{len(ground_truth)} issues found. "
            f"+{tp_reward:.2f} -{fp_penalty:.2f} {step_penalty:.2f}"
        )

    # Clip to [-1, 1] for training stability.
    reward_value = max(-1.0, min(1.0, reward_value))
    return Reward(value=reward_value, reason=reason)