# server/graders/base_grader.py # Core grading utilities used by ALL domain graders. # # CHANGES FROM PREVIOUS VERSION: # 1. difficulty_multiplier() — REMOVED ENTIRELY. # The cap (hard→0.80, medium→0.90) made every hard task score identically # at 0.80 and every medium task at 0.90, regardless of agent quality. # This is exactly the wrong behaviour for an RL training environment: # GRPO needs variance WITHIN difficulty levels, not a uniform ceiling. # Task difficulty now comes from the grader logic and case design alone. # # 2. safe_score range: [0.01, 0.99] # The official spec says "strictly between 0 and 1". # Discord consensus from many participants confirmed 0.01/0.99 as the # correct interpretation. Do not change this back to [0.0, 1.0]. # # 3. Penalty values kept as-is (increased in last revision): # - repetition_penalty: -0.20 per repeat (was -0.15) # - invalid_action_penalty: -0.40 for wrong domain action (was -0.20) # - harmful_output_penalty: -0.50 for destructive patterns # These are intentionally higher to create real signal. # # 4. efficiency_bonus reduced to 0.05 (was 0.10). # Small enough that it doesn't inflate scores, but still rewards # agents that solve tasks efficiently. from typing import Dict, Any, List, Callable def safe_score(raw) -> float: """ Clamp score to [0.01, 0.99]. Never crash. Returns float. WHY [0.01, 0.99] NOT [0.0, 1.0]: - Official spec says scores must be strictly between 0 and 1 - Discord confirmed 0.01/0.99 as the correct practical interpretation - A score of exactly 0.0 from a broken run looks like a crash - A score of exactly 1.0 means the grader is trivially solved WHY 4 DECIMAL PLACES: - Keeps variance visible (0.4500 vs 0.4750 are meaningfully different) - round() handles float precision artifacts """ if raw is None: return 0.01 try: val = float(raw) return round(max(0.01, min(0.99, val)), 4) except (TypeError, ValueError): return 0.01 def repetition_penalty(action_type: str, last_actions: List[str], window: int = 3) -> float: """ Penalise repeating the same action type in the last N steps. WHY: Without this, GRPO agents discover they can emit the same high-scoring action repeatedly within an episode. The penalty forces genuine strategy exploration each turn. -0.20 per repeat (capped by window=3, so max penalty is -0.60). """ count = last_actions[-window:].count(action_type) return -0.20 * count def invalid_action_penalty(action_type: str, valid_actions: List[str]) -> float: """ Penalise actions not in the valid set for this domain. -0.40 because calling a dependency action on a security task is a fundamental routing error — it should hurt significantly. """ return -0.40 if action_type not in valid_actions else 0.0 def harmful_output_penalty(action: Dict, forbidden_patterns: List[str]) -> float: """ Penalise destructive patterns like 'os.remove', 'drop table'. -0.50 because these patterns represent the agent trying to "cheat" by deleting things rather than fixing them. """ action_str = str(action).lower() for p in forbidden_patterns: if p.lower() in action_str: return -0.50 return 0.0 def efficiency_bonus(step_count: int, max_steps: int, done: bool) -> float: """ Small bonus for finishing early — rewards decisive, confident agents. WHY ONLY 0.05: The correctness score must be the dominant signal. The efficiency bonus should never flip a mediocre answer into a good score. """ return 0.05 if done and step_count < max_steps // 2 else 0.0 def grade_dynamic( action: Dict[str, Any], session, compute_correctness_fn: Callable, valid_actions: List[str], forbidden_patterns: List[str] = None, max_steps: int = 8, ) -> float: """ Full reward pipeline. Entry point for all domain graders. Pipeline: 1. Invalid action check — if wrong domain action, return penalised score immediately 2. Repetition penalty — subtract for repeated action types 3. compute_correctness_fn — domain-specific grader (security/dep/clinical) 4. Harmful output penalty — subtract for destructive patterns 5. Efficiency bonus — add small bonus for early completion 6. safe_score — clamp to [0.01, 0.99] NOTE: difficulty_multiplier has been REMOVED. The task difficulty is expressed through: - Tighter CVSS ranges in hard cases (harder to guess) - More required_fix_tokens in hard cases - Adversarial reviewer_feedback in hard cases - Dependency graphs in hard clinical cases - Multiple checklist items with ordering in hard dep cases The grader itself should produce lower scores for harder tasks naturally. """ if forbidden_patterns is None: forbidden_patterns = [] action_type = action.get('action_type', 'unknown') # Step 1: Invalid action → skip grader entirely, return penalised score inv = invalid_action_penalty(action_type, valid_actions) rep = repetition_penalty(action_type, session.last_actions) if inv < 0: return safe_score(inv + rep) # Step 2: Domain-specific correctness correctness = compute_correctness_fn(action, session.task_case) if correctness is None: correctness = 0.01 # Step 3: Harmful output check harm = harmful_output_penalty(action, forbidden_patterns) # Step 4: Efficiency bonus eff = efficiency_bonus(session.step_count + 1, max_steps, correctness >= 0.75) # Step 5: Combine and clamp raw = correctness + rep + harm + eff return safe_score(raw)