"""
TeamForge Reward Function
Dense, shaped reward signal returned at every step.

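Formula (successful action that does not edit a test file):
  r = clamp(step_base_reward
            + per_action_reward
            + test_progress_bonus
            + lint_bonus,
            0.1, 0.9)

No negative penalties are applied. A failed action, a repeated failure,
or an edit to a test file short-circuits to a flat reward of 0.1 instead.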
"""

from __future__ import annotations

import re
from typing import Optional


# ─────────────────────────────────────────────
# CONSTANTS
# ─────────────────────────────────────────────

# Per-action rewards: added on top of the step base when the action succeeds.
PLAN_STEP_REWARD: float = 0.05
EDIT_FILE_REWARD: float = 0.05
COMMIT_REWARD: float = 0.10
REVIEW_REWARD: float = 0.15
REFLECT_REWARD: float = 0.10

# Progress bonuses: granted per newly passing test, and whenever a lint run
# reports zero violations.
TEST_PASS_BONUS_PER_TEST: float = 0.05
LINT_CLEAN_BONUS: float = 0.05

# Floor-level signals. These stand in for what would otherwise be negative
# penalties, so every returned reward stays inside the 0.1-0.9 band.
# 0.1 (rather than a value closer to 0) satisfies the "strictly between 0 and 1"
# requirement with a comfortable margin against rounding.
ACTION_ERROR_REWARD: float = 0.1       # first failure of an action type
REPEATED_FAILURE_REWARD: float = 0.1   # second and later consecutive failures
STEP_BASE_REWARD: float = 0.1          # starting value for every successful step
TEST_MODIFICATION_REWARD: float = 0.1  # returned when a test file is edited
LINT_VIOLATION_REWARD: float = 0.1     # bonus per lint violation removed


# ─────────────────────────────────────────────
# REWARD CALCULATOR
# ─────────────────────────────────────────────

class RewardCalculator:
    """
    Stateful reward calculator.

    Remembers the previously reported test-pass count and lint-violation
    count so that bonuses can be computed from the change between steps,
    and counts consecutive failures per action type.
    """

    def __init__(self):
        self._prev_tests_passed: int = 0
        self._prev_lint_violations: int = 0
        # action_type -> number of failures since that action last succeeded
        self._action_failure_counts: dict[str, int] = {}
        # Normalised (see _normalise_path) paths registered as test files
        self._test_files: list[str] = []

    @staticmethod
    def _normalise_path(path: str) -> str:
        """Return *path* lower-cased, with "/" separators and no leading "./"."""
        norm = path.strip().replace("\\", "/").lower()
        while norm.startswith("./"):
            norm = norm[2:]
        return norm

    def set_test_files(self, test_files: list[str]) -> None:
        """Register which files are tests (so we can penalise modification).

        Blank entries are dropped: an empty string is a suffix of every
        path and would otherwise mark every edited file as a test file.
        """
        normalised = (self._normalise_path(f) for f in test_files)
        self._test_files = [f for f in normalised if f]

    def compute(
        self,
        action_type: str,
        action_success: bool,
        action_output: str,
        tests_passed: Optional[int] = None,
        lint_violations: Optional[int] = None,
        edited_file: Optional[str] = None,
    ) -> float:
        """Return the reward for one environment step.

        Args:
            action_type: Name of the action taken; selects the per-action
                reward and keys the failure counter.
            action_success: Whether the action succeeded.
            action_output: Output of the action. Currently unused.
            tests_passed: Number of passing tests reported this step, or
                None when no test result is available.
            lint_violations: Number of lint violations reported this step,
                or None when no lint result is available.
            edited_file: Path of the file the action edited, if any.

        Returns:
            A flat floor reward when a test file was edited or the action
            failed; otherwise the shaped reward clamped to [0.1, 0.9] and
            rounded to 4 decimal places.
        """
        # Editing a test file short-circuits everything else: the failure
        # counters and the stored test/lint state are left untouched.
        if edited_file and self._is_test_file(edited_file):
            return TEST_MODIFICATION_REWARD

        # ── Action failure ──
        if not action_success:
            failures = self._action_failure_counts.get(action_type, 0) + 1
            self._action_failure_counts[action_type] = failures
            if failures >= 2:
                return REPEATED_FAILURE_REWARD
            return ACTION_ERROR_REWARD

        # A success clears the failure streak for this action type.
        self._action_failure_counts.pop(action_type, None)

        reward = (
            STEP_BASE_REWARD
            + self._action_reward(action_type)
            + self._test_progress_bonus(tests_passed)
            + self._lint_bonus(lint_violations)
        )

        # Final clamp to [0.1, 0.9] per OpenEnv validator requirement
        return round(max(0.1, min(0.9, reward)), 4)

    def _action_reward(self, action_type: str) -> float:
        """Return the reward for a successful action of *action_type*."""
        known = {
            "plan_step":        PLAN_STEP_REWARD,
            "edit_file":        EDIT_FILE_REWARD,
            "commit":           COMMIT_REWARD,
            "generate_review":  REVIEW_REWARD,
            "self_reflect":     REFLECT_REWARD,
            "run_tests":        0.02,
            "run_lint":         0.02,
            "request_iteration": 0.02,
        }
        # Unrecognised action types fall back to 0.1.
        return known.get(action_type, 0.1)

    def _test_progress_bonus(self, tests_passed: Optional[int]) -> float:
        """Return the bonus for newly passing tests and record the new count."""
        if tests_passed is None:
            return 0.0
        gained = tests_passed - self._prev_tests_passed
        # NOTE(review): the baseline also moves down on a regression, so the
        # same tests can earn the bonus again after breaking and being fixed.
        # Confirm whether that is intended.
        self._prev_tests_passed = tests_passed
        if gained > 0:
            return gained * TEST_PASS_BONUS_PER_TEST
        return 0.0

    def _lint_bonus(self, lint_violations: Optional[int]) -> float:
        """Return the lint bonus and record the new violation count."""
        if lint_violations is None:
            return 0.0
        previous = self._prev_lint_violations
        self._prev_lint_violations = lint_violations
        if lint_violations == 0:
            # NOTE(review): reaching zero pays only the flat clean bonus,
            # which can be less than the per-violation bonus for a partial
            # clean-up. Confirm whether that is intended.
            return LINT_CLEAN_BONUS
        removed = previous - lint_violations
        if removed > 0:  # fewer violations than last time
            return removed * LINT_VIOLATION_REWARD
        return 0.0

    def _is_test_file(self, path: str) -> bool:
        """Return True when *path* looks like, or is registered as, a test file.

        A path matches a registered test file when it is equal to it or
        ends with it at a path-component boundary. Independently, any file
        whose base name contains "test" is treated as a test file.
        """
        low = self._normalise_path(path)
        for tf in self._test_files:
            if not tf:
                # Defensive: an empty entry would match every path.
                continue
            if low == tf:
                return True
            # Suffix match must start at a "/" boundary so that a registered
            # "spec.py" does not also match "myspec.py".
            if low.endswith(tf) and (
                tf.startswith("/") or low[-len(tf) - 1] == "/"
            ):
                return True
        return "test" in low.rsplit("/", 1)[-1]