from __future__ import annotations

import math
import random
from dataclasses import dataclass, field
from typing import Literal, TypedDict

from .arena_geometry import Arena
from .card_catalog import load_cards
from .reward_config import load_reward_config

# Abstract placement zones, named "<depth>_<lane>" where depth is one of
# back / mid / bridge and lane is left / right. Zones are expressed from the
# placing player's own perspective (see ToxicRoyaleSim._zone_point).
Zone = Literal[
    "bridge_left",
    "bridge_right",
    "back_left",
    "back_right",
    "mid_left",
    "mid_right",
]

# Emote identifiers. Within this file only "laugh" receives special handling
# (see ToxicRoyaleSim._update_tilt); any non-None emote nudges the tilt meter.
Emote = Literal["laugh", "yawn", "cry", "thanks", "chicken", "wp"]


class TowerHP(TypedDict):
    """Hit points of one side's three towers; the simulator treats 0 as destroyed."""

    left: int  # lane tower on the left lane
    right: int  # lane tower on the right lane
    king: int  # king tower; only damaged once the targeted lane tower is at 0


@dataclass
class Unit:
    owner: Literal["me", "opp"]
    card: str
    hp: float
    dps: float
    targets: Literal["ground", "air", "both", "towers"]
    zone: Zone
    is_air: bool
    stage: int  # 0=back, 1=mid, 2=bridge
    splash: bool = False
    radius: float = 0.0


@dataclass
class CardDef:
    """Simplified, simulator-facing definition of a single card."""

    key: str  # catalog key, e.g. "mini-pekka"
    name: str  # display name
    cost: int  # elixir cost
    kind: Literal["troop", "spell", "building"]
    # simplified combat stats (we keep them stable + fast; can be expanded later)
    hp: int = 0  # 0 for spells
    dps: int = 0  # 0 for spells
    targets: Literal["ground", "air", "both", "towers"] = "ground"
    is_air: bool = False
    spell_damage: int = 0  # only meaningful for spells
    splash: bool = False
    radius: float = 0.0  # a value > 0 makes a spell hit units in its lane
    description: str | None = None


def _infer_carddef(card_key: str, catalog: dict) -> CardDef:
    """
    Convert a real card record into our simplified simulator parameters.

    We keep the simulator intentionally abstract, so we infer missing values and
    clamp stats to keep learning stable.
    """
    c = catalog.get(card_key)
    if c is None:
        raise KeyError(card_key)

    kind: Literal["troop", "spell", "building"]
    if c.type == "Troop":
        kind = "troop"
    elif c.type == "Building":
        kind = "building"
    elif c.type == "Spell":
        kind = "spell"
    else:
        kind = "troop"

    # Map targets to a coarse set.
    targets = "both" if (c.targets and "Air" in c.targets) else "ground"
    if c.targets and "Buildings" in c.targets:
        targets = "towers"

    hp = int(c.hitpoints or 0)
    dps = int(c.damage_per_second or 0)
    dmg = int(c.damage or 0)

    # Fall back if DPS missing.
    if dps <= 0 and dmg > 0 and c.hit_speed:
        dps = int(dmg / max(0.1, float(c.hit_speed)))

    # Clamp to keep sim stable.
    hp = max(1, min(hp, 6000)) if kind != "spell" else 0
    dps = max(0, min(dps, 800)) if kind != "spell" else 0

    # Spell damage: use "damage" if available else a stable fallback by elixir.
    spell_damage = 0
    if kind == "spell":
        spell_damage = int(c.damage or (c.elixir * 120))
        spell_damage = max(40, min(spell_damage, 1600))

    is_air = bool(c.move_speed and "Air" in c.move_speed)  # imperfect

    # Splash/radius heuristics for our small pool (good enough for RL + rewards)
    splash = card_key in {"fireball", "arrows"}
    radius = 0.0
    if card_key == "fireball":
        radius = 0.35
    if card_key == "arrows":
        radius = 0.45

    # Targeting priors (closer to real CR interactions for our pool)
    # - Giant targets buildings (incl towers) only
    if card_key == "giant":
        targets = "towers"

    return CardDef(
        key=card_key,
        name=c.name,
        cost=int(c.elixir),
        kind=kind,
        hp=hp,
        dps=dps,
        targets=targets,  # type: ignore[arg-type]
        is_air=is_air,
        spell_damage=spell_damage,
        splash=splash,
        radius=radius,
        description=c.description,
    )

# Every placement zone, as a tuple so it can be sampled from; the scripted
# opponent picks its zone uniformly at random from this collection.
PLACEMENT_ZONES: tuple[Zone, ...] = (
    "bridge_left",
    "bridge_right",
    "back_left",
    "back_right",
    "mid_left",
    "mid_right",
)


def _zone_lane(zone: Zone) -> Literal["left", "right"]:
    return "left" if zone.endswith("_left") else "right"


def _is_air_zone(zone: Zone) -> bool:
    """Report whether ``zone`` forces units placed in it to be airborne.

    Currently a placeholder: it returns ``False`` for every zone, so a unit's
    air status comes solely from its card definition.
    """
    return False


def _zone_stage(zone: Zone) -> int:
    if zone.startswith("back_"):
        return 0
    if zone.startswith("mid_"):
        return 1
    return 2


@dataclass
class GameState:
    """Snapshot of the simulator as returned by ``ToxicRoyaleSim.state()``."""

    turn: int  # number of ticks elapsed in the episode
    time_remaining_s: float  # seconds left on the clock (regulation or overtime)
    double_elixir: bool  # True in the last 60 s and throughout overtime

    my_elixir: float
    opp_elixir_estimate: float  # populated from the simulator's actual opponent elixir

    my_tower_hp: TowerHP
    opp_tower_hp: TowerHP

    my_hand: list[str]
    my_next_card: str

    opp_hand_estimate: list[str]  # populated from the simulator's actual opponent hand

    # Lists are fresh, but the Unit objects inside are the simulator's own.
    my_units: list[Unit] = field(default_factory=list)
    opp_units: list[Unit] = field(default_factory=list)

    opp_tilt_meter: float = 0.0  # kept within [0, 1] by the simulator
    my_crowns: int = 0
    opp_crowns: int = 0

    invalid_action_last: bool = False  # whether the agent's latest action was rejected
    invalid_action_count: int = 0  # rejected agent actions so far this episode

    done: bool = False


class ToxicRoyaleSim:
    """
    Lightweight, fast text simulator.

    Design goals for hackathon:
    - deterministic-ish (seedable) and fast enough for many rollouts
    - objective, non-zero rewards early
    - supports "tilt meter" dynamics for novelty
    """

    def __init__(self, seed: int | None = None):
        self._rng = random.Random(seed)
        self._seed = seed
        self._last_events: list[str] = []
        self._invalid_last = False
        self._arena = Arena()
        self.reset()

    def _zone_point(self, owner: Literal["me", "opp"], zone: Zone) -> tuple[float, float]:
        """
        Map an abstract placement zone to a normalized arena (x,y).

        Convention:
        - y grows from opponent side (0) to my side (1)
        - our zone definitions are expressed from *my* perspective
          so we mirror y for opponent placements.
        """
        x = 0.33 if zone.endswith("_left") else 0.67
        if zone.startswith("back_"):
            y = 0.80
        elif zone.startswith("mid_"):
            y = 0.66
        else:
            y = 0.52
        if owner == "opp":
            y = 1.0 - y
        return x, y

    def _placement_is_valid(self, owner: Literal["me", "opp"], card: CardDef, zone: Zone) -> bool:
        x, y = self._zone_point(owner, zone)
        if card.kind == "spell":
            return True
        return self._arena.can_place_troop(owner, x, y)

    def reset(self) -> GameState:
        self._last_events = []
        self._turn = 0
        # Real-ish Clash timing: 3:00 regulation + overtime if tied.
        # Keep it simple but closer to the real feel of the screen you shared.
        self._time_remaining_s = 180.0
        self._in_overtime = False

        self._my_elixir = 5.0
        self._opp_elixir = 5.0

        self._my_tower_hp: TowerHP = {"left": 1400, "right": 1400, "king": 2400}
        self._opp_tower_hp: TowerHP = {"left": 1400, "right": 1400, "king": 2400}

        # Load real card catalog (RoyaleAPI static data) once per sim instance.
        if not hasattr(self, "_catalog"):
            self._catalog = load_cards()

        # Training pool: keep it small first (stable learning), but every card has a real description.
        # You can expand this list later to "all cards" without changing the env API.
        self._training_pool = [
            "giant",
            "knight",
            "minions",
            "archers",
            "fireball",
            "arrows",
            "musketeer",
            "mini-pekka",
        ]

        # Build simulator defs for the pool.
        self._cards: dict[str, CardDef] = {}
        for key in self._training_pool:
            # keys in RoyaleAPI dataset are kebab-case (e.g. "mini-pekka")
            self._cards[key] = _infer_carddef(key, self._catalog)
        self._cards["wait"] = CardDef(key="wait", name="Wait", cost=0, kind="spell", spell_damage=0)

        self._my_deck = list(self._training_pool)
        self._opp_deck = list(self._training_pool)
        self._rng.shuffle(self._my_deck)
        self._rng.shuffle(self._opp_deck)

        self._my_hand = [self._my_deck.pop() for _ in range(4)]
        self._my_next = self._my_deck.pop()

        self._opp_hand = [self._opp_deck.pop() for _ in range(4)]
        self._opp_next = self._opp_deck.pop()

        self._my_units: list[Unit] = []
        self._opp_units: list[Unit] = []

        self._opp_tilt = 0.0
        self._my_crowns = 0
        self._opp_crowns = 0
        self._done = False
        self._invalid_last = False
        self._invalid_count = 0
        self._my_spent_last = 0.0
        self._opp_spent_last = 0.0
        self._my_wait_last = False
        self._opp_wait_last = False
        self._last_spell_hits = 0
        self._last_spell_tower_dmg = 0
        self._punish_window = 0

        self._log_event("Game start.")
        return self.state()

    def state(self) -> GameState:
        return GameState(
            turn=self._turn,
            time_remaining_s=self._time_remaining_s,
            double_elixir=(self._time_remaining_s <= 60.0) or bool(getattr(self, "_in_overtime", False)),
            my_elixir=self._my_elixir,
            opp_elixir_estimate=self._opp_elixir,
            my_tower_hp=dict(self._my_tower_hp),
            opp_tower_hp=dict(self._opp_tower_hp),
            my_hand=list(self._my_hand),
            my_next_card=self._my_next,
            opp_hand_estimate=list(self._opp_hand),
            my_units=list(self._my_units),
            opp_units=list(self._opp_units),
            opp_tilt_meter=self._opp_tilt,
            my_crowns=self._my_crowns,
            opp_crowns=self._opp_crowns,
            invalid_action_last=self._invalid_last,
            invalid_action_count=self._invalid_count,
            done=self._done,
        )

    def last_events(self, k: int = 6) -> list[str]:
        return self._last_events[-k:]

    def step(self, *, kind: Literal["play", "wait"], card: str | None, zone: Zone | None, emote: str | None) -> dict:
        """
        Advance one tick (0.5s) with (agent action + scripted opponent response).
        Returns: a dict with reward_total and reward_breakdown.
        """
        if self._done:
            return {"reward_total": 0.0, "reward_breakdown": {"already_done": 0.0}}

        before = self._snapshot_score()
        self._invalid_last = False
        self._my_spent_last = 0.0
        self._opp_spent_last = 0.0
        self._my_wait_last = False
        self._opp_wait_last = False
        self._last_spell_hits = 0
        self._last_spell_tower_dmg = 0
        self._punish_window = max(0, int(getattr(self, "_punish_window", 0)) - 1)

        # --- apply agent action ---
        self._apply_player_action(owner="me", kind=kind, card=card, zone=zone)
        self._update_tilt(emote=emote)
        if self._invalid_last:
            self._invalid_count += 1

        # --- apply opponent action (scripted, tilt-affected) ---
        self._scripted_opponent_action()
        # If opponent just made a big investment, you have a short window to punish.
        if float(getattr(self, "_opp_spent_last", 0.0)) >= 7.0:
            self._punish_window = 4  # ~2 seconds (4 ticks)

        # --- tick combat + elixir regen ---
        self._tick()

        after = self._snapshot_score()
        breakdown = self._compute_rewards(before, after, emote=emote, invalid_action=self._invalid_last)
        return breakdown

    # -------------------------
    # Internal mechanics
    # -------------------------
    def _apply_player_action(self, *, owner: Literal["me", "opp"], kind: Literal["play", "wait"], card: str | None, zone: Zone | None):
        if kind == "wait":
            self._log_event(f"{owner} waits.")
            if owner == "me":
                self._my_wait_last = True
            else:
                self._opp_wait_last = True
            return

        if card is None or zone is None:
            self._log_event(f"{owner} attempted invalid play (missing card/zone).")
            if owner == "me":
                self._invalid_last = True
            return

        if owner == "me" and card not in self._my_hand:
            self._log_event(f"{owner} attempted to play {card} not in hand.")
            self._invalid_last = True
            return
        if owner == "opp" and card not in self._opp_hand:
            self._log_event(f"{owner} attempted to play {card} not in hand.")
            return

        cdef = self._cards.get(card)
        if cdef is None or card == "wait":
            self._log_event(f"{owner} attempted unknown card '{card}'.")
            if owner == "me":
                self._invalid_last = True
            return

        elixir = self._my_elixir if owner == "me" else self._opp_elixir
        if elixir + 1e-6 < cdef.cost:
            self._log_event(f"{owner} tried to overspend elixir on {card} (cost {cdef.cost}).")
            if owner == "me":
                self._invalid_last = True
            return

        if not self._placement_is_valid(owner, cdef, zone):
            self._log_event(f"{owner} attempted illegal placement: {card} at {zone}.")
            if owner == "me":
                self._invalid_last = True
            return

        # spend
        if owner == "me":
            self._my_elixir -= cdef.cost
            self._my_spent_last = float(cdef.cost)
        else:
            self._opp_elixir -= cdef.cost
            self._opp_spent_last = float(cdef.cost)

        # cycle hand
        if owner == "me":
            self._my_hand.remove(card)
            self._my_hand.append(self._my_next)
            self._my_next = self._my_deck.pop() if self._my_deck else self._rng.choice(self._my_hand)
        else:
            self._opp_hand.remove(card)
            self._opp_hand.append(self._opp_next)
            self._opp_next = self._opp_deck.pop() if self._opp_deck else self._rng.choice(self._opp_hand)

        if cdef.kind == "spell":
            self._apply_spell(owner=owner, card=card, zone=zone, dmg=cdef.spell_damage)
            return

        # troops/buildings become units
        is_air = cdef.is_air or _is_air_zone(zone)
        u = Unit(
            owner=owner,
            card=card,
            hp=float(cdef.hp),
            dps=float(cdef.dps),
            targets=cdef.targets,
            zone=zone,
            is_air=is_air,
            stage=_zone_stage(zone),
        )
        if owner == "me":
            self._my_units.append(u)
        else:
            self._opp_units.append(u)
        self._log_event(f"{owner} played {card} at {zone} (cost {cdef.cost}).")

    def _apply_spell(self, *, owner: Literal["me", "opp"], card: str, zone: Zone, dmg: int):
        lane = _zone_lane(zone)
        target_towers = self._opp_tower_hp if owner == "me" else self._my_tower_hp
        target_units = self._opp_units if owner == "me" else self._my_units

        cdef = self._cards.get(card)
        radius = float(getattr(cdef, "radius", 0.0) or 0.0)

        # --- AoE: hit up to N units in the lane (front-most first) ---
        if radius > 0:
            lane_units = [u for u in target_units if _zone_lane(u.zone) == lane]
            # Larger radius -> more units affected, capped.
            max_hits = 3 if radius < 0.40 else 5
            lane_units.sort(key=lambda u: (u.stage, -u.hp), reverse=True)
            hits = lane_units[:max_hits]
            if hits:
                for u in hits:
                    u.hp -= float(dmg)
                self._log_event(f"{owner} cast {card} hitting {len(hits)} units in {lane} for {dmg}.")
                self._last_spell_hits = max(self._last_spell_hits, len(hits))

        # --- Tower chip (spells can hit towers when targeted near them) ---
        # Bridge/mid zones chip lane tower lightly; back zones are assumed defensive.
        tower_mult = 1.0 if zone.startswith("bridge_") else (0.6 if zone.startswith("mid_") else 0.0)
        tower_dmg = int(dmg * tower_mult)
        if tower_dmg > 0:
            if target_towers[lane] > 0:
                target_towers[lane] = max(0, target_towers[lane] - tower_dmg)
                self._log_event(f"{owner} cast {card} chipping {lane} tower for {tower_dmg}.")
            else:
                target_towers["king"] = max(0, target_towers["king"] - int(tower_dmg * 0.5))
                self._log_event(f"{owner} cast {card} chipping king tower for {int(tower_dmg*0.5)}.")
            self._last_spell_tower_dmg = max(self._last_spell_tower_dmg, tower_dmg)

    def _tick(self):
        # one turn = 0.5 seconds
        self._turn += 1
        self._time_remaining_s = max(0.0, self._time_remaining_s - 0.5)

        # Overtime: if regulation ends tied, give 60s overtime (double elixir).
        if self._time_remaining_s <= 0.0 and not getattr(self, "_in_overtime", False):
            if self._my_crowns == self._opp_crowns:
                self._in_overtime = True
                self._time_remaining_s = 60.0
                self._log_event("Overtime!")

        double = (self._time_remaining_s <= 60.0) or bool(getattr(self, "_in_overtime", False))

        regen = 0.35 if not double else 0.7
        self._my_elixir = min(10.0, self._my_elixir + regen)
        self._opp_elixir = min(10.0, self._opp_elixir + regen)

        # combat: lane skirmishes (units fight units first, then towers)
        self._lane_skirmish(lane="left")
        self._lane_skirmish(lane="right")

        self._cleanup_dead()
        self._update_crowns_and_done()

    def _lane_skirmish(self, *, lane: Literal["left", "right"]):
        """
        Very simplified Clash-like combat:
        - units have a lane and a stage (back/mid/bridge)
        - troops advance toward bridge unless blocked by enemies
        - each tick, each side deals damage to one enemy unit (if any), else to the lane tower (else king)
        """
        my_units = [u for u in self._my_units if _zone_lane(u.zone) == lane]
        opp_units = [u for u in self._opp_units if _zone_lane(u.zone) == lane]

        # Advance troops if no enemies near bridge; buildings don't advance.
        if not opp_units:
            for u in my_units:
                if self._cards[u.card].kind == "troop":
                    u.stage = min(2, u.stage + 1)
        if not my_units:
            for u in opp_units:
                if self._cards[u.card].kind == "troop":
                    u.stage = min(2, u.stage + 1)

        # If both have units, they fight: front-most units exchange damage.
        if my_units and opp_units:
            my_front = max(my_units, key=lambda u: (u.stage, -u.hp))
            opp_front = max(opp_units, key=lambda u: (u.stage, -u.hp))

            # Targeting: some units prefer towers/buildings (giant) so they ignore troops if possible.
            # In this simplified sim, that means: if targets==towers, they do NOT damage enemy units.
            if my_front.targets != "towers":
                opp_front.hp -= max(0.0, my_front.dps) * 0.5
            if opp_front.targets != "towers":
                my_front.hp -= max(0.0, opp_front.dps) * 0.5
            return

        # Otherwise, any units at bridge pressure the tower.
        def tower_hit(units: list[Unit], target: TowerHP):
            # Only units that can hit towers contribute. (All troops can, but if a troop targets towers-only,
            # it still contributes here; if it doesn't target towers, it can still hit towers once no troops exist.)
            dps = sum(u.dps for u in units if u.stage >= 2)
            if dps <= 0:
                return
            if target[lane] > 0:
                target[lane] = max(0, target[lane] - int(dps * 0.45))
            else:
                target["king"] = max(0, target["king"] - int(dps * 0.20))

        if my_units:
            tower_hit(my_units, self._opp_tower_hp)
        if opp_units:
            tower_hit(opp_units, self._my_tower_hp)

    def _cleanup_dead(self):
        self._my_units = [u for u in self._my_units if u.hp > 0]
        self._opp_units = [u for u in self._opp_units if u.hp > 0]

    def _update_crowns_and_done(self):
        self._my_crowns = int(self._opp_tower_hp["left"] == 0) + int(self._opp_tower_hp["right"] == 0) + int(self._opp_tower_hp["king"] == 0)
        self._opp_crowns = int(self._my_tower_hp["left"] == 0) + int(self._my_tower_hp["right"] == 0) + int(self._my_tower_hp["king"] == 0)

        if self._my_crowns >= 3 or self._opp_crowns >= 3 or self._time_remaining_s <= 0.0:
            self._done = True

    def _scripted_opponent_action(self):
        # Tilt makes opponent overspend / pick worse card; we implement that as randomness in choice + zone.
        playable = [c for c in self._opp_hand if self._cards[c].cost <= self._opp_elixir]
        if not playable:
            self._apply_player_action(owner="opp", kind="wait", card=None, zone=None)
            return

        noise = self._opp_card_noise()
        if self._rng.random() < noise:
            # "tilt": pick a random playable (might be expensive)
            card = self._rng.choice(playable)
        else:
            # baseline: pick cheapest playable
            card = sorted(playable, key=lambda c: self._cards[c].cost)[0]

        # also sometimes choose a poor zone when tilted
        zone = self._rng.choice(PLACEMENT_ZONES)
        self._apply_player_action(owner="opp", kind="play", card=card, zone=zone)

    def _update_tilt(self, *, emote: str | None):
        # Decay
        self._opp_tilt = max(0.0, min(1.0, self._opp_tilt - 0.03))

        if emote is None:
            return

        # Detect "BM moment" proxies from last tick state deltas isn't available here; so use heuristics:
        # - if opponent is behind in total tower HP, emotes have bigger impact
        my_adv = self._tower_hp_advantage()
        timing_mult = 1.0 + min(1.0, max(0.0, my_adv / 1500.0))  # up to 2x
        base = 0.02
        if my_adv > 400:
            base = 0.08  # "dominance"
        if emote == "laugh" and my_adv > 800:
            base = 0.12
        self._opp_tilt = max(0.0, min(1.0, self._opp_tilt + base * timing_mult))

    def _opp_card_noise(self) -> float:
        if self._opp_tilt < 0.3:
            return 0.05
        if self._opp_tilt < 0.6:
            return 0.20
        if self._opp_tilt < 0.9:
            return 0.40
        return 0.60

    def _tower_hp_advantage(self) -> float:
        my = self._my_tower_hp["left"] + self._my_tower_hp["right"] + self._my_tower_hp["king"]
        opp = self._opp_tower_hp["left"] + self._opp_tower_hp["right"] + self._opp_tower_hp["king"]
        return float(my - opp) * -1.0  # positive when opp has less HP (i'm ahead)

    def _snapshot_score(self) -> dict:
        return {
            "my_total_tower_hp": self._my_tower_hp["left"] + self._my_tower_hp["right"] + self._my_tower_hp["king"],
            "opp_total_tower_hp": self._opp_tower_hp["left"] + self._opp_tower_hp["right"] + self._opp_tower_hp["king"],
            "my_crowns": self._my_crowns,
            "opp_crowns": self._opp_crowns,
            "opp_tilt": self._opp_tilt,
            "my_elixir": self._my_elixir,
            "invalid_last": self._invalid_last,
            "my_spent_last": float(getattr(self, "_my_spent_last", 0.0)),
            "opp_spent_last": float(getattr(self, "_opp_spent_last", 0.0)),
            "my_wait_last": bool(getattr(self, "_my_wait_last", False)),
            "spell_hits": int(getattr(self, "_last_spell_hits", 0)),
            "spell_tower_dmg": int(getattr(self, "_last_spell_tower_dmg", 0)),
            "punish_window": int(getattr(self, "_punish_window", 0)),
        }

    def _compute_rewards(self, before: dict, after: dict, *, emote: str | None, invalid_action: bool) -> dict:
        weights, params = load_reward_config()

        # 1) Crown differential (Jaso-style log scaling, normalized)
        crowns_won = after["my_crowns"]
        crowns_lost = after["opp_crowns"]
        r_crowns = (4.9 * math.log(4.8 * crowns_won + 0.75) + 1.4) - (4.9 * math.log(4.8 * crowns_lost + 0.75) + 1.4)
        r_crowns_norm = float(max(-15.0, min(15.0, r_crowns)) / 15.0)

        # 2) Tower damage (dense)
        dmg_dealt = before["opp_total_tower_hp"] - after["opp_total_tower_hp"]
        dmg_taken = before["my_total_tower_hp"] - after["my_total_tower_hp"]
        r_tower = (dmg_dealt - 0.8 * dmg_taken) / 1200.0  # scale to ~[-1,1] typical

        # 3) Elixir discipline (penalize floating at full elixir)
        r_elixir = 0.0
        if before["my_elixir"] >= params.full_elixir_threshold:
            r_elixir = params.full_elixir_penalty

        # 4) Emotes / tilt are NOT part of the reward.
        # They exist only as a novelty mechanic affecting opponent behavior.
        r_tilt = 0.0
        # 5) Anti-stall reward (replaces constant alive reward).
        # Penalize repeated waiting, especially when floating high elixir.
        r_stall = 0.0
        if bool(before.get("my_wait_last")):
            if float(before.get("my_elixir", 0.0)) >= params.tempo_elixir_threshold:
                r_stall = params.stall_wait_penalty_high_elixir
            else:
                r_stall = params.stall_wait_penalty_low_elixir

        # 6) Invalid action penalty (anti-hacking / stabilizes early RL)
        r_invalid = params.invalid_penalty if invalid_action else 0.0

        # 7) Tempo: discourage waiting when you have plenty elixir.
        r_tempo = 0.0
        if bool(before.get("my_wait_last")) and float(before.get("my_elixir", 0.0)) >= params.tempo_elixir_threshold:
            r_tempo = params.tempo_wait_penalty

        # 8) Spell value: reward multi-hit spells (a proxy for good "spell value").
        r_spell = 0.0
        hits = int(before.get("spell_hits", 0))
        tower_chip = int(before.get("spell_tower_dmg", 0))
        if hits >= 2:
            r_spell += 0.02 * min(5, hits)
        if tower_chip > 0:
            r_spell += min(0.05, tower_chip / 6000.0)

        # 9) Elixir efficiency: reward dealing damage with low spend (positive-trade proxy).
        spent = float(before.get("my_spent_last", 0.0))
        if spent > 0:
            r_eff = max(0.0, (dmg_dealt - 0.5 * dmg_taken) / (800.0 * spent))
        else:
            r_eff = 0.0

        # 10) Punish window: if opponent overcommitted recently, reward spending to pressure quickly.
        r_punish = 0.0
        if int(before.get("punish_window", 0)) > 0 and spent > 0.0:
            r_punish = params.punish_spend_reward

        # 11) Overcommit penalty: discourage going to (near) zero elixir unless it immediately creates advantage.
        r_overcommit = 0.0
        if spent > 0.0 and float(before.get("my_elixir", 0.0)) <= params.overcommit_elixir_threshold and dmg_dealt <= 0:
            r_overcommit = params.overcommit_penalty

        # 12) Invalid rate penalty (shapes away from repeated illegal actions).
        # Uses episode-to-date invalid count from the state (tracked elsewhere).
        inv_count = int(before.get("invalid_action_count", 0))
        steps = max(1, int(before.get("turn", 1)))
        inv_rate = inv_count / steps
        r_invalid_rate = params.invalid_rate_penalty * inv_rate

        # 13) Win bonus at terminal (verifiable).
        r_win = 0.0
        if bool(after.get("done")):
            my_c = int(after.get("my_crowns", 0))
            opp_c = int(after.get("opp_crowns", 0))
            if my_c > opp_c:
                r_win = 1.0
            elif my_c < opp_c:
                r_win = -1.0
            else:
                r_win = 0.0

        breakdown = {
            "crown_differential": r_crowns_norm,
            "tower_damage": float(r_tower),
            "elixir_discipline": float(r_elixir),
            "tilt_efficiency": float(r_tilt),
            "stall": float(r_stall),
            "invalid_action": float(r_invalid),
            "invalid_rate": float(r_invalid_rate),
            "tempo": float(r_tempo),
            "spell_value": float(r_spell),
            "elixir_efficiency": float(r_eff),
            "punish_window": float(r_punish),
            "overcommit": float(r_overcommit),
            "win_bonus": float(r_win),
        }
        total = (
            weights.crown_differential * breakdown["crown_differential"]
            + weights.tower_damage * breakdown["tower_damage"]
            + weights.elixir_discipline * breakdown["elixir_discipline"]
            + weights.invalid_action * breakdown["invalid_action"]
            + weights.invalid_rate * breakdown["invalid_rate"]
            + weights.tempo * breakdown["tempo"]
            + weights.spell_value * breakdown["spell_value"]
            + weights.elixir_efficiency * breakdown["elixir_efficiency"]
            + weights.punish_window * breakdown["punish_window"]
            + weights.overcommit * breakdown["overcommit"]
            + weights.stall * breakdown["stall"]
            + weights.win_bonus * breakdown["win_bonus"]
        )
        return {"reward_total": float(total), "reward_breakdown": breakdown}

    def _log_event(self, text: str):
        self._last_events.append(f"Turn {self._turn}: {text}")