Spaces:

omm7
/

toxic-royale-env

Sleeping

App Files Files Community

toxic-royale-env / simulator.py

omm7

Upload folder using huggingface_hub

b0620f3 verified about 1 month ago

raw

history blame contribute delete

28.7 kB

	from __future__ import annotations

	import math
	import random
	from dataclasses import dataclass, field
	from typing import Literal, TypedDict

	from .arena_geometry import Arena
	from .card_catalog import load_cards
	from .reward_config import load_reward_config

	# Abstract placement zones, written from the agent's point of view. Each name
	# is "<depth>_<lane>": the helpers in this module read the depth from the
	# prefix ("back_", "mid_", otherwise bridge) and the lane from the suffix.
	Zone = Literal[
	    "bridge_left",
	    "bridge_right",
	    "back_left",
	    "back_right",
	    "mid_left",
	    "mid_right",
	]

	# Emote identifiers. NOTE(review): `ToxicRoyaleSim.step` types its `emote`
	# argument as a plain string and only "laugh" is special-cased by the tilt
	# logic, so this alias is not enforced anywhere in this file.
	Emote = Literal["laugh", "yawn", "cry", "thanks", "chicken", "wp"]


	class TowerHP(TypedDict):
	    """Hit points of one side's three towers, keyed by tower name."""

	    left: int  # tower guarding the left lane
	    right: int  # tower guarding the right lane
	    king: int  # only damaged once the lane tower being attacked is at 0


	@dataclass
	class Unit:
	    """A troop or building currently on the board."""

	    owner: Literal["me", "opp"]  # side that deployed the unit
	    card: str  # card key; combat looks the card up again to tell troops from buildings
	    hp: float  # remaining hit points; the unit is removed once this is not positive
	    dps: float  # damage per second (unit-vs-unit combat applies dps * 0.5 per tick)
	    targets: Literal["ground", "air", "both", "towers"]  # "towers" units never damage enemy units
	    zone: Zone  # zone the unit was placed in; its suffix decides the lane
	    is_air: bool
	    stage: int # 0=back, 1=mid, 2=bridge
	    splash: bool = False
	    radius: float = 0.0


	@dataclass
	class CardDef:
	    """Simplified, simulator-facing definition of a single card."""

	    key: str  # catalog key, e.g. "mini-pekka"
	    name: str  # display name
	    cost: int  # elixir cost
	    kind: Literal["troop", "spell", "building"]
	    # simplified combat stats (we keep them stable + fast; can be expanded later)
	    hp: int = 0
	    dps: int = 0
	    targets: Literal["ground", "air", "both", "towers"] = "ground"
	    is_air: bool = False
	    spell_damage: int = 0  # damage dealt when a "spell" card is cast
	    splash: bool = False
	    radius: float = 0.0  # > 0 makes a spell hit units in its lane
	    description: str | None = None


	def _infer_carddef(card_key: str, catalog: dict) -> CardDef:
	    """
	    Build the simulator's `CardDef` for `card_key` from a catalog record.

	    The simulator is intentionally abstract, so missing values are inferred
	    and stats are clamped to keep learning stable.

	    Raises:
	        KeyError: if `card_key` is not present in `catalog`.
	    """
	    record = catalog.get(card_key)
	    if record is None:
	        raise KeyError(card_key)

	    # Anything that is not explicitly a building or a spell is treated as a troop.
	    kind: Literal["troop", "spell", "building"]
	    if record.type == "Building":
	        kind = "building"
	    elif record.type == "Spell":
	        kind = "spell"
	    else:
	        kind = "troop"
	    is_spell = kind == "spell"

	    # Coarse targeting: "Buildings" wins over "Air"; everything else is ground-only.
	    declared = record.targets
	    if declared and "Buildings" in declared:
	        targets = "towers"
	    elif declared and "Air" in declared:
	        targets = "both"
	    else:
	        targets = "ground"

	    raw_hp = int(record.hitpoints or 0)
	    raw_dps = int(record.damage_per_second or 0)
	    raw_damage = int(record.damage or 0)

	    # No DPS in the record: derive it from per-hit damage and hit speed.
	    if raw_dps <= 0 and raw_damage > 0 and record.hit_speed:
	        raw_dps = int(raw_damage / max(0.1, float(record.hit_speed)))

	    # Spells carry no body stats; everything else is clamped to a stable range.
	    if is_spell:
	        hp = 0
	        dps = 0
	    else:
	        hp = max(1, min(raw_hp, 6000))
	        dps = max(0, min(raw_dps, 800))

	    # Spell damage comes from "damage" when present, otherwise scales with elixir.
	    spell_damage = 0
	    if is_spell:
	        spell_damage = int(record.damage or (record.elixir * 120))
	        spell_damage = max(40, min(spell_damage, 1600))

	    is_air = bool(record.move_speed and "Air" in record.move_speed)  # imperfect

	    # Splash/radius heuristics for the small training pool.
	    aoe_radius = {"fireball": 0.35, "arrows": 0.45}
	    splash = card_key in aoe_radius
	    radius = aoe_radius.get(card_key, 0.0)

	    # Giant targets buildings (incl. towers) only, whatever the record says.
	    if card_key == "giant":
	        targets = "towers"

	    return CardDef(
	        key=card_key,
	        name=record.name,
	        cost=int(record.elixir),
	        kind=kind,
	        hp=hp,
	        dps=dps,
	        targets=targets,  # type: ignore[arg-type]
	        is_air=is_air,
	        spell_damage=spell_damage,
	        splash=splash,
	        radius=radius,
	        description=record.description,
	    )

	# All placement zones as a runtime tuple (the `Zone` alias only exists for the
	# type checker). The scripted opponent picks its zone uniformly from this tuple.
	PLACEMENT_ZONES: tuple[Zone, ...] = (
	    "bridge_left",
	    "bridge_right",
	    "back_left",
	    "back_right",
	    "mid_left",
	    "mid_right",
	)


	def _zone_lane(zone: Zone) -> Literal["left", "right"]:
	    """Return the lane of `zone`; any name not ending in "_left" counts as "right"."""
	    if zone.endswith("_left"):
	        return "left"
	    return "right"


	def _is_air_zone(zone: Zone) -> bool:
	    """Report whether `zone` forces units placed there to be airborne.

	    Currently a stub: it ignores `zone` and always returns False, so a unit is
	    only airborne when its card definition says so.
	    """
	    return False


	def _zone_stage(zone: Zone) -> int:
	    """Return the depth stage of `zone`: 0 for back, 1 for mid, 2 for anything else (bridge)."""
	    for stage, prefix in enumerate(("back_", "mid_")):
	        if zone.startswith(prefix):
	            return stage
	    return 2


	@dataclass
	class GameState:
	    """Snapshot of the game as built by `ToxicRoyaleSim.state()`."""

	    turn: int  # number of ticks played so far
	    time_remaining_s: float
	    double_elixir: bool  # True in the last 60 s of the clock or during overtime

	    my_elixir: float
	    opp_elixir_estimate: float  # filled with the opponent's actual elixir by `state()`

	    my_tower_hp: TowerHP
	    opp_tower_hp: TowerHP

	    my_hand: list[str]
	    my_next_card: str

	    opp_hand_estimate: list[str]  # filled with a copy of the opponent's actual hand by `state()`

	    my_units: list[Unit] = field(default_factory=list)
	    opp_units: list[Unit] = field(default_factory=list)

	    opp_tilt_meter: float = 0.0  # the simulator keeps this within [0, 1]
	    my_crowns: int = 0
	    opp_crowns: int = 0

	    invalid_action_last: bool = False  # the agent's most recent action was rejected
	    invalid_action_count: int = 0  # rejected agent actions so far this game

	    done: bool = False


	class ToxicRoyaleSim:
	    """
	    Lightweight, fast text simulator.

	    One call to `step` applies the agent's action, lets a scripted opponent
	    respond, and advances the game by one 0.5 s tick.

	    Design goals for hackathon:
	    - deterministic-ish (seedable) and fast enough for many rollouts
	    - objective, non-zero rewards early
	    - supports "tilt meter" dynamics for novelty
	    """

	def __init__(self, seed: int | None = None):
	    """Create a simulator and immediately start a fresh game via `reset()`.

	    Args:
	        seed: seed for the simulator's private `random.Random`; `None` leaves
	            seeding to `random`.
	    """
	    self._rng = random.Random(seed)
	    self._seed = seed
	    self._last_events: list[str] = []
	    self._invalid_last = False
	    self._arena = Arena()
	    self.reset()

	def _zone_point(self, owner: Literal["me", "opp"], zone: Zone) -> tuple[float, float]:
	    """
	    Translate an abstract placement zone into normalized arena coordinates.

	    Convention:
	    - y runs from the opponent's side (0) to my side (1)
	    - zones are written from my perspective, so the opponent's y is mirrored
	    """
	    # Bridge depth is the default; back/mid override it.
	    y = 0.52
	    for prefix, depth in (("back_", 0.80), ("mid_", 0.66)):
	        if zone.startswith(prefix):
	            y = depth
	            break

	    x = 0.33 if zone.endswith("_left") else 0.67
	    if owner == "opp":
	        return x, 1.0 - y
	    return x, y

	def _placement_is_valid(self, owner: Literal["me", "opp"], card: CardDef, zone: Zone) -> bool:
	    """Return whether `owner` may place `card` in `zone`.

	    Spells are always allowed; any other card is checked against the arena
	    at the zone's normalized coordinates.
	    """
	    point = self._zone_point(owner, zone)
	    return card.kind == "spell" or self._arena.can_place_troop(owner, *point)

	def reset(self) -> GameState:
	    """
	    Start a new game and return its initial state.

	    Clears the event log, restores clock, elixir and tower HP, rebuilds the
	    card table for the training pool, and deals a shuffled 8-card deck to each
	    side (4 in hand, 1 as next card, 3 left in the deck).
	    """
	    self._last_events = []
	    self._turn = 0

	    # Real-ish Clash timing: 3:00 of regulation; overtime is added later if tied.
	    self._time_remaining_s = 180.0
	    self._in_overtime = False

	    self._my_elixir = 5.0
	    self._opp_elixir = 5.0

	    def full_towers() -> TowerHP:
	        # A fresh dict per side so the two sides never share tower state.
	        return {"left": 1400, "right": 1400, "king": 2400}

	    self._my_tower_hp: TowerHP = full_towers()
	    self._opp_tower_hp: TowerHP = full_towers()

	    # The card catalog is loaded once per simulator instance and reused across resets.
	    if not hasattr(self, "_catalog"):
	        self._catalog = load_cards()

	    # Training pool: deliberately small for stable learning. Keys follow the
	    # catalog's kebab-case spelling (e.g. "mini-pekka").
	    self._training_pool = [
	        "giant",
	        "knight",
	        "minions",
	        "archers",
	        "fireball",
	        "arrows",
	        "musketeer",
	        "mini-pekka",
	    ]

	    # Simulator definitions for the pool, plus a zero-cost "wait" pseudo-card.
	    self._cards: dict[str, CardDef] = {
	        key: _infer_carddef(key, self._catalog) for key in self._training_pool
	    }
	    self._cards["wait"] = CardDef(key="wait", name="Wait", cost=0, kind="spell", spell_damage=0)

	    # Shuffle my deck first, then the opponent's, so a given seed always deals the same game.
	    shuffled = []
	    for _ in range(2):
	        deck = list(self._training_pool)
	        self._rng.shuffle(deck)
	        shuffled.append(deck)
	    self._my_deck, self._opp_deck = shuffled

	    self._my_hand = [self._my_deck.pop() for _ in range(4)]
	    self._my_next = self._my_deck.pop()

	    self._opp_hand = [self._opp_deck.pop() for _ in range(4)]
	    self._opp_next = self._opp_deck.pop()

	    self._my_units: list[Unit] = []
	    self._opp_units: list[Unit] = []

	    # Scores and per-step bookkeeping.
	    self._opp_tilt = 0.0
	    self._my_crowns = 0
	    self._opp_crowns = 0
	    self._done = False
	    self._invalid_last = False
	    self._invalid_count = 0
	    self._my_spent_last = 0.0
	    self._opp_spent_last = 0.0
	    self._my_wait_last = False
	    self._opp_wait_last = False
	    self._last_spell_hits = 0
	    self._last_spell_tower_dmg = 0
	    self._punish_window = 0

	    self._log_event("Game start.")
	    return self.state()

	def state(self) -> GameState:
	    """Return a `GameState` snapshot of the current game.

	    Tower HP dicts, hands and unit lists are copied at the container level;
	    the `Unit` objects inside the unit lists are the live ones.
	    """
	    in_overtime = bool(getattr(self, "_in_overtime", False))
	    snapshot = {
	        "turn": self._turn,
	        "time_remaining_s": self._time_remaining_s,
	        # Double elixir covers the last minute of the clock and all of overtime.
	        "double_elixir": (self._time_remaining_s <= 60.0) or in_overtime,
	        "my_elixir": self._my_elixir,
	        "opp_elixir_estimate": self._opp_elixir,
	        "my_tower_hp": dict(self._my_tower_hp),
	        "opp_tower_hp": dict(self._opp_tower_hp),
	        "my_hand": list(self._my_hand),
	        "my_next_card": self._my_next,
	        "opp_hand_estimate": list(self._opp_hand),
	        "my_units": list(self._my_units),
	        "opp_units": list(self._opp_units),
	        "opp_tilt_meter": self._opp_tilt,
	        "my_crowns": self._my_crowns,
	        "opp_crowns": self._opp_crowns,
	        "invalid_action_last": self._invalid_last,
	        "invalid_action_count": self._invalid_count,
	        "done": self._done,
	    }
	    return GameState(**snapshot)

	def last_events(self, k: int = 6) -> list[str]:
	    """Return up to the `k` most recent event-log lines, oldest first.

	    Args:
	        k: maximum number of lines to return. A value of 0 or less yields an
	            empty list.

	    Returns:
	        A new list; the internal log is not modified.
	    """
	    if k <= 0:
	        # `log[-0:]` is the whole list and a negative `k` would slice from the
	        # front, so non-positive counts need an explicit guard.
	        return []
	    return self._last_events[-k:]

	def step(
	    self,
	    *,
	    kind: Literal["play", "wait"],
	    card: str | None = None,
	    zone: Zone | None = None,
	    emote: str | None = None,
	) -> dict:
	    """
	    Advance one tick (0.5s) with (agent action + scripted opponent response).

	    Args:
	        kind: "wait" to pass, anything else is treated as a card play.
	        card: key of the card to play; ignored for a wait.
	        zone: placement zone for the card; ignored for a wait.
	        emote: optional emote; it only feeds the opponent's tilt meter.

	    Returns:
	        A dict with "reward_total" and "reward_breakdown". Once the game is
	        done, a zero reward is returned and nothing else happens.
	    """
	    if self._done:
	        return {"reward_total": 0.0, "reward_breakdown": {"already_done": 0.0}}

	    # NOTE(review): this snapshot is taken before the per-step flags below are
	    # cleared, so its wait/spend/spell entries still describe the previous step.
	    before = self._snapshot_score()

	    # Clear per-step bookkeeping before any action is applied.
	    self._invalid_last = False
	    self._my_spent_last = 0.0
	    self._opp_spent_last = 0.0
	    self._my_wait_last = False
	    self._opp_wait_last = False
	    self._last_spell_hits = 0
	    self._last_spell_tower_dmg = 0
	    self._punish_window = max(0, int(getattr(self, "_punish_window", 0)) - 1)

	    # --- apply agent action ---
	    self._apply_player_action(owner="me", kind=kind, card=card, zone=zone)
	    self._update_tilt(emote=emote)
	    if self._invalid_last:
	        self._invalid_count += 1

	    # --- apply opponent action (scripted, tilt-affected) ---
	    self._scripted_opponent_action()
	    # If opponent just made a big investment, you have a short window to punish.
	    if float(getattr(self, "_opp_spent_last", 0.0)) >= 7.0:
	        self._punish_window = 4  # ~2 seconds (4 ticks)

	    # --- tick combat + elixir regen ---
	    self._tick()

	    after = self._snapshot_score()
	    return self._compute_rewards(before, after, emote=emote, invalid_action=self._invalid_last)

	# -------------------------
	# Internal mechanics
	# -------------------------
	def _apply_player_action(self, *, owner: Literal["me", "opp"], kind: Literal["play", "wait"], card: str | None, zone: Zone | None):
	    """
	    Validate and apply one action for `owner`.

	    A wait only sets the owner's wait flag. A play is rejected (and logged) when
	    the card or zone is missing, the card is not in the owner's hand, the card
	    is unknown, the owner cannot afford it, or the placement is illegal.
	    Rejections mark the action invalid only for the agent ("me"). An accepted
	    play spends elixir, cycles the hand, and either casts the spell or spawns
	    a unit.
	    """
	    mine = owner == "me"

	    if kind == "wait":
	        self._log_event(f"{owner} waits.")
	        if mine:
	            self._my_wait_last = True
	        else:
	            self._opp_wait_last = True
	        return

	    if card is None or zone is None:
	        self._log_event(f"{owner} attempted invalid play (missing card/zone).")
	        if mine:
	            self._invalid_last = True
	        return

	    hand = self._my_hand if mine else self._opp_hand
	    if card not in hand:
	        self._log_event(f"{owner} attempted to play {card} not in hand.")
	        if mine:
	            self._invalid_last = True
	        return

	    cdef = self._cards.get(card)
	    if cdef is None or card == "wait":
	        self._log_event(f"{owner} attempted unknown card '{card}'.")
	        if mine:
	            self._invalid_last = True
	        return

	    # Small tolerance so accumulated float regen does not block an exact-cost play.
	    elixir = self._my_elixir if mine else self._opp_elixir
	    if elixir + 1e-6 < cdef.cost:
	        self._log_event(f"{owner} tried to overspend elixir on {card} (cost {cdef.cost}).")
	        if mine:
	            self._invalid_last = True
	        return

	    if not self._placement_is_valid(owner, cdef, zone):
	        self._log_event(f"{owner} attempted illegal placement: {card} at {zone}.")
	        if mine:
	            self._invalid_last = True
	        return

	    # spend
	    if mine:
	        self._my_elixir -= cdef.cost
	        self._my_spent_last = float(cdef.cost)
	    else:
	        self._opp_elixir -= cdef.cost
	        self._opp_spent_last = float(cdef.cost)

	    # cycle hand: the queued card enters the hand, the played card goes to the
	    # bottom of the deck, and a new queued card is drawn from the top. Returning
	    # the played card keeps the deck from running dry, which previously caused
	    # the next card to be a duplicate of a card already in hand.
	    deck = self._my_deck if mine else self._opp_deck
	    hand.remove(card)
	    hand.append(self._my_next if mine else self._opp_next)
	    deck.insert(0, card)
	    upcoming = deck.pop()
	    if mine:
	        self._my_next = upcoming
	    else:
	        self._opp_next = upcoming

	    if cdef.kind == "spell":
	        self._apply_spell(owner=owner, card=card, zone=zone, dmg=cdef.spell_damage)
	        return

	    # troops/buildings become units
	    unit = Unit(
	        owner=owner,
	        card=card,
	        hp=float(cdef.hp),
	        dps=float(cdef.dps),
	        targets=cdef.targets,
	        zone=zone,
	        is_air=cdef.is_air or _is_air_zone(zone),
	        stage=_zone_stage(zone),
	        splash=cdef.splash,
	        radius=cdef.radius,
	    )
	    if mine:
	        self._my_units.append(unit)
	    else:
	        self._opp_units.append(unit)
	    self._log_event(f"{owner} played {card} at {zone} (cost {cdef.cost}).")

	def _apply_spell(self, *, owner: Literal["me", "opp"], card: str, zone: Zone, dmg: int):
	    """
	    Resolve a spell cast by `owner` into the lane of `zone`.

	    Cards with a positive radius damage up to 3 (radius < 0.40) or 5 enemy
	    units in that lane. Independently, bridge zones chip the enemy lane tower
	    for full damage and mid zones for 60%; if that tower is already down the
	    king tower takes half of the chip instead. Back zones never chip.

	    The per-step spell statistics feed the agent's spell-value reward, so they
	    are only recorded for the agent's own casts.
	    """
	    lane = _zone_lane(zone)
	    cast_by_me = owner == "me"
	    target_towers = self._opp_tower_hp if cast_by_me else self._my_tower_hp
	    target_units = self._opp_units if cast_by_me else self._my_units

	    cdef = self._cards.get(card)
	    radius = float(getattr(cdef, "radius", 0.0) or 0.0)

	    # --- AoE: hit up to N units in the lane (front-most first) ---
	    if radius > 0:
	        # Highest stage first; within a stage the lowest-hp unit comes first.
	        lane_units = sorted(
	            (u for u in target_units if _zone_lane(u.zone) == lane),
	            key=lambda u: (u.stage, -u.hp),
	            reverse=True,
	        )
	        # Larger radius -> more units affected, capped.
	        max_hits = 3 if radius < 0.40 else 5
	        hits = lane_units[:max_hits]
	        if hits:
	            for u in hits:
	                u.hp -= float(dmg)
	            self._log_event(f"{owner} cast {card} hitting {len(hits)} units in {lane} for {dmg}.")
	            if cast_by_me:
	                self._last_spell_hits = max(self._last_spell_hits, len(hits))

	    # --- Tower chip (spells can hit towers when targeted near them) ---
	    # Bridge/mid zones chip lane tower lightly; back zones are assumed defensive.
	    tower_mult = 1.0 if zone.startswith("bridge_") else (0.6 if zone.startswith("mid_") else 0.0)
	    tower_dmg = int(dmg * tower_mult)
	    if tower_dmg > 0:
	        if target_towers[lane] > 0:
	            applied = tower_dmg
	            target_towers[lane] = max(0, target_towers[lane] - applied)
	            self._log_event(f"{owner} cast {card} chipping {lane} tower for {applied}.")
	        else:
	            # Lane tower already down: the king takes half of the chip.
	            applied = int(tower_dmg * 0.5)
	            target_towers["king"] = max(0, target_towers["king"] - applied)
	            self._log_event(f"{owner} cast {card} chipping king tower for {applied}.")
	        if cast_by_me:
	            # Record the chip that was actually applied, not the pre-halving value.
	            self._last_spell_tower_dmg = max(self._last_spell_tower_dmg, applied)

	def _tick(self):
	    """Advance the clock by one 0.5 s turn: overtime check, elixir regen, combat, cleanup."""
	    self._turn += 1
	    self._time_remaining_s = max(0.0, self._time_remaining_s - 0.5)

	    # Overtime: a tied game gets 60 extra seconds the first time the clock hits zero.
	    regulation_over = self._time_remaining_s <= 0.0 and not getattr(self, "_in_overtime", False)
	    if regulation_over and self._my_crowns == self._opp_crowns:
	        self._in_overtime = True
	        self._time_remaining_s = 60.0
	        self._log_event("Overtime!")

	    # Elixir regenerates twice as fast in the last minute and in overtime; both sides cap at 10.
	    double = (self._time_remaining_s <= 60.0) or bool(getattr(self, "_in_overtime", False))
	    regen = 0.7 if double else 0.35
	    self._my_elixir = min(10.0, self._my_elixir + regen)
	    self._opp_elixir = min(10.0, self._opp_elixir + regen)

	    # combat: lane skirmishes (units fight units first, then towers)
	    for lane in ("left", "right"):
	        self._lane_skirmish(lane=lane)

	    self._cleanup_dead()
	    self._update_crowns_and_done()

	def _lane_skirmish(self, *, lane: Literal["left", "right"]):
	    """
	    Resolve one tick of combat in a single lane.

	    - Both sides present: only the two front-most units trade damage
	      (dps * 0.5 each); units that target "towers" deal none. Nobody advances
	      and no tower is hit.
	    - One side present: its troops advance one stage (buildings stay put) and
	      its units at the bridge hit the enemy lane tower, or the king tower once
	      the lane tower is down.
	    """
	    mine = [u for u in self._my_units if _zone_lane(u.zone) == lane]
	    theirs = [u for u in self._opp_units if _zone_lane(u.zone) == lane]

	    if mine and theirs:
	        # Front-most means highest stage; ties go to the unit with the least hp.
	        def front(squad: list[Unit]) -> Unit:
	            return max(squad, key=lambda u: (u.stage, -u.hp))

	        my_front = front(mine)
	        opp_front = front(theirs)

	        # Tower-targeting units (giant) ignore troops, so they deal no unit damage.
	        if my_front.targets != "towers":
	            opp_front.hp -= max(0.0, my_front.dps) * 0.5
	        if opp_front.targets != "towers":
	            my_front.hp -= max(0.0, opp_front.dps) * 0.5
	        return

	    # From here on at most one side has units in this lane.
	    for squad, enemy_towers in ((mine, self._opp_tower_hp), (theirs, self._my_tower_hp)):
	        if not squad:
	            continue

	        # Unopposed troops move one stage toward the bridge.
	        for u in squad:
	            if self._cards[u.card].kind == "troop":
	                u.stage = min(2, u.stage + 1)

	        # Every unit at the bridge contributes its dps, whatever it targets.
	        pressure = sum(u.dps for u in squad if u.stage >= 2)
	        if pressure <= 0:
	            continue
	        if enemy_towers[lane] > 0:
	            enemy_towers[lane] = max(0, enemy_towers[lane] - int(pressure * 0.45))
	        else:
	            enemy_towers["king"] = max(0, enemy_towers["king"] - int(pressure * 0.20))

	def _cleanup_dead(self):
	    """Rebuild both unit lists, keeping only units whose hp is still positive."""

	    def survivors(roster):
	        return list(filter(lambda unit: unit.hp > 0, roster))

	    self._my_units = survivors(self._my_units)
	    self._opp_units = survivors(self._opp_units)

	def _update_crowns_and_done(self):
	    """
	    Recompute both crown counts from tower HP and flag the game as finished.

	    Each destroyed lane tower is worth one crown. Destroying the king tower is
	    worth all three crowns, so it ends the game even if a lane tower still
	    stands. The game is also done once the clock reaches zero.
	    """

	    def crowns_against(towers: TowerHP) -> int:
	        # A dead king is an immediate three-crown result.
	        if towers["king"] == 0:
	            return 3
	        return int(towers["left"] == 0) + int(towers["right"] == 0)

	    self._my_crowns = crowns_against(self._opp_tower_hp)
	    self._opp_crowns = crowns_against(self._my_tower_hp)

	    if self._my_crowns >= 3 or self._opp_crowns >= 3 or self._time_remaining_s <= 0.0:
	        self._done = True

	def _scripted_opponent_action(self):
	    """
	    Choose and apply the opponent's action for this step.

	    The opponent waits when it cannot afford any card in hand. Otherwise it
	    plays its cheapest affordable card, except that with a probability given
	    by the tilt-dependent noise it plays a random affordable card instead. The
	    placement zone is always drawn at random.
	    """
	    affordable = [name for name in self._opp_hand if self._cards[name].cost <= self._opp_elixir]
	    if not affordable:
	        self._apply_player_action(owner="opp", kind="wait", card=None, zone=None)
	        return

	    noise = self._opp_card_noise()
	    if self._rng.random() >= noise:
	        # baseline: cheapest affordable card (first one in hand order on ties)
	        card = min(affordable, key=lambda name: self._cards[name].cost)
	    else:
	        # "tilt": any affordable card, possibly an expensive one
	        card = self._rng.choice(affordable)

	    zone = self._rng.choice(PLACEMENT_ZONES)
	    self._apply_player_action(owner="opp", kind="play", card=card, zone=zone)

	def _update_tilt(self, *, emote: str | None):
	    """
	    Decay the opponent's tilt meter, then raise it if the agent emoted.

	    Tilt always drops by 0.03 first. Any non-None emote then adds
	    `base * timing_mult`, where both factors grow with the value returned by
	    `_tower_hp_advantage()`. The meter is kept within [0, 1].
	    """
	    # Decay
	    self._opp_tilt = max(0.0, min(1.0, self._opp_tilt - 0.03))

	    if emote is None:
	        return

	    # Per-tick deltas are not available here, so the tower-HP lead stands in
	    # for a "BM moment": the bigger the lead, the harder an emote lands.
	    my_adv = self._tower_hp_advantage()
	    timing_mult = 1.0 + min(1.0, max(0.0, my_adv / 1500.0))  # up to 2x
	    base = 0.02
	    if my_adv > 400:
	        base = 0.08  # "dominance"
	    if emote == "laugh" and my_adv > 800:
	        base = 0.12
	    self._opp_tilt = max(0.0, min(1.0, self._opp_tilt + base * timing_mult))

	def _opp_card_noise(self) -> float:
	    """Return the probability that the opponent picks a random card, stepped by tilt."""
	    # (exclusive tilt ceiling, noise) pairs, checked in ascending order.
	    bands = ((0.3, 0.05), (0.6, 0.20), (0.9, 0.40))
	    for ceiling, noise in bands:
	        if self._opp_tilt < ceiling:
	            return noise
	    return 0.60

	def _tower_hp_advantage(self) -> float:
	    """Return my total tower HP minus the opponent's.

	    The result is positive when the opponent has less tower HP left (I am
	    ahead) and negative when I am behind.
	    """
	    my = self._my_tower_hp["left"] + self._my_tower_hp["right"] + self._my_tower_hp["king"]
	    opp = self._opp_tower_hp["left"] + self._opp_tower_hp["right"] + self._opp_tower_hp["king"]
	    return float(my - opp)

	def _snapshot_score(self) -> dict:
	    """
	    Capture the values the reward computation compares before and after a step.

	    Besides tower HP, crowns, tilt and elixir, the snapshot carries the
	    per-step bookkeeping flags and the episode-level `turn`,
	    `invalid_action_count` and `done` values that `_compute_rewards` looks up.
	    """
	    return {
	        "my_total_tower_hp": self._my_tower_hp["left"] + self._my_tower_hp["right"] + self._my_tower_hp["king"],
	        "opp_total_tower_hp": self._opp_tower_hp["left"] + self._opp_tower_hp["right"] + self._opp_tower_hp["king"],
	        "my_crowns": self._my_crowns,
	        "opp_crowns": self._opp_crowns,
	        "opp_tilt": self._opp_tilt,
	        "my_elixir": self._my_elixir,
	        "invalid_last": self._invalid_last,
	        "my_spent_last": float(getattr(self, "_my_spent_last", 0.0)),
	        "opp_spent_last": float(getattr(self, "_opp_spent_last", 0.0)),
	        "my_wait_last": bool(getattr(self, "_my_wait_last", False)),
	        "spell_hits": int(getattr(self, "_last_spell_hits", 0)),
	        "spell_tower_dmg": int(getattr(self, "_last_spell_tower_dmg", 0)),
	        "punish_window": int(getattr(self, "_punish_window", 0)),
	        # Without these three keys the reward code silently fell back to its
	        # defaults: no win bonus and a zero invalid-rate penalty.
	        "turn": int(getattr(self, "_turn", 0)),
	        "invalid_action_count": int(getattr(self, "_invalid_count", 0)),
	        "done": bool(getattr(self, "_done", False)),
	    }

	def _compute_rewards(self, before: dict, after: dict, *, emote: str | None, invalid_action: bool) -> dict:
	    """
	    Turn two score snapshots into a weighted reward for one step.

	    Args:
	        before: snapshot taken at the start of the step.
	        after: snapshot taken after the tick.
	        emote: accepted for symmetry with `step`; emotes are not rewarded.
	        invalid_action: whether the agent's action this step was rejected.

	    Returns:
	        {"reward_total": weighted sum, "reward_breakdown": unweighted components}.
	        Weights and thresholds come from `load_reward_config()`.

	    NOTE(review): the wait/spend/spell entries are read from `before`. If that
	    snapshot is taken before the per-step flags are cleared, those components
	    describe the previous step's action - confirm this lag is intended.
	    """
	    weights, params = load_reward_config()

	    # 1) Crown differential (Jaso-style log scaling, normalized)
	    crowns_won = after["my_crowns"]
	    crowns_lost = after["opp_crowns"]
	    r_crowns = (4.9 * math.log(4.8 * crowns_won + 0.75) + 1.4) - (4.9 * math.log(4.8 * crowns_lost + 0.75) + 1.4)
	    r_crowns_norm = float(max(-15.0, min(15.0, r_crowns)) / 15.0)

	    # 2) Tower damage (dense)
	    dmg_dealt = before["opp_total_tower_hp"] - after["opp_total_tower_hp"]
	    dmg_taken = before["my_total_tower_hp"] - after["my_total_tower_hp"]
	    r_tower = (dmg_dealt - 0.8 * dmg_taken) / 1200.0  # scale to ~[-1,1] typical

	    # 3) Elixir discipline (penalize floating at full elixir)
	    r_elixir = 0.0
	    if before["my_elixir"] >= params.full_elixir_threshold:
	        r_elixir = params.full_elixir_penalty

	    # 4) Emotes / tilt are NOT part of the reward.
	    # They exist only as a novelty mechanic affecting opponent behavior.
	    r_tilt = 0.0

	    waited = bool(before.get("my_wait_last"))
	    elixir_before = float(before.get("my_elixir", 0.0))
	    high_elixir = elixir_before >= params.tempo_elixir_threshold

	    # 5) Anti-stall reward (replaces constant alive reward).
	    # Penalize repeated waiting, especially when floating high elixir.
	    r_stall = 0.0
	    if waited:
	        r_stall = params.stall_wait_penalty_high_elixir if high_elixir else params.stall_wait_penalty_low_elixir

	    # 6) Invalid action penalty (anti-hacking / stabilizes early RL)
	    r_invalid = params.invalid_penalty if invalid_action else 0.0

	    # 7) Tempo: discourage waiting when you have plenty elixir.
	    r_tempo = params.tempo_wait_penalty if (waited and high_elixir) else 0.0

	    # 8) Spell value: reward multi-hit spells (a proxy for good "spell value").
	    r_spell = 0.0
	    hits = int(before.get("spell_hits", 0))
	    tower_chip = int(before.get("spell_tower_dmg", 0))
	    if hits >= 2:
	        r_spell += 0.02 * min(5, hits)
	    if tower_chip > 0:
	        r_spell += min(0.05, tower_chip / 6000.0)

	    # 9) Elixir efficiency: reward dealing damage with low spend (positive-trade proxy).
	    spent = float(before.get("my_spent_last", 0.0))
	    if spent > 0:
	        r_eff = max(0.0, (dmg_dealt - 0.5 * dmg_taken) / (800.0 * spent))
	    else:
	        r_eff = 0.0

	    # 10) Punish window: if opponent overcommitted recently, reward spending to pressure quickly.
	    r_punish = 0.0
	    if int(before.get("punish_window", 0)) > 0 and spent > 0.0:
	        r_punish = params.punish_spend_reward

	    # 11) Overcommit penalty: discourage going to (near) zero elixir unless it immediately creates advantage.
	    r_overcommit = 0.0
	    if spent > 0.0 and elixir_before <= params.overcommit_elixir_threshold and dmg_dealt <= 0:
	        r_overcommit = params.overcommit_penalty

	    # 12) Invalid rate penalty (shapes away from repeated illegal actions).
	    # Snapshots may not carry these keys; fall back to the live counters so the
	    # penalty is never silently computed from a default of zero.
	    inv_count = int(before.get("invalid_action_count", self._invalid_count))
	    steps = max(1, int(before.get("turn", self._turn)))
	    inv_rate = inv_count / steps
	    r_invalid_rate = params.invalid_rate_penalty * inv_rate

	    # 13) Win bonus at terminal (verifiable). Same fallback: a snapshot without
	    # "done" must not hide the end of the game.
	    r_win = 0.0
	    if bool(after.get("done", self._done)):
	        my_c = int(after.get("my_crowns", 0))
	        opp_c = int(after.get("opp_crowns", 0))
	        if my_c > opp_c:
	            r_win = 1.0
	        elif my_c < opp_c:
	            r_win = -1.0

	    breakdown = {
	        "crown_differential": r_crowns_norm,
	        "tower_damage": float(r_tower),
	        "elixir_discipline": float(r_elixir),
	        "tilt_efficiency": float(r_tilt),
	        "stall": float(r_stall),
	        "invalid_action": float(r_invalid),
	        "invalid_rate": float(r_invalid_rate),
	        "tempo": float(r_tempo),
	        "spell_value": float(r_spell),
	        "elixir_efficiency": float(r_eff),
	        "punish_window": float(r_punish),
	        "overcommit": float(r_overcommit),
	        "win_bonus": float(r_win),
	    }
	    # "tilt_efficiency" is reported (always 0.0) but has no term in the total.
	    total = (
	        weights.crown_differential * breakdown["crown_differential"]
	        + weights.tower_damage * breakdown["tower_damage"]
	        + weights.elixir_discipline * breakdown["elixir_discipline"]
	        + weights.invalid_action * breakdown["invalid_action"]
	        + weights.invalid_rate * breakdown["invalid_rate"]
	        + weights.tempo * breakdown["tempo"]
	        + weights.spell_value * breakdown["spell_value"]
	        + weights.elixir_efficiency * breakdown["elixir_efficiency"]
	        + weights.punish_window * breakdown["punish_window"]
	        + weights.overcommit * breakdown["overcommit"]
	        + weights.stall * breakdown["stall"]
	        + weights.win_bonus * breakdown["win_bonus"]
	    )
	    return {"reward_total": float(total), "reward_breakdown": breakdown}

	def _log_event(self, text: str):
	    """Append `text` to the event log, prefixed with the current turn number."""
	    entry = f"Turn {self._turn}: {text}"
	    self._last_events.append(entry)