toxic-royale-env / simulator.py
omm7's picture
Upload folder using huggingface_hub
b0620f3 verified
from __future__ import annotations
import math
import random
from dataclasses import dataclass, field
from typing import Literal, TypedDict
from .arena_geometry import Arena
from .card_catalog import load_cards
from .reward_config import load_reward_config
# Abstract placement zones, named "<depth>_<lane>": depth is one of
# back/mid/bridge and lane is left/right. Zones are expressed from the acting
# player's own perspective (the simulator mirrors them for the opponent).
Zone = Literal[
    "bridge_left",
    "bridge_right",
    "back_left",
    "back_right",
    "mid_left",
    "mid_right",
]
# Names of the emotes an agent may send alongside an action.
Emote = Literal["laugh", "yawn", "cry", "thanks", "chicken", "wp"]
class TowerHP(TypedDict):
    """Remaining hit points of one side's three towers."""

    left: int  # left-lane tower
    right: int  # right-lane tower
    king: int  # king tower
@dataclass
class Unit:
    """A troop or building currently on the board."""

    owner: Literal["me", "opp"]  # side that deployed the unit
    card: str  # key of the card that spawned it (looked up in the sim's card table)
    hp: float  # remaining hit points; the unit is removed once this is <= 0
    dps: float  # damage per second (combat applies a fraction of it per tick)
    targets: Literal["ground", "air", "both", "towers"]  # "towers" units never damage enemy units
    zone: Zone  # placement zone; the unit's lane is derived from it
    is_air: bool  # whether the unit is treated as flying
    stage: int # 0=back, 1=mid, 2=bridge
    splash: bool = False  # area-damage flag
    radius: float = 0.0  # area-damage radius in normalized arena units
@dataclass
class CardDef:
    """Simplified, simulator-facing definition of a single card."""

    key: str  # catalog key, e.g. "mini-pekka"
    name: str  # display name
    cost: int  # elixir cost
    kind: Literal["troop", "spell", "building"]
    # simplified combat stats (we keep them stable + fast; can be expanded later)
    hp: int = 0  # hit points of the spawned unit (0 for spells)
    dps: int = 0  # damage per second of the spawned unit (0 for spells)
    targets: Literal["ground", "air", "both", "towers"] = "ground"
    is_air: bool = False  # whether the spawned unit flies
    spell_damage: int = 0  # damage dealt when the card is a spell
    splash: bool = False  # whether the card deals area damage
    radius: float = 0.0  # area radius; > 0 lets a spell hit units in its lane
    description: str | None = None  # free-text card description from the catalog
def _infer_carddef(card_key: str, catalog: dict) -> CardDef:
    """
    Build the simulator's simplified CardDef from a real catalog record.

    The simulator is intentionally abstract, so missing stats are inferred
    and every combat number is clamped to keep learning stable.

    Raises:
        KeyError: if ``card_key`` is not present in ``catalog``.
    """
    record = catalog.get(card_key)
    if record is None:
        raise KeyError(card_key)

    # Anything that is not explicitly a Building or a Spell counts as a troop.
    kind: Literal["troop", "spell", "building"] = {
        "Troop": "troop",
        "Building": "building",
        "Spell": "spell",
    }.get(record.type, "troop")  # type: ignore[assignment]
    is_spell = kind == "spell"

    # Coarse targeting: building-targeters take priority over air-capable
    # units; everything else only hits ground.
    raw_targets = record.targets
    if raw_targets and "Buildings" in raw_targets:
        targets = "towers"
    elif raw_targets and "Air" in raw_targets:
        targets = "both"
    else:
        targets = "ground"

    hitpoints = int(record.hitpoints or 0)
    per_second = int(record.damage_per_second or 0)
    per_hit = int(record.damage or 0)

    # No DPS in the record: derive it from per-hit damage and hit speed.
    if per_second <= 0 and per_hit > 0 and record.hit_speed:
        per_second = int(per_hit / max(0.1, float(record.hit_speed)))

    if is_spell:
        # Spells carry no body stats; their damage falls back to a value
        # proportional to elixir cost when the record has none.
        hitpoints = 0
        per_second = 0
        spell_damage = int(record.damage or (record.elixir * 120))
        spell_damage = max(40, min(spell_damage, 1600))
    else:
        # Clamp to keep the sim stable.
        hitpoints = max(1, min(hitpoints, 6000))
        per_second = max(0, min(per_second, 800))
        spell_damage = 0

    flies = bool(record.move_speed and "Air" in record.move_speed)  # imperfect

    # Splash/radius heuristics for the small training pool.
    aoe_radius = {"fireball": 0.35, "arrows": 0.45}

    # Giant targets buildings (incl. towers) only.
    if card_key == "giant":
        targets = "towers"

    return CardDef(
        key=card_key,
        name=record.name,
        cost=int(record.elixir),
        kind=kind,
        hp=hitpoints,
        dps=per_second,
        targets=targets,  # type: ignore[arg-type]
        is_air=flies,
        spell_damage=spell_damage,
        splash=card_key in aoe_radius,
        radius=aoe_radius.get(card_key, 0.0),
        description=record.description,
    )
# Every zone a card may be placed in; the scripted opponent samples its
# placement uniformly from this tuple.
PLACEMENT_ZONES: tuple[Zone, ...] = (
    "bridge_left",
    "bridge_right",
    "back_left",
    "back_right",
    "mid_left",
    "mid_right",
)
def _zone_lane(zone: Zone) -> Literal["left", "right"]:
return "left" if zone.endswith("_left") else "right"
def _is_air_zone(zone: Zone) -> bool:
return False
def _zone_stage(zone: Zone) -> int:
if zone.startswith("back_"):
return 0
if zone.startswith("mid_"):
return 1
return 2
@dataclass
class GameState:
    """Snapshot of the match as produced by ``ToxicRoyaleSim.state()``."""

    turn: int  # number of ticks played so far
    time_remaining_s: float  # seconds left in regulation or overtime
    double_elixir: bool  # True in the last 60s and during overtime
    my_elixir: float
    opp_elixir_estimate: float  # filled from the opponent's actual elixir
    my_tower_hp: TowerHP
    opp_tower_hp: TowerHP
    my_hand: list[str]  # card keys currently playable by the agent
    my_next_card: str  # card that enters the hand after the next play
    opp_hand_estimate: list[str]  # filled from the opponent's actual hand
    my_units: list[Unit] = field(default_factory=list)
    opp_units: list[Unit] = field(default_factory=list)
    opp_tilt_meter: float = 0.0  # kept within [0, 1] by the simulator
    my_crowns: int = 0
    opp_crowns: int = 0
    invalid_action_last: bool = False  # whether the agent's latest action was rejected
    invalid_action_count: int = 0  # rejected agent actions so far this episode
    done: bool = False  # True once the match has ended
class ToxicRoyaleSim:
"""
Lightweight, fast text simulator.
Design goals for hackathon:
- deterministic-ish (seedable) and fast enough for many rollouts
- objective, non-zero rewards early
- supports "tilt meter" dynamics for novelty
"""
def __init__(self, seed: int | None = None):
    """
    Create a simulator and immediately start a fresh match.

    Args:
        seed: seed for the simulator's private random generator; ``None``
            leaves it unseeded.
    """
    self._seed = seed
    self._rng = random.Random(seed)
    self._last_events: list[str] = []
    self._invalid_last = False
    self._arena = Arena()
    # reset() builds all remaining per-match state.
    self.reset()
def _zone_point(self, owner: Literal["me", "opp"], zone: Zone) -> tuple[float, float]:
"""
Map an abstract placement zone to a normalized arena (x,y).
Convention:
- y grows from opponent side (0) to my side (1)
- our zone definitions are expressed from *my* perspective
so we mirror y for opponent placements.
"""
x = 0.33 if zone.endswith("_left") else 0.67
if zone.startswith("back_"):
y = 0.80
elif zone.startswith("mid_"):
y = 0.66
else:
y = 0.52
if owner == "opp":
y = 1.0 - y
return x, y
def _placement_is_valid(self, owner: Literal["me", "opp"], card: CardDef, zone: Zone) -> bool:
x, y = self._zone_point(owner, zone)
if card.kind == "spell":
return True
return self._arena.can_place_troop(owner, x, y)
def reset(self) -> GameState:
    """
    Start a new match and return its initial state.

    Clears the event log, restores clocks, elixir and tower HP, rebuilds the
    card table for the training pool, reshuffles and deals both decks, and
    zeroes every per-step tracker. The random generator is not re-seeded.
    """
    self._last_events = []
    self._turn = 0

    # Real-ish Clash timing: 3:00 regulation + overtime if tied.
    self._time_remaining_s = 180.0
    self._in_overtime = False

    self._my_elixir = 5.0
    self._opp_elixir = 5.0

    full_hp: TowerHP = {"left": 1400, "right": 1400, "king": 2400}
    self._my_tower_hp: TowerHP = dict(full_hp)
    self._opp_tower_hp: TowerHP = dict(full_hp)

    # The real card catalog is loaded once per simulator instance.
    if not hasattr(self, "_catalog"):
        self._catalog = load_cards()

    # Training pool: deliberately small for stable learning. Keys are
    # kebab-case, matching the catalog (e.g. "mini-pekka").
    self._training_pool = [
        "giant",
        "knight",
        "minions",
        "archers",
        "fireball",
        "arrows",
        "musketeer",
        "mini-pekka",
    ]

    # Simulator definitions for every pool card, plus the "wait" pseudo-card.
    card_table: dict[str, CardDef] = {}
    for pool_key in self._training_pool:
        card_table[pool_key] = _infer_carddef(pool_key, self._catalog)
    card_table["wait"] = CardDef(key="wait", name="Wait", cost=0, kind="spell", spell_damage=0)
    self._cards = card_table

    # Both sides play the same pool; shuffle mine first, then the opponent's.
    self._my_deck = list(self._training_pool)
    self._opp_deck = list(self._training_pool)
    self._rng.shuffle(self._my_deck)
    self._rng.shuffle(self._opp_deck)

    def deal(deck: list[str]) -> tuple[list[str], str]:
        # Four cards to the hand, then one more as the visible "next" card.
        opening = [deck.pop() for _ in range(4)]
        return opening, deck.pop()

    self._my_hand, self._my_next = deal(self._my_deck)
    self._opp_hand, self._opp_next = deal(self._opp_deck)

    self._my_units: list[Unit] = []
    self._opp_units: list[Unit] = []

    self._opp_tilt = 0.0
    self._my_crowns = 0
    self._opp_crowns = 0
    self._done = False

    # Per-step trackers consumed by the reward computation.
    self._invalid_last = False
    self._invalid_count = 0
    self._my_spent_last = 0.0
    self._opp_spent_last = 0.0
    self._my_wait_last = False
    self._opp_wait_last = False
    self._last_spell_hits = 0
    self._last_spell_tower_dmg = 0
    self._punish_window = 0

    self._log_event("Game start.")
    return self.state()
def state(self) -> GameState:
    """
    Return a GameState built from the simulator's current internals.

    Hands, unit lists and tower dicts are shallow-copied, so replacing
    entries in the returned containers does not affect the simulator (the
    Unit objects themselves are shared).
    """
    overtime = bool(getattr(self, "_in_overtime", False))
    # Double elixir applies in the final minute and throughout overtime.
    double_elixir = self._time_remaining_s <= 60.0 or overtime

    return GameState(
        # clock
        turn=self._turn,
        time_remaining_s=self._time_remaining_s,
        double_elixir=double_elixir,
        # resources
        my_elixir=self._my_elixir,
        opp_elixir_estimate=self._opp_elixir,
        # towers
        my_tower_hp={**self._my_tower_hp},
        opp_tower_hp={**self._opp_tower_hp},
        # cards
        my_hand=[*self._my_hand],
        my_next_card=self._my_next,
        opp_hand_estimate=[*self._opp_hand],
        # board
        my_units=[*self._my_units],
        opp_units=[*self._opp_units],
        # score / bookkeeping
        opp_tilt_meter=self._opp_tilt,
        my_crowns=self._my_crowns,
        opp_crowns=self._opp_crowns,
        invalid_action_last=self._invalid_last,
        invalid_action_count=self._invalid_count,
        done=self._done,
    )
def last_events(self, k: int = 6) -> list[str]:
    """
    Return the most recent ``k`` log entries, oldest first.

    Args:
        k: maximum number of entries to return. Zero or a negative value
            yields an empty list.
    """
    # A bare [-k:] slice would return the whole log for k == 0 (since -0 == 0)
    # and slice from the front for negative k, so guard those explicitly.
    if k <= 0:
        return []
    return self._last_events[-k:]
def step(self, *, kind: Literal["play", "wait"], card: str | None, zone: Zone | None, emote: str | None) -> dict:
    """
    Advance one tick (0.5s): agent action, scripted opponent reply, combat.

    Returns:
        A dict with ``reward_total`` and ``reward_breakdown``. Once the match
        is over, a zero reward is returned and nothing is simulated.
    """
    if self._done:
        return {"reward_total": 0.0, "reward_breakdown": {"already_done": 0.0}}

    # Snapshot first: it still carries the previous step's action trackers.
    before = self._snapshot_score()

    # Clear the per-step trackers and age the punish window by one tick.
    self._invalid_last = False
    self._my_spent_last = self._opp_spent_last = 0.0
    self._my_wait_last = self._opp_wait_last = False
    self._last_spell_hits = self._last_spell_tower_dmg = 0
    aged_window = int(getattr(self, "_punish_window", 0)) - 1
    self._punish_window = max(0, aged_window)

    # Agent acts first, then its emote nudges the opponent's tilt.
    self._apply_player_action(owner="me", kind=kind, card=card, zone=zone)
    self._update_tilt(emote=emote)
    if self._invalid_last:
        self._invalid_count += 1

    # Scripted, tilt-affected opponent reply.
    self._scripted_opponent_action()

    # A big opponent investment opens a short window to punish (~2 seconds).
    opp_spend = float(getattr(self, "_opp_spent_last", 0.0))
    if opp_spend >= 7.0:
        self._punish_window = 4

    # Combat and elixir regeneration.
    self._tick()

    return self._compute_rewards(
        before,
        self._snapshot_score(),
        emote=emote,
        invalid_action=self._invalid_last,
    )
# -------------------------
# Internal mechanics
# -------------------------
def _apply_player_action(self, *, owner: Literal["me", "opp"], kind: Literal["play", "wait"], card: str | None, zone: Zone | None):
if kind == "wait":
self._log_event(f"{owner} waits.")
if owner == "me":
self._my_wait_last = True
else:
self._opp_wait_last = True
return
if card is None or zone is None:
self._log_event(f"{owner} attempted invalid play (missing card/zone).")
if owner == "me":
self._invalid_last = True
return
if owner == "me" and card not in self._my_hand:
self._log_event(f"{owner} attempted to play {card} not in hand.")
self._invalid_last = True
return
if owner == "opp" and card not in self._opp_hand:
self._log_event(f"{owner} attempted to play {card} not in hand.")
return
cdef = self._cards.get(card)
if cdef is None or card == "wait":
self._log_event(f"{owner} attempted unknown card '{card}'.")
if owner == "me":
self._invalid_last = True
return
elixir = self._my_elixir if owner == "me" else self._opp_elixir
if elixir + 1e-6 < cdef.cost:
self._log_event(f"{owner} tried to overspend elixir on {card} (cost {cdef.cost}).")
if owner == "me":
self._invalid_last = True
return
if not self._placement_is_valid(owner, cdef, zone):
self._log_event(f"{owner} attempted illegal placement: {card} at {zone}.")
if owner == "me":
self._invalid_last = True
return
# spend
if owner == "me":
self._my_elixir -= cdef.cost
self._my_spent_last = float(cdef.cost)
else:
self._opp_elixir -= cdef.cost
self._opp_spent_last = float(cdef.cost)
# cycle hand
if owner == "me":
self._my_hand.remove(card)
self._my_hand.append(self._my_next)
self._my_next = self._my_deck.pop() if self._my_deck else self._rng.choice(self._my_hand)
else:
self._opp_hand.remove(card)
self._opp_hand.append(self._opp_next)
self._opp_next = self._opp_deck.pop() if self._opp_deck else self._rng.choice(self._opp_hand)
if cdef.kind == "spell":
self._apply_spell(owner=owner, card=card, zone=zone, dmg=cdef.spell_damage)
return
# troops/buildings become units
is_air = cdef.is_air or _is_air_zone(zone)
u = Unit(
owner=owner,
card=card,
hp=float(cdef.hp),
dps=float(cdef.dps),
targets=cdef.targets,
zone=zone,
is_air=is_air,
stage=_zone_stage(zone),
)
if owner == "me":
self._my_units.append(u)
else:
self._opp_units.append(u)
self._log_event(f"{owner} played {card} at {zone} (cost {cdef.cost}).")
def _apply_spell(self, *, owner: Literal["me", "opp"], card: str, zone: Zone, dmg: int):
    """
    Resolve a spell cast by ``owner`` into ``zone``.

    Spells with a positive radius damage up to a capped number of enemy
    units in the zone's lane (front-most first). Bridge and mid casts also
    chip the enemy lane tower, or the king tower at half damage once the
    lane tower is down.

    The ``_last_spell_hits`` / ``_last_spell_tower_dmg`` trackers feed the
    agent's spell-value reward, so they are only updated for the agent's own
    casts; previously an opponent spell bumped them too.
    """
    lane = _zone_lane(zone)
    by_me = owner == "me"
    target_towers = self._opp_tower_hp if by_me else self._my_tower_hp
    target_units = self._opp_units if by_me else self._my_units
    cdef = self._cards.get(card)
    radius = float(getattr(cdef, "radius", 0.0) or 0.0)

    # --- AoE: hit up to N units in the lane (front-most first) ---
    if radius > 0:
        lane_units = [u for u in target_units if _zone_lane(u.zone) == lane]
        # Larger radius -> more units affected, capped.
        max_hits = 3 if radius < 0.40 else 5
        # Highest stage first; among equals the lowest-HP unit is hit first.
        lane_units.sort(key=lambda u: (u.stage, -u.hp), reverse=True)
        hits = lane_units[:max_hits]
        if hits:
            for u in hits:
                u.hp -= float(dmg)
            self._log_event(f"{owner} cast {card} hitting {len(hits)} units in {lane} for {dmg}.")
            if by_me:
                self._last_spell_hits = max(self._last_spell_hits, len(hits))

    # --- Tower chip (spells can hit towers when targeted near them) ---
    # Bridge zones chip at full damage, mid zones at 60%; back zones are
    # assumed defensive and never reach a tower.
    tower_mult = 1.0 if zone.startswith("bridge_") else (0.6 if zone.startswith("mid_") else 0.0)
    tower_dmg = int(dmg * tower_mult)
    if tower_dmg > 0:
        if target_towers[lane] > 0:
            target_towers[lane] = max(0, target_towers[lane] - tower_dmg)
            self._log_event(f"{owner} cast {card} chipping {lane} tower for {tower_dmg}.")
        else:
            target_towers["king"] = max(0, target_towers["king"] - int(tower_dmg * 0.5))
            self._log_event(f"{owner} cast {card} chipping king tower for {int(tower_dmg*0.5)}.")
        if by_me:
            self._last_spell_tower_dmg = max(self._last_spell_tower_dmg, tower_dmg)
def _tick(self):
# one turn = 0.5 seconds
self._turn += 1
self._time_remaining_s = max(0.0, self._time_remaining_s - 0.5)
# Overtime: if regulation ends tied, give 60s overtime (double elixir).
if self._time_remaining_s <= 0.0 and not getattr(self, "_in_overtime", False):
if self._my_crowns == self._opp_crowns:
self._in_overtime = True
self._time_remaining_s = 60.0
self._log_event("Overtime!")
double = (self._time_remaining_s <= 60.0) or bool(getattr(self, "_in_overtime", False))
regen = 0.35 if not double else 0.7
self._my_elixir = min(10.0, self._my_elixir + regen)
self._opp_elixir = min(10.0, self._opp_elixir + regen)
# combat: lane skirmishes (units fight units first, then towers)
self._lane_skirmish(lane="left")
self._lane_skirmish(lane="right")
self._cleanup_dead()
self._update_crowns_and_done()
def _lane_skirmish(self, *, lane: Literal["left", "right"]):
    """
    Resolve one tick of very simplified combat in a single lane.

    - Troops advance one stage (capped at the bridge) only while the enemy
      has no units in the lane; buildings never move.
    - When both sides have units, only the two front-most units trade damage
      and no tower is touched this tick. Units that target towers deal no
      damage to enemy units.
    - Otherwise, units that have reached the bridge damage the enemy lane
      tower, or the king tower once the lane tower is down.
    """
    mine = [u for u in self._my_units if _zone_lane(u.zone) == lane]
    theirs = [u for u in self._opp_units if _zone_lane(u.zone) == lane]

    # Movement: a side advances only when nothing opposes it in this lane.
    for movers, blockers in ((mine, theirs), (theirs, mine)):
        if blockers:
            continue
        for unit in movers:
            if self._cards[unit.card].kind == "troop":
                unit.stage = min(2, unit.stage + 1)

    if mine and theirs:
        # Front-most = highest stage; ties go to the unit with the least HP.
        def frontness(u: Unit):
            return (u.stage, -u.hp)

        my_front = max(mine, key=frontness)
        opp_front = max(theirs, key=frontness)
        # Tower-only attackers (e.g. giant) ignore enemy troops entirely.
        if my_front.targets != "towers":
            opp_front.hp -= max(0.0, my_front.dps) * 0.5
        if opp_front.targets != "towers":
            my_front.hp -= max(0.0, opp_front.dps) * 0.5
        return

    # Uncontested lane: every unit at the bridge pressures the tower.
    for attackers, towers in ((mine, self._opp_tower_hp), (theirs, self._my_tower_hp)):
        if not attackers:
            continue
        pressure = sum(u.dps for u in attackers if u.stage >= 2)
        if pressure <= 0:
            continue
        if towers[lane] > 0:
            towers[lane] = max(0, towers[lane] - int(pressure * 0.45))
        else:
            towers["king"] = max(0, towers["king"] - int(pressure * 0.20))
def _cleanup_dead(self):
self._my_units = [u for u in self._my_units if u.hp > 0]
self._opp_units = [u for u in self._opp_units if u.hp > 0]
def _update_crowns_and_done(self):
self._my_crowns = int(self._opp_tower_hp["left"] == 0) + int(self._opp_tower_hp["right"] == 0) + int(self._opp_tower_hp["king"] == 0)
self._opp_crowns = int(self._my_tower_hp["left"] == 0) + int(self._my_tower_hp["right"] == 0) + int(self._my_tower_hp["king"] == 0)
if self._my_crowns >= 3 or self._opp_crowns >= 3 or self._time_remaining_s <= 0.0:
self._done = True
def _scripted_opponent_action(self):
    """
    Choose and apply the scripted opponent's action for this step.

    The opponent waits when it cannot afford anything. Otherwise it plays its
    cheapest affordable card, except that with a tilt-dependent probability
    it plays a random affordable one instead. The zone is always sampled
    uniformly from the placement zones.
    """
    affordable = [name for name in self._opp_hand if self._cards[name].cost <= self._opp_elixir]
    if not affordable:
        self._apply_player_action(owner="opp", kind="wait", card=None, zone=None)
        return

    def price(name: str) -> int:
        return self._cards[name].cost

    noise = self._opp_card_noise()
    tilted = self._rng.random() < noise
    if tilted:
        # "tilt": any affordable card, possibly an expensive one
        pick = self._rng.choice(affordable)
    else:
        # baseline: cheapest affordable card (first one on ties)
        pick = min(affordable, key=price)

    where = self._rng.choice(PLACEMENT_ZONES)
    self._apply_player_action(owner="opp", kind="play", card=pick, zone=where)
def _update_tilt(self, *, emote: str | None):
# Decay
self._opp_tilt = max(0.0, min(1.0, self._opp_tilt - 0.03))
if emote is None:
return
# Detect "BM moment" proxies from last tick state deltas isn't available here; so use heuristics:
# - if opponent is behind in total tower HP, emotes have bigger impact
my_adv = self._tower_hp_advantage()
timing_mult = 1.0 + min(1.0, max(0.0, my_adv / 1500.0)) # up to 2x
base = 0.02
if my_adv > 400:
base = 0.08 # "dominance"
if emote == "laugh" and my_adv > 800:
base = 0.12
self._opp_tilt = max(0.0, min(1.0, self._opp_tilt + base * timing_mult))
def _opp_card_noise(self) -> float:
if self._opp_tilt < 0.3:
return 0.05
if self._opp_tilt < 0.6:
return 0.20
if self._opp_tilt < 0.9:
return 0.40
return 0.60
def _tower_hp_advantage(self) -> float:
my = self._my_tower_hp["left"] + self._my_tower_hp["right"] + self._my_tower_hp["king"]
opp = self._opp_tower_hp["left"] + self._opp_tower_hp["right"] + self._opp_tower_hp["king"]
return float(my - opp) * -1.0 # positive when opp has less HP (i'm ahead)
def _snapshot_score(self) -> dict:
return {
"my_total_tower_hp": self._my_tower_hp["left"] + self._my_tower_hp["right"] + self._my_tower_hp["king"],
"opp_total_tower_hp": self._opp_tower_hp["left"] + self._opp_tower_hp["right"] + self._opp_tower_hp["king"],
"my_crowns": self._my_crowns,
"opp_crowns": self._opp_crowns,
"opp_tilt": self._opp_tilt,
"my_elixir": self._my_elixir,
"invalid_last": self._invalid_last,
"my_spent_last": float(getattr(self, "_my_spent_last", 0.0)),
"opp_spent_last": float(getattr(self, "_opp_spent_last", 0.0)),
"my_wait_last": bool(getattr(self, "_my_wait_last", False)),
"spell_hits": int(getattr(self, "_last_spell_hits", 0)),
"spell_tower_dmg": int(getattr(self, "_last_spell_tower_dmg", 0)),
"punish_window": int(getattr(self, "_punish_window", 0)),
}
def _compute_rewards(self, before: dict, after: dict, *, emote: str | None, invalid_action: bool) -> dict:
    """
    Turn two score snapshots into a weighted reward and its breakdown.

    Args:
        before: snapshot taken at the start of the step. ``step()`` takes it
            before clearing the per-step trackers, so its ``my_spent_last``,
            ``my_wait_last``, ``spell_*`` and ``punish_window`` entries
            describe the previous step's action.
        after: snapshot taken once the tick has been resolved.
        emote: emote sent this step. It does not contribute to the reward.
        invalid_action: whether the agent's action this step was rejected.

    Returns:
        ``{"reward_total": float, "reward_breakdown": dict}``.

    The invalid-rate and win-bonus terms read ``invalid_action_count``,
    ``turn`` and ``done`` from the snapshots. When a snapshot lacks those
    keys the live simulator values are used, so the terms no longer
    silently evaluate to 0.
    """
    weights, params = load_reward_config()

    # 1) Crown differential (Jaso-style log scaling, normalized)
    crowns_won = after["my_crowns"]
    crowns_lost = after["opp_crowns"]
    r_crowns = (4.9 * math.log(4.8 * crowns_won + 0.75) + 1.4) - (4.9 * math.log(4.8 * crowns_lost + 0.75) + 1.4)
    r_crowns_norm = float(max(-15.0, min(15.0, r_crowns)) / 15.0)

    # 2) Tower damage (dense)
    dmg_dealt = before["opp_total_tower_hp"] - after["opp_total_tower_hp"]
    dmg_taken = before["my_total_tower_hp"] - after["my_total_tower_hp"]
    r_tower = (dmg_dealt - 0.8 * dmg_taken) / 1200.0  # scale to ~[-1,1] typical

    # 3) Elixir discipline (penalize floating at full elixir)
    r_elixir = 0.0
    if before["my_elixir"] >= params.full_elixir_threshold:
        r_elixir = params.full_elixir_penalty

    # 4) Emotes / tilt are NOT part of the reward.
    # They exist only as a novelty mechanic affecting opponent behavior.
    r_tilt = 0.0

    # 5) Anti-stall reward (replaces constant alive reward).
    # Penalize waiting, especially when floating high elixir.
    waited = bool(before.get("my_wait_last"))
    high_elixir = float(before.get("my_elixir", 0.0)) >= params.tempo_elixir_threshold
    r_stall = 0.0
    if waited:
        r_stall = params.stall_wait_penalty_high_elixir if high_elixir else params.stall_wait_penalty_low_elixir

    # 6) Invalid action penalty (anti-hacking / stabilizes early RL)
    r_invalid = params.invalid_penalty if invalid_action else 0.0

    # 7) Tempo: discourage waiting when you have plenty elixir.
    r_tempo = params.tempo_wait_penalty if (waited and high_elixir) else 0.0

    # 8) Spell value: reward multi-hit spells (a proxy for good "spell value").
    r_spell = 0.0
    hits = int(before.get("spell_hits", 0))
    tower_chip = int(before.get("spell_tower_dmg", 0))
    if hits >= 2:
        r_spell += 0.02 * min(5, hits)
    if tower_chip > 0:
        r_spell += min(0.05, tower_chip / 6000.0)

    # 9) Elixir efficiency: reward dealing damage with low spend (positive-trade proxy).
    spent = float(before.get("my_spent_last", 0.0))
    if spent > 0:
        r_eff = max(0.0, (dmg_dealt - 0.5 * dmg_taken) / (800.0 * spent))
    else:
        r_eff = 0.0

    # 10) Punish window: if opponent overcommitted recently, reward spending to pressure quickly.
    r_punish = 0.0
    if int(before.get("punish_window", 0)) > 0 and spent > 0.0:
        r_punish = params.punish_spend_reward

    # 11) Overcommit penalty: discourage going to (near) zero elixir unless it immediately creates advantage.
    r_overcommit = 0.0
    if spent > 0.0 and float(before.get("my_elixir", 0.0)) <= params.overcommit_elixir_threshold and dmg_dealt <= 0:
        r_overcommit = params.overcommit_penalty

    # 12) Invalid rate penalty (shapes away from repeated illegal actions).
    # Falls back to the simulator's live counters when the snapshot does not
    # carry them; without the fallback this term was always zero.
    inv_count = int(before.get("invalid_action_count", getattr(self, "_invalid_count", 0)))
    steps = max(1, int(before.get("turn", getattr(self, "_turn", 1))))
    inv_rate = inv_count / steps
    r_invalid_rate = params.invalid_rate_penalty * inv_rate

    # 13) Win bonus at terminal (verifiable). Same fallback as above for "done".
    r_win = 0.0
    if bool(after.get("done", getattr(self, "_done", False))):
        my_c = int(after.get("my_crowns", 0))
        opp_c = int(after.get("opp_crowns", 0))
        if my_c > opp_c:
            r_win = 1.0
        elif my_c < opp_c:
            r_win = -1.0

    breakdown = {
        "crown_differential": r_crowns_norm,
        "tower_damage": float(r_tower),
        "elixir_discipline": float(r_elixir),
        "tilt_efficiency": float(r_tilt),
        "stall": float(r_stall),
        "invalid_action": float(r_invalid),
        "invalid_rate": float(r_invalid_rate),
        "tempo": float(r_tempo),
        "spell_value": float(r_spell),
        "elixir_efficiency": float(r_eff),
        "punish_window": float(r_punish),
        "overcommit": float(r_overcommit),
        "win_bonus": float(r_win),
    }

    # "tilt_efficiency" is reported in the breakdown but deliberately left
    # out of the weighted total.
    total = (
        weights.crown_differential * breakdown["crown_differential"]
        + weights.tower_damage * breakdown["tower_damage"]
        + weights.elixir_discipline * breakdown["elixir_discipline"]
        + weights.invalid_action * breakdown["invalid_action"]
        + weights.invalid_rate * breakdown["invalid_rate"]
        + weights.tempo * breakdown["tempo"]
        + weights.spell_value * breakdown["spell_value"]
        + weights.elixir_efficiency * breakdown["elixir_efficiency"]
        + weights.punish_window * breakdown["punish_window"]
        + weights.overcommit * breakdown["overcommit"]
        + weights.stall * breakdown["stall"]
        + weights.win_bonus * breakdown["win_bonus"]
    )
    return {"reward_total": float(total), "reward_breakdown": breakdown}
def _log_event(self, text: str):
self._last_events.append(f"Turn {self._turn}: {text}")