Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import math | |
| import random | |
| from dataclasses import dataclass, field | |
| from typing import Literal, TypedDict | |
| from .arena_geometry import Arena | |
| from .card_catalog import load_cards | |
| from .reward_config import load_reward_config | |
# Abstract placement zones: a depth ("bridge", "back" or "mid") combined with
# a lane ("left" or "right").
Zone = Literal[
    "bridge_left", "bridge_right",
    "back_left", "back_right",
    "mid_left", "mid_right",
]

# Names of the emotes referenced by the simulator.
Emote = Literal["laugh", "yawn", "cry", "thanks", "chicken", "wp"]
class TowerHP(TypedDict):
    """Hit points of one side's towers, keyed by "left", "right" and "king"."""

    left: int
    right: int
    king: int
@dataclass
class Unit:
    """A troop or building currently on the board.

    The class must be a dataclass: the simulator builds units with keyword
    arguments (``Unit(owner=..., card=..., ...)``), which a plain class with
    bare annotations would reject with ``TypeError``.
    """

    owner: Literal["me", "opp"]  # which side deployed the unit
    card: str  # key of the card that spawned it
    hp: float  # remaining hit points; the unit is removed once this is <= 0
    dps: float  # damage per second (scaled per tick by the simulator)
    targets: Literal["ground", "air", "both", "towers"]
    zone: Zone  # zone it was placed in; its lane is derived from this
    is_air: bool
    stage: int  # 0=back, 1=mid, 2=bridge
    splash: bool = False
    radius: float = 0.0
@dataclass
class CardDef:
    """Simplified, simulator-facing definition of a card.

    The class must be a dataclass: definitions are created with keyword
    arguments and rely on the field defaults below, which a plain class with
    bare annotations would not provide.
    """

    key: str
    name: str
    cost: int  # elixir cost
    kind: Literal["troop", "spell", "building"]
    # simplified combat stats (we keep them stable + fast; can be expanded later)
    hp: int = 0
    dps: int = 0
    targets: Literal["ground", "air", "both", "towers"] = "ground"
    is_air: bool = False
    spell_damage: int = 0
    splash: bool = False
    radius: float = 0.0
    description: str | None = None
def _infer_carddef(card_key: str, catalog: dict) -> CardDef:
    """
    Build the simulator's ``CardDef`` for ``card_key`` from a catalog record.

    The record is read through attributes (``type``, ``targets``,
    ``hitpoints``, ``damage_per_second``, ``damage``, ``hit_speed``,
    ``elixir``, ``move_speed``, ``name``, ``description``). Missing combat
    values are inferred and every stat is clamped to a fixed range.

    Raises:
        KeyError: if ``card_key`` is not present in ``catalog``.
    """
    record = catalog.get(card_key)
    if record is None:
        raise KeyError(card_key)

    # Anything that is not explicitly a building or a spell is treated as a troop.
    kind: Literal["troop", "spell", "building"] = {
        "Troop": "troop",
        "Building": "building",
        "Spell": "spell",
    }.get(record.type, "troop")
    is_spell = kind == "spell"

    # Coarse targeting: "Buildings" wins over "Air"; default is ground-only.
    raw_targets = record.targets
    if raw_targets and "Buildings" in raw_targets:
        targets = "towers"
    elif raw_targets and "Air" in raw_targets:
        targets = "both"
    else:
        targets = "ground"

    hit_points = int(record.hitpoints or 0)
    dps = int(record.damage_per_second or 0)
    hit_damage = int(record.damage or 0)
    # Derive DPS from per-hit damage when the record carries no DPS figure.
    if dps <= 0 and hit_damage > 0 and record.hit_speed:
        dps = int(hit_damage / max(0.1, float(record.hit_speed)))

    if is_spell:
        # Spells have no body; their damage falls back to 120 per elixir.
        hit_points = 0
        dps = 0
        spell_damage = int(record.damage or (record.elixir * 120))
        spell_damage = max(40, min(spell_damage, 1600))
    else:
        hit_points = max(1, min(hit_points, 6000))
        dps = max(0, min(dps, 800))
        spell_damage = 0

    airborne = bool(record.move_speed and "Air" in record.move_speed)  # imperfect

    # Only the two area spells of the training pool get a splash radius.
    spell_radii = {"fireball": 0.35, "arrows": 0.45}
    splash = card_key in spell_radii
    radius = spell_radii.get(card_key, 0.0)

    # The giant only ever targets buildings/towers.
    if card_key == "giant":
        targets = "towers"

    return CardDef(
        key=card_key,
        name=record.name,
        cost=int(record.elixir),
        kind=kind,
        hp=hit_points,
        dps=dps,
        targets=targets,  # type: ignore[arg-type]
        is_air=airborne,
        spell_damage=spell_damage,
        splash=splash,
        radius=radius,
        description=record.description,
    )
# Every zone a card can be placed in; the scripted opponent picks from this
# tuple at random, so the order is kept stable.
PLACEMENT_ZONES: tuple[Zone, ...] = (
    "bridge_left", "bridge_right",
    "back_left", "back_right",
    "mid_left", "mid_right",
)
| def _zone_lane(zone: Zone) -> Literal["left", "right"]: | |
| return "left" if zone.endswith("_left") else "right" | |
| def _is_air_zone(zone: Zone) -> bool: | |
| return False | |
| def _zone_stage(zone: Zone) -> int: | |
| if zone.startswith("back_"): | |
| return 0 | |
| if zone.startswith("mid_"): | |
| return 1 | |
| return 2 | |
@dataclass
class GameState:
    """Snapshot of the match handed to the agent.

    The class must be a dataclass: the simulator builds it with keyword
    arguments, and ``field(default_factory=list)`` only produces per-instance
    lists when the dataclass machinery processes the fields.
    """

    turn: int
    time_remaining_s: float
    double_elixir: bool
    my_elixir: float
    opp_elixir_estimate: float
    my_tower_hp: TowerHP
    opp_tower_hp: TowerHP
    my_hand: list[str]
    my_next_card: str
    opp_hand_estimate: list[str]
    my_units: list[Unit] = field(default_factory=list)
    opp_units: list[Unit] = field(default_factory=list)
    opp_tilt_meter: float = 0.0
    my_crowns: int = 0
    opp_crowns: int = 0
    invalid_action_last: bool = False  # whether the agent's latest action was rejected
    invalid_action_count: int = 0  # rejected agent actions so far this episode
    done: bool = False
class ToxicRoyaleSim:
    """
    Lightweight, fast text simulator.

    Design goals for hackathon:
    - deterministic-ish (seedable) and fast enough for many rollouts
    - objective, non-zero rewards early
    - supports "tilt meter" dynamics for novelty

    One call to :meth:`step` applies the agent's action, lets the scripted
    opponent act, advances the match by one 0.5 s tick and returns the reward.
    """

    def __init__(self, seed: int | None = None):
        """Create a simulator whose randomness is driven by ``seed`` and reset it."""
        self._rng = random.Random(seed)
        self._seed = seed
        self._last_events: list[str] = []
        self._invalid_last = False
        self._arena = Arena()
        self.reset()

    # -------------------------
    # Geometry / placement
    # -------------------------
    def _zone_point(self, owner: Literal["me", "opp"], zone: Zone) -> tuple[float, float]:
        """
        Map an abstract placement zone to a normalized arena (x, y).

        Convention:
        - y grows from opponent side (0) to my side (1)
        - zone definitions are expressed from *my* perspective, so y is
          mirrored for opponent placements.
        """
        x = 0.33 if zone.endswith("_left") else 0.67
        if zone.startswith("back_"):
            y = 0.80
        elif zone.startswith("mid_"):
            y = 0.66
        else:
            y = 0.52
        if owner == "opp":
            y = 1.0 - y
        return x, y

    def _placement_is_valid(self, owner: Literal["me", "opp"], card: CardDef, zone: Zone) -> bool:
        """Return True for spells (placeable anywhere); otherwise defer to the arena."""
        if card.kind == "spell":
            return True
        x, y = self._zone_point(owner, zone)
        return self._arena.can_place_troop(owner, x, y)

    # -------------------------
    # Public API
    # -------------------------
    def reset(self) -> GameState:
        """Start a new match and return its initial state."""
        self._last_events = []
        self._turn = 0
        # 3:00 of regulation; overtime is added in _tick() if crowns are tied.
        self._time_remaining_s = 180.0
        self._in_overtime = False
        self._my_elixir = 5.0
        self._opp_elixir = 5.0
        self._my_tower_hp: TowerHP = {"left": 1400, "right": 1400, "king": 2400}
        self._opp_tower_hp: TowerHP = {"left": 1400, "right": 1400, "king": 2400}

        # Load the card catalog once per sim instance.
        if not hasattr(self, "_catalog"):
            self._catalog = load_cards()

        # Training pool: kept small; keys are kebab-case (e.g. "mini-pekka").
        self._training_pool = [
            "giant",
            "knight",
            "minions",
            "archers",
            "fireball",
            "arrows",
            "musketeer",
            "mini-pekka",
        ]
        self._cards: dict[str, CardDef] = {
            key: _infer_carddef(key, self._catalog) for key in self._training_pool
        }
        self._cards["wait"] = CardDef(key="wait", name="Wait", cost=0, kind="spell", spell_damage=0)

        # Both sides play the same pool: 4 cards in hand, 1 "next", rest queued.
        self._my_deck = list(self._training_pool)
        self._opp_deck = list(self._training_pool)
        self._rng.shuffle(self._my_deck)
        self._rng.shuffle(self._opp_deck)
        self._my_hand = [self._my_deck.pop() for _ in range(4)]
        self._my_next = self._my_deck.pop()
        self._opp_hand = [self._opp_deck.pop() for _ in range(4)]
        self._opp_next = self._opp_deck.pop()

        self._my_units: list[Unit] = []
        self._opp_units: list[Unit] = []
        self._opp_tilt = 0.0
        self._my_crowns = 0
        self._opp_crowns = 0
        self._done = False
        self._invalid_last = False
        self._invalid_count = 0

        # Per-step bookkeeping consumed by the reward computation.
        self._my_spent_last = 0.0
        self._opp_spent_last = 0.0
        self._my_wait_last = False
        self._opp_wait_last = False
        self._last_spell_hits = 0
        self._last_spell_tower_dmg = 0
        self._punish_window = 0

        self._log_event("Game start.")
        return self.state()

    def state(self) -> GameState:
        """Return a snapshot of the match (containers are shallow-copied)."""
        return GameState(
            turn=self._turn,
            time_remaining_s=self._time_remaining_s,
            double_elixir=self._is_double_elixir(),
            my_elixir=self._my_elixir,
            opp_elixir_estimate=self._opp_elixir,
            my_tower_hp=dict(self._my_tower_hp),
            opp_tower_hp=dict(self._opp_tower_hp),
            my_hand=list(self._my_hand),
            my_next_card=self._my_next,
            opp_hand_estimate=list(self._opp_hand),
            my_units=list(self._my_units),
            opp_units=list(self._opp_units),
            opp_tilt_meter=self._opp_tilt,
            my_crowns=self._my_crowns,
            opp_crowns=self._opp_crowns,
            invalid_action_last=self._invalid_last,
            invalid_action_count=self._invalid_count,
            done=self._done,
        )

    def last_events(self, k: int = 6) -> list[str]:
        """Return the ``k`` most recent log lines (an empty list when ``k <= 0``)."""
        # A bare ``[-k:]`` would return the *whole* log for k == 0.
        if k <= 0:
            return []
        return self._last_events[-k:]

    def step(self, *, kind: Literal["play", "wait"], card: str | None, zone: Zone | None, emote: str | None) -> dict:
        """
        Advance one tick (0.5s) with (agent action + scripted opponent response).

        Returns: a dict with ``reward_total`` and ``reward_breakdown``. Once
        the match is over, a zero reward is returned and nothing advances.
        """
        if self._done:
            return {"reward_total": 0.0, "reward_breakdown": {"already_done": 0.0}}

        # Taken before the per-step flags are cleared, so the "*_last" entries
        # of this snapshot still describe the previous step.
        before = self._snapshot_score()

        self._invalid_last = False
        self._my_spent_last = 0.0
        self._opp_spent_last = 0.0
        self._my_wait_last = False
        self._opp_wait_last = False
        self._last_spell_hits = 0
        self._last_spell_tower_dmg = 0
        self._punish_window = max(0, int(self._punish_window) - 1)

        # --- apply agent action ---
        self._apply_player_action(owner="me", kind=kind, card=card, zone=zone)
        self._update_tilt(emote=emote)
        if self._invalid_last:
            self._invalid_count += 1

        # --- apply opponent action (scripted, tilt-affected) ---
        self._scripted_opponent_action()
        # If opponent just made a big investment, you have a short window to punish.
        if float(self._opp_spent_last) >= 7.0:
            self._punish_window = 4  # ~2 seconds (4 ticks)

        # --- tick combat + elixir regen ---
        self._tick()

        after = self._snapshot_score()
        return self._compute_rewards(before, after, emote=emote, invalid_action=self._invalid_last)

    # -------------------------
    # Internal mechanics
    # -------------------------
    def _is_double_elixir(self) -> bool:
        """Double elixir applies in the last 60 s of the clock and throughout overtime."""
        return self._time_remaining_s <= 60.0 or bool(self._in_overtime)

    @staticmethod
    def _total_tower_hp(towers: TowerHP) -> int:
        """Sum the hit points of the left, right and king towers."""
        return towers["left"] + towers["right"] + towers["king"]

    def _reject(self, owner: Literal["me", "opp"], message: str) -> None:
        """Log a refused action; only the agent's ("me") refusals are flagged invalid."""
        self._log_event(message)
        if owner == "me":
            self._invalid_last = True

    def _cycle_hand(self, owner: Literal["me", "opp"], card: str) -> None:
        """
        Move ``card`` out of ``owner``'s hand and draw the next card.

        The played card goes to the back of the owner's queue, so the deck
        keeps cycling instead of running dry and forcing duplicate draws.
        """
        mine = owner == "me"
        hand = self._my_hand if mine else self._opp_hand
        deck = self._my_deck if mine else self._opp_deck
        upcoming = self._my_next if mine else self._opp_next

        hand.remove(card)
        hand.append(upcoming)
        # pop() draws from the end of the list, so index 0 is the back of the queue.
        deck.insert(0, card)
        drawn = deck.pop()
        if mine:
            self._my_next = drawn
        else:
            self._opp_next = drawn

    def _apply_player_action(self, *, owner: Literal["me", "opp"], kind: Literal["play", "wait"], card: str | None, zone: Zone | None):
        """
        Validate and apply one action for ``owner``.

        A rejected action (missing card/zone, card not in hand, unknown card,
        not enough elixir, illegal placement) is logged and changes nothing
        else; for the agent it also sets the invalid-action flag.
        """
        if kind == "wait":
            self._log_event(f"{owner} waits.")
            if owner == "me":
                self._my_wait_last = True
            else:
                self._opp_wait_last = True
            return

        if card is None or zone is None:
            self._reject(owner, f"{owner} attempted invalid play (missing card/zone).")
            return

        hand = self._my_hand if owner == "me" else self._opp_hand
        if card not in hand:
            self._reject(owner, f"{owner} attempted to play {card} not in hand.")
            return

        cdef = self._cards.get(card)
        if cdef is None or card == "wait":
            self._reject(owner, f"{owner} attempted unknown card '{card}'.")
            return

        elixir = self._my_elixir if owner == "me" else self._opp_elixir
        # Small tolerance so float regen rounding never blocks an affordable card.
        if elixir + 1e-6 < cdef.cost:
            self._reject(owner, f"{owner} tried to overspend elixir on {card} (cost {cdef.cost}).")
            return

        if not self._placement_is_valid(owner, cdef, zone):
            self._reject(owner, f"{owner} attempted illegal placement: {card} at {zone}.")
            return

        # spend
        if owner == "me":
            self._my_elixir -= cdef.cost
            self._my_spent_last = float(cdef.cost)
        else:
            self._opp_elixir -= cdef.cost
            self._opp_spent_last = float(cdef.cost)

        self._cycle_hand(owner, card)

        if cdef.kind == "spell":
            self._apply_spell(owner=owner, card=card, zone=zone, dmg=cdef.spell_damage)
            return

        # troops/buildings become units
        unit = Unit(
            owner=owner,
            card=card,
            hp=float(cdef.hp),
            dps=float(cdef.dps),
            targets=cdef.targets,
            zone=zone,
            is_air=cdef.is_air or _is_air_zone(zone),
            stage=_zone_stage(zone),
        )
        (self._my_units if owner == "me" else self._opp_units).append(unit)
        self._log_event(f"{owner} played {card} at {zone} (cost {cdef.cost}).")

    def _apply_spell(self, *, owner: Literal["me", "opp"], card: str, zone: Zone, dmg: int):
        """Resolve a spell cast by ``owner`` at ``zone``: area damage to units, then tower chip."""
        lane = _zone_lane(zone)
        target_towers = self._opp_tower_hp if owner == "me" else self._my_tower_hp
        target_units = self._opp_units if owner == "me" else self._my_units
        cdef = self._cards.get(card)
        radius = float(getattr(cdef, "radius", 0.0) or 0.0)

        # --- AoE: hit up to N units in the lane (front-most first) ---
        if radius > 0:
            lane_units = [u for u in target_units if _zone_lane(u.zone) == lane]
            # Larger radius -> more units affected, capped.
            max_hits = 3 if radius < 0.40 else 5
            lane_units.sort(key=lambda u: (u.stage, -u.hp), reverse=True)
            hits = lane_units[:max_hits]
            if hits:
                for u in hits:
                    u.hp -= float(dmg)
                self._log_event(f"{owner} cast {card} hitting {len(hits)} units in {lane} for {dmg}.")
                self._last_spell_hits = max(self._last_spell_hits, len(hits))

        # --- Tower chip (spells can hit towers when targeted near them) ---
        # Bridge zones deal full damage, mid zones 60%, back zones none.
        if zone.startswith("bridge_"):
            tower_mult = 1.0
        elif zone.startswith("mid_"):
            tower_mult = 0.6
        else:
            tower_mult = 0.0
        tower_dmg = int(dmg * tower_mult)
        if tower_dmg > 0:
            if target_towers[lane] > 0:
                target_towers[lane] = max(0, target_towers[lane] - tower_dmg)
                self._log_event(f"{owner} cast {card} chipping {lane} tower for {tower_dmg}.")
            else:
                # Lane tower already down: the king tower takes half instead.
                target_towers["king"] = max(0, target_towers["king"] - int(tower_dmg * 0.5))
                self._log_event(f"{owner} cast {card} chipping king tower for {int(tower_dmg*0.5)}.")
            self._last_spell_tower_dmg = max(self._last_spell_tower_dmg, tower_dmg)

    def _tick(self):
        """Advance the clock by one turn (0.5 s): overtime, elixir regen, combat, end check."""
        self._turn += 1
        self._time_remaining_s = max(0.0, self._time_remaining_s - 0.5)

        # Overtime: if regulation ends tied, give 60s overtime (double elixir).
        regulation_over = self._time_remaining_s <= 0.0 and not self._in_overtime
        if regulation_over and self._my_crowns == self._opp_crowns:
            self._in_overtime = True
            self._time_remaining_s = 60.0
            self._log_event("Overtime!")

        regen = 0.7 if self._is_double_elixir() else 0.35
        self._my_elixir = min(10.0, self._my_elixir + regen)
        self._opp_elixir = min(10.0, self._opp_elixir + regen)

        # combat: lane skirmishes (units fight units first, then towers)
        self._lane_skirmish(lane="left")
        self._lane_skirmish(lane="right")
        self._cleanup_dead()
        self._update_crowns_and_done()

    def _pressure_tower(self, units: list[Unit], towers: TowerHP, lane: Literal["left", "right"]) -> None:
        """Let every unit that reached the bridge damage the lane tower (or the king once it fell)."""
        dps = sum(u.dps for u in units if u.stage >= 2)
        if dps <= 0:
            return
        if towers[lane] > 0:
            towers[lane] = max(0, towers[lane] - int(dps * 0.45))
        else:
            towers["king"] = max(0, towers["king"] - int(dps * 0.20))

    def _lane_skirmish(self, *, lane: Literal["left", "right"]):
        """
        Very simplified Clash-like combat for one lane:
        - units have a lane and a stage (back/mid/bridge)
        - troops advance one stage per tick while the enemy has no unit in the lane
        - if both sides have units, only the two front-most units trade damage
        - otherwise units at the bridge hit the lane tower (else the king)
        """
        my_units = [u for u in self._my_units if _zone_lane(u.zone) == lane]
        opp_units = [u for u in self._opp_units if _zone_lane(u.zone) == lane]

        # Advance troops of an unopposed side; buildings don't advance.
        for movers, blockers in ((my_units, opp_units), (opp_units, my_units)):
            if blockers:
                continue
            for u in movers:
                if self._cards[u.card].kind == "troop":
                    u.stage = min(2, u.stage + 1)

        # If both have units, they fight: front-most units exchange damage.
        if my_units and opp_units:
            my_front = max(my_units, key=lambda u: (u.stage, -u.hp))
            opp_front = max(opp_units, key=lambda u: (u.stage, -u.hp))
            # Tower-only attackers (e.g. giant) never damage enemy units.
            if my_front.targets != "towers":
                opp_front.hp -= max(0.0, my_front.dps) * 0.5
            if opp_front.targets != "towers":
                my_front.hp -= max(0.0, opp_front.dps) * 0.5
            return

        # Otherwise, any units at bridge pressure the tower.
        if my_units:
            self._pressure_tower(my_units, self._opp_tower_hp, lane)
        if opp_units:
            self._pressure_tower(opp_units, self._my_tower_hp, lane)

    def _cleanup_dead(self):
        """Drop every unit whose hit points are no longer positive."""
        self._my_units = [u for u in self._my_units if u.hp > 0]
        self._opp_units = [u for u in self._opp_units if u.hp > 0]

    def _update_crowns_and_done(self):
        """Recount crowns (one per tower at 0 HP) and end the match on 3 crowns or time-out."""
        # NOTE(review): a destroyed king tower counts as a single crown here, so
        # the match can continue with the king at 0 HP — confirm this is intended.
        self._my_crowns = sum(int(self._opp_tower_hp[t] == 0) for t in ("left", "right", "king"))
        self._opp_crowns = sum(int(self._my_tower_hp[t] == 0) for t in ("left", "right", "king"))
        if self._my_crowns >= 3 or self._opp_crowns >= 3 or self._time_remaining_s <= 0.0:
            self._done = True

    def _scripted_opponent_action(self):
        """Pick and apply the opponent's action; tilt raises the chance of a random card."""
        playable = [c for c in self._opp_hand if self._cards[c].cost <= self._opp_elixir]
        if not playable:
            self._apply_player_action(owner="opp", kind="wait", card=None, zone=None)
            return

        if self._rng.random() < self._opp_card_noise():
            # "tilt": pick a random playable (might be expensive)
            card = self._rng.choice(playable)
        else:
            # baseline: pick cheapest playable (first one on ties)
            card = min(playable, key=lambda c: self._cards[c].cost)

        # The zone is always random, so some placements get rejected as illegal.
        zone = self._rng.choice(PLACEMENT_ZONES)
        self._apply_player_action(owner="opp", kind="play", card=card, zone=zone)

    def _update_tilt(self, *, emote: str | None):
        """Decay the opponent's tilt, then raise it if the agent emoted this step."""
        self._opp_tilt = max(0.0, min(1.0, self._opp_tilt - 0.03))
        if emote is None:
            return

        # Emotes land harder the further ahead the agent is in total tower HP.
        my_adv = self._tower_hp_advantage()
        timing_mult = 1.0 + min(1.0, max(0.0, my_adv / 1500.0))  # up to 2x
        base = 0.02
        if my_adv > 400:
            base = 0.08  # "dominance"
        if emote == "laugh" and my_adv > 800:
            base = 0.12
        self._opp_tilt = max(0.0, min(1.0, self._opp_tilt + base * timing_mult))

    def _opp_card_noise(self) -> float:
        """Probability that the opponent plays a random playable card, stepped by tilt."""
        if self._opp_tilt < 0.3:
            return 0.05
        if self._opp_tilt < 0.6:
            return 0.20
        if self._opp_tilt < 0.9:
            return 0.40
        return 0.60

    def _tower_hp_advantage(self) -> float:
        """Return my total tower HP minus the opponent's: positive when I am ahead."""
        mine = self._total_tower_hp(self._my_tower_hp)
        theirs = self._total_tower_hp(self._opp_tower_hp)
        return float(mine - theirs)

    def _snapshot_score(self) -> dict:
        """Capture every value the reward computation reads, as a plain dict."""
        return {
            "my_total_tower_hp": self._total_tower_hp(self._my_tower_hp),
            "opp_total_tower_hp": self._total_tower_hp(self._opp_tower_hp),
            "my_crowns": self._my_crowns,
            "opp_crowns": self._opp_crowns,
            "opp_tilt": self._opp_tilt,
            "my_elixir": self._my_elixir,
            "invalid_last": self._invalid_last,
            "my_spent_last": float(self._my_spent_last),
            "opp_spent_last": float(self._opp_spent_last),
            "my_wait_last": bool(self._my_wait_last),
            "spell_hits": int(self._last_spell_hits),
            "spell_tower_dmg": int(self._last_spell_tower_dmg),
            "punish_window": int(self._punish_window),
            # Read by the invalid-rate penalty and the terminal win bonus.
            "invalid_action_count": int(self._invalid_count),
            "turn": int(self._turn),
            "done": bool(self._done),
        }

    def _compute_rewards(self, before: dict, after: dict, *, emote: str | None, invalid_action: bool) -> dict:
        """
        Turn two score snapshots into a weighted reward.

        ``before`` / ``after`` are the ``_snapshot_score()`` dicts taken at the
        start and end of a step. Returns ``{"reward_total": float,
        "reward_breakdown": dict}``; weights and thresholds come from
        ``load_reward_config()``.
        """
        # NOTE(review): ``before`` is captured before step() clears the per-step
        # flags, so its "*_last", "spell_*" and "punish_window" entries describe
        # the previous step's action — confirm that one-step delay is intended.
        weights, params = load_reward_config()

        def crown_score(crowns: int) -> float:
            return 4.9 * math.log(4.8 * crowns + 0.75) + 1.4

        # 1) Crown differential (log scaling, normalized to [-1, 1])
        r_crowns = crown_score(after["my_crowns"]) - crown_score(after["opp_crowns"])
        r_crowns_norm = float(max(-15.0, min(15.0, r_crowns)) / 15.0)

        # 2) Tower damage (dense)
        dmg_dealt = before["opp_total_tower_hp"] - after["opp_total_tower_hp"]
        dmg_taken = before["my_total_tower_hp"] - after["my_total_tower_hp"]
        r_tower = (dmg_dealt - 0.8 * dmg_taken) / 1200.0  # scale to ~[-1,1] typical

        elixir_before = float(before.get("my_elixir", 0.0))
        waited = bool(before.get("my_wait_last"))
        high_elixir = elixir_before >= params.tempo_elixir_threshold

        # 3) Elixir discipline (penalize floating at full elixir)
        r_elixir = 0.0
        if before["my_elixir"] >= params.full_elixir_threshold:
            r_elixir = params.full_elixir_penalty

        # 4) Emotes / tilt are NOT part of the reward.
        # They exist only as a novelty mechanic affecting opponent behavior.
        r_tilt = 0.0

        # 5) Anti-stall: penalize waiting, more so when floating high elixir.
        r_stall = 0.0
        if waited:
            if high_elixir:
                r_stall = params.stall_wait_penalty_high_elixir
            else:
                r_stall = params.stall_wait_penalty_low_elixir

        # 6) Invalid action penalty
        r_invalid = params.invalid_penalty if invalid_action else 0.0

        # 7) Tempo: discourage waiting when you have plenty elixir.
        r_tempo = params.tempo_wait_penalty if (waited and high_elixir) else 0.0

        # 8) Spell value: reward multi-hit spells and tower chip.
        r_spell = 0.0
        hits = int(before.get("spell_hits", 0))
        tower_chip = int(before.get("spell_tower_dmg", 0))
        if hits >= 2:
            r_spell += 0.02 * min(5, hits)
        if tower_chip > 0:
            r_spell += min(0.05, tower_chip / 6000.0)

        # 9) Elixir efficiency: reward dealing damage with low spend.
        spent = float(before.get("my_spent_last", 0.0))
        if spent > 0:
            r_eff = max(0.0, (dmg_dealt - 0.5 * dmg_taken) / (800.0 * spent))
        else:
            r_eff = 0.0

        # 10) Punish window: reward spending while the opponent is overcommitted.
        r_punish = 0.0
        if int(before.get("punish_window", 0)) > 0 and spent > 0.0:
            r_punish = params.punish_spend_reward

        # 11) Overcommit penalty: spending down to (near) zero elixir without dealing damage.
        r_overcommit = 0.0
        if spent > 0.0 and elixir_before <= params.overcommit_elixir_threshold and dmg_dealt <= 0:
            r_overcommit = params.overcommit_penalty

        # 12) Invalid rate penalty: invalid actions per elapsed turn, as of the
        # start of this step (both keys come from the snapshot).
        inv_count = int(before.get("invalid_action_count", 0))
        steps = max(1, int(before.get("turn", 1)))
        r_invalid_rate = params.invalid_rate_penalty * (inv_count / steps)

        # 13) Win bonus at terminal: +1 for more crowns, -1 for fewer, 0 on a tie.
        r_win = 0.0
        if bool(after.get("done")):
            my_c = int(after.get("my_crowns", 0))
            opp_c = int(after.get("opp_crowns", 0))
            if my_c > opp_c:
                r_win = 1.0
            elif my_c < opp_c:
                r_win = -1.0

        breakdown = {
            "crown_differential": r_crowns_norm,
            "tower_damage": float(r_tower),
            "elixir_discipline": float(r_elixir),
            "tilt_efficiency": float(r_tilt),
            "stall": float(r_stall),
            "invalid_action": float(r_invalid),
            "invalid_rate": float(r_invalid_rate),
            "tempo": float(r_tempo),
            "spell_value": float(r_spell),
            "elixir_efficiency": float(r_eff),
            "punish_window": float(r_punish),
            "overcommit": float(r_overcommit),
            "win_bonus": float(r_win),
        }
        # "tilt_efficiency" is reported but deliberately carries no weight.
        total = (
            weights.crown_differential * breakdown["crown_differential"]
            + weights.tower_damage * breakdown["tower_damage"]
            + weights.elixir_discipline * breakdown["elixir_discipline"]
            + weights.invalid_action * breakdown["invalid_action"]
            + weights.invalid_rate * breakdown["invalid_rate"]
            + weights.tempo * breakdown["tempo"]
            + weights.spell_value * breakdown["spell_value"]
            + weights.elixir_efficiency * breakdown["elixir_efficiency"]
            + weights.punish_window * breakdown["punish_window"]
            + weights.overcommit * breakdown["overcommit"]
            + weights.stall * breakdown["stall"]
            + weights.win_bonus * breakdown["win_bonus"]
        )
        return {"reward_total": float(total), "reward_breakdown": breakdown}

    def _log_event(self, text: str):
        """Append ``text`` to the event log, prefixed with the current turn number."""
        self._last_events.append(f"Turn {self._turn}: {text}")