bytefight-policy / model_v2.py
Broyojo's picture
Upload model_v2.py with huggingface_hub
ed086b4 verified
"""
ByteFight Policy Model v2: 10-channel CNN + 6 scalars + Transformer decoder.
CNN channels (10, spatial):
0: my paint depth (0-4, normalized)
1: opp paint depth (0-4, normalized)
2: my beacon (0/1)
3: opp beacon (0/1)
4: wall (0/1, includes OOB)
5: hill (0/1)
6: powerup (0/1)
7: valid cell (1=in-bounds, not wall)
8: my position (binary mask)
9: opp position (binary mask)
Scalars (6, direct projection):
my_stamina, my_max_stamina, opp_stamina, opp_max_stamina, turn, board_area
Actions (21): 20 action types + EOS
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
NUM_ACTIONS = 21
EOS_ACTION = 20
BOARD_CHANNELS = 10
N_SCALARS = 6
MAX_BOARD = 31
@dataclass
class Config:
d_model: int = 256
n_layer: int = 6
n_head: int = 8
max_seq: int = 6000
n_actions: int = NUM_ACTIONS
board_channels: int = BOARD_CHANNELS
n_scalars: int = N_SCALARS
dropout: float = 0.1
class ResBlock(nn.Module):
def __init__(self, channels):
super().__init__()
self.conv1 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(channels)
self.conv2 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(channels)
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
return F.relu(out + x)
class BoardEncoder(nn.Module):
def __init__(self, cfg: Config):
super().__init__()
self.stem = nn.Sequential(
nn.Conv2d(cfg.board_channels, 64, 3, padding=1, bias=False),
nn.BatchNorm2d(64), nn.ReLU())
self.blocks = nn.Sequential(
ResBlock(64), ResBlock(64),
nn.Conv2d(64, 128, 3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(128), nn.ReLU(),
ResBlock(128), ResBlock(128))
self.pool = nn.AdaptiveAvgPool2d(1)
self.proj = nn.Linear(128, cfg.d_model)
self.scalar_proj = nn.Linear(cfg.n_scalars, cfg.d_model)
def forward(self, board_grid, scalars):
"""board_grid: (B, 10, H, W), scalars: (B, 6) → (B, d_model)"""
x = self.stem(board_grid)
x = self.blocks(x)
x = self.pool(x).flatten(1)
return self.proj(x) + self.scalar_proj(scalars)
class RMSNorm(nn.Module):
def __init__(self, d, eps=1e-6):
super().__init__()
self.w = nn.Parameter(torch.ones(d))
self.eps = eps
def forward(self, x):
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.w
class Attention(nn.Module):
def __init__(self, cfg):
super().__init__()
self.n_head = cfg.n_head
self.head_dim = cfg.d_model // cfg.n_head
self.qkv = nn.Linear(cfg.d_model, 3 * cfg.d_model, bias=False)
self.out = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
self.dropout = cfg.dropout
def forward(self, x, kv_cache=None):
B, T, C = x.shape
qkv = self.qkv(x).reshape(B, T, 3, self.n_head, self.head_dim)
q, k, v = qkv.unbind(2)
q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
if kv_cache is not None:
k_prev, v_prev = kv_cache
k = torch.cat([k_prev, k], dim=2)
v = torch.cat([v_prev, v], dim=2)
new_cache = (k, v)
x = F.scaled_dot_product_attention(q, k, v,
dropout_p=self.dropout if self.training else 0.0,
is_causal=(kv_cache is None))
return self.out(x.transpose(1, 2).reshape(B, T, C)), new_cache
class FFN(nn.Module):
def __init__(self, cfg):
super().__init__()
h = (int(cfg.d_model * 8 / 3) + 15) // 16 * 16
self.w1 = nn.Linear(cfg.d_model, h, bias=False)
self.w2 = nn.Linear(h, cfg.d_model, bias=False)
self.w3 = nn.Linear(cfg.d_model, h, bias=False)
def forward(self, x):
return self.w2(F.silu(self.w1(x)) * self.w3(x))
class Block(nn.Module):
def __init__(self, cfg):
super().__init__()
self.norm1 = RMSNorm(cfg.d_model)
self.attn = Attention(cfg)
self.norm2 = RMSNorm(cfg.d_model)
self.ffn = FFN(cfg)
def forward(self, x, kv_cache=None):
attn_out, new_cache = self.attn(self.norm1(x), kv_cache)
x = x + attn_out
x = x + self.ffn(self.norm2(x))
return x, new_cache
class PolicyModel(nn.Module):
"""16-channel CNN encoder + Transformer decoder."""
def __init__(self, cfg: Config = None):
super().__init__()
if cfg is None:
cfg = Config()
self.cfg = cfg
self.encoder = BoardEncoder(cfg)
self.action_embed = nn.Embedding(cfg.n_actions, cfg.d_model)
self.pos_embed = nn.Embedding(cfg.max_seq, cfg.d_model)
self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layer)])
self.norm = RMSNorm(cfg.d_model)
self.head = nn.Linear(cfg.d_model, cfg.n_actions, bias=False)
def forward(self, boards, scalars, seq_actions, seq_targets, seq_is_board, board_counts):
"""Training forward."""
B, T = seq_actions.shape
device = seq_actions.device
board_embs = self.encoder(boards, scalars) # (total_boards, d_model)
act_emb = self.action_embed(seq_actions)
seq_emb = act_emb.clone()
board_idx = 0
for b in range(B):
positions = seq_is_board[b].nonzero(as_tuple=True)[0]
n_boards = board_counts[b].item()
for i in range(n_boards):
if i < len(positions):
seq_emb[b, positions[i]] = board_embs[board_idx]
board_idx += 1
pos = self.pos_embed(torch.arange(T, device=device))
x = seq_emb + pos
for block in self.blocks:
x, _ = block(x)
logits = self.head(self.norm(x))
loss = F.cross_entropy(
logits.reshape(-1, self.cfg.n_actions),
seq_targets.reshape(-1),
ignore_index=-100)
return logits, loss
@torch.no_grad()
def generate(self, board_grid, scalars, kv_caches=None, seq_pos=0,
max_actions=10, temperature=0.0):
"""Generate actions with KV cache. board_grid: (1, 10, H, W), scalars: (1, 6)."""
self.eval()
device = board_grid.device
board_emb = self.encoder(board_grid, scalars).unsqueeze(1)
pos = self.pos_embed(torch.tensor([seq_pos], device=device)).unsqueeze(0)
x = board_emb + pos
if kv_caches is None:
kv_caches = [None] * len(self.blocks)
new_caches = []
for block, cache in zip(self.blocks, kv_caches):
x, new_cache = block(x, cache)
new_caches.append(new_cache)
kv_caches = new_caches
logits = self.head(self.norm(x))
next_logits = logits[0, -1]
seq_pos += 1
if temperature <= 0:
action = next_logits.argmax().item()
else:
action = torch.multinomial(
F.softmax(next_logits / temperature, dim=-1), 1).item()
actions = []
if action == EOS_ACTION:
return actions, kv_caches, seq_pos
actions.append(action)
for _ in range(max_actions - 1):
act_emb = self.action_embed(torch.tensor([[action]], device=device))
pos = self.pos_embed(torch.tensor([seq_pos], device=device)).unsqueeze(0)
x = act_emb + pos
new_caches = []
for block, cache in zip(self.blocks, kv_caches):
x, new_cache = block(x, cache)
new_caches.append(new_cache)
kv_caches = new_caches
logits = self.head(self.norm(x))
next_logits = logits[0, -1]
seq_pos += 1
if temperature <= 0:
action = next_logits.argmax().item()
else:
action = torch.multinomial(
F.softmax(next_logits / temperature, dim=-1), 1).item()
if action == EOS_ACTION:
break
actions.append(action)
return actions, kv_caches, seq_pos
def count_params(self):
return sum(p.numel() for p in self.parameters())
if __name__ == '__main__':
cfg = Config()
model = PolicyModel(cfg)
print(f"Total params: {model.count_params():,}")
for h, w in [(8, 8), (15, 15), (31, 31)]:
board = torch.randn(2, BOARD_CHANNELS, h, w)
s = torch.randn(2, N_SCALARS)
emb = model.encoder(board, s)
print(f" Board {h}x{w} -> {emb.shape}")
kv = None
pos = 0
for turn in range(5):
board = torch.randn(1, BOARD_CHANNELS, 15, 15)
s = torch.randn(1, N_SCALARS)
acts, kv, pos = model.generate(board, s, kv_caches=kv, seq_pos=pos)
cache_size = kv[0][0].shape[2]
print(f" Turn {turn}: actions={acts}, seq_pos={pos}, cache_len={cache_size}")