| import numpy as np |
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
|
|
|
|
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm stages wrapped by an identity skip connection.

    Both convolutions keep the channel count and (via ``padding=1``) the
    spatial size, so the block's output has the same shape as its input.
    """

    def __init__(self, channels):
        """Create the two conv/batch-norm pairs operating on ``channels`` planes."""
        super().__init__()
        # Shared settings: bias is omitted because each conv is followed by a
        # BatchNorm2d layer, which carries its own additive term.
        conv_kwargs = dict(kernel_size=3, padding=1, bias=False)
        self.conv1 = nn.Conv2d(channels, channels, **conv_kwargs)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, **conv_kwargs)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + x)``."""
        out = self.conv1(x)
        out = self.bn1(out)
        out = F.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        # Skip connection: add the untouched input before the final activation.
        out = out + x
        return F.relu(out)
|
|
|
|
class UltimateTicTacToeModel(nn.Module):
    """Residual convolutional network with a policy head and a value head.

    The network is a conv stem, a tower of ``ResidualBlock`` modules and two
    heads that share the tower's output:

    * policy head -> softmax distribution over ``action_size`` entries
    * value head  -> single ``tanh``-squashed scalar in ``[-1, 1]``
    """

    def __init__(self, board_size, action_size, device, channels=64, num_blocks=6):
        """Build the network and move it to ``device``.

        Args:
            board_size: Sequence ``(input_channels, height, width)`` describing
                one encoded board.
            action_size: Number of outputs of the policy head.
            device: Anything accepted by ``torch.device`` (e.g. ``"cpu"``).
            channels: Number of feature planes in the stem and residual tower.
            num_blocks: Number of residual blocks in the tower.

        Raises:
            ValueError: If ``board_size`` does not have exactly three entries.
        """
        super().__init__()

        if len(board_size) != 3:
            raise ValueError(
                "board_size must be (channels, height, width), got "
                f"{tuple(board_size)!r}"
            )

        self.action_size = action_size
        self.input_shape = board_size
        self.input_channels = board_size[0]
        self.board_height = board_size[1]
        self.board_width = board_size[2]
        # Device requested at construction time. The module may be moved
        # afterwards, so predict() does not rely on this attribute.
        self.device = torch.device(device)

        self.stem = nn.Sequential(
            nn.Conv2d(self.input_channels, channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(channels),
            nn.ReLU(inplace=True),
        )
        self.residual_tower = nn.Sequential(
            *(ResidualBlock(channels) for _ in range(num_blocks))
        )

        # Both heads first reduce to 32 planes with a 1x1 convolution; spatial
        # size is unchanged, hence the 32 * H * W input width of the linear layers.
        head_features = 32 * self.board_height * self.board_width

        self.policy_head = nn.Sequential(
            nn.Conv2d(channels, 32, kernel_size=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
        )
        self.policy_fc = nn.Linear(head_features, self.action_size)

        self.value_head = nn.Sequential(
            nn.Conv2d(channels, 32, kernel_size=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
        )
        self.value_fc1 = nn.Linear(head_features, 128)
        self.value_fc2 = nn.Linear(128, 1)

        self.to(self.device)

    def forward(self, x):
        """Run the network on a batch of boards.

        Args:
            x: Tensor whose elements can be arranged as
                ``(-1, *board_size)``; it is reshaped to that layout first.

        Returns:
            Tuple ``(policy, value)`` where ``policy`` has shape
            ``(batch, action_size)`` and sums to 1 along dim 1 (softmax), and
            ``value`` has shape ``(batch, 1)`` with entries in ``[-1, 1]`` (tanh).
        """
        # reshape (not view) so inputs with non-mergeable strides, such as a
        # sliced or permuted batch, are copied instead of raising.
        x = x.reshape(-1, *self.input_shape)
        x = self.stem(x)
        x = self.residual_tower(x)

        policy = self.policy_head(x)
        policy = torch.flatten(policy, 1)
        policy = self.policy_fc(policy)

        value = self.value_head(x)
        value = torch.flatten(value, 1)
        value = F.relu(self.value_fc1(value))
        value = torch.tanh(self.value_fc2(value))

        return F.softmax(policy, dim=1), value

    def predict(self, board):
        """Evaluate a single board without tracking gradients.

        Note: this switches the module to eval mode and leaves it there;
        callers that train afterwards must call ``train()`` themselves.

        Args:
            board: Array-like holding one board with ``prod(board_size)``
                elements; converted to a float32 tensor.

        Returns:
            Tuple ``(pi, v)``: ``pi`` is a NumPy array of shape
            ``(action_size,)`` with the softmax policy, ``v`` is a Python float.
        """
        # Use the parameters' actual device: self.device goes stale if the
        # module was moved with .to()/.cuda() after construction.
        device = next(self.parameters()).device
        board = torch.as_tensor(board, dtype=torch.float32, device=device)
        board = board.reshape(1, *self.input_shape)

        self.eval()
        with torch.no_grad():
            # Call the module (not .forward) so registered hooks still run.
            pi, v = self(board)

        return pi.cpu().numpy()[0], v.item()
|
|