Upload ai/models/network.py with huggingface_hub
ai/models/network.py  ADDED  (+480 -0)
@@ -0,0 +1,480 @@
"""
Neural network for AlphaZero-style training.

This module provides a simple neural network architecture for policy and value
prediction. For a production system, you would use a more sophisticated
architecture (e.g., a ResNet with attention) and train on GPU with PyTorch/TensorFlow.
"""

from dataclasses import dataclass
from typing import Optional, Tuple

import numpy as np


@dataclass
class NetworkConfig:
    """Configuration for the AlphaZero network."""

    input_size: int = 800  # Observation vector size: feature-based encoding,
    # 32 floats per card slot (matches GameState.get_observation)
    hidden_size: int = 256
    num_hidden_layers: int = 3
    action_size: int = 1000  # Action space size (matches GameState.get_legal_actions)
    learning_rate: float = 0.001
    l2_reg: float = 0.0001

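# Editor's sketch: NetworkConfig is a plain dataclass, so fields can be
# overridden per game without touching the defaults above. The sizes below
# are made-up illustration values, not ones used anywhere in this project.
def _example_config_override() -> NetworkConfig:
    return NetworkConfig(input_size=128, hidden_size=64, action_size=32)
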
def sigmoid(x: np.ndarray) -> np.ndarray:
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))


def relu(x: np.ndarray) -> np.ndarray:
    return np.maximum(0, x)


def softmax(x: np.ndarray) -> np.ndarray:
    # Subtract the row-wise max for numerical stability, and normalize along
    # the last axis so single observations and batches are both handled
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / exp_x.sum(axis=-1, keepdims=True)


def tanh(x: np.ndarray) -> np.ndarray:
    return np.tanh(x)

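# Editor's sketch: a quick sanity check that softmax normalizes each row
# independently, which forward() relies on when given a batch.
def _check_softmax_batching() -> None:
    x = np.array([[1.0, 2.0, 3.0], [10.0, 10.0, 10.0]])
    p = softmax(x)
    assert np.allclose(p.sum(axis=-1), 1.0)  # each row sums to 1
    assert np.allclose(p[1], 1.0 / 3.0)  # equal logits -> uniform row
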
class SimpleNetwork:
    """
    Simple feedforward neural network for policy and value prediction.

    Architecture:
    - Input layer (observation)
    - Hidden layers with ReLU
    - Policy head (softmax over actions)
    - Value head (tanh for [-1, 1])
    """

    def __init__(self, config: Optional[NetworkConfig] = None):
        self.config = config or NetworkConfig()
        self._init_weights()

    def _init_weights(self) -> None:
        """Initialize weights using He initialization."""
        config = self.config

        # Shared layers
        self.hidden_weights = []
        self.hidden_biases = []

        in_size = config.input_size
        for _ in range(config.num_hidden_layers):
            std = np.sqrt(2.0 / in_size)
            w = np.random.randn(in_size, config.hidden_size) * std
            b = np.zeros(config.hidden_size)
            self.hidden_weights.append(w)
            self.hidden_biases.append(b)
            in_size = config.hidden_size

        # Policy head
        std = np.sqrt(2.0 / config.hidden_size)
        self.policy_weight = np.random.randn(config.hidden_size, config.action_size) * std
        self.policy_bias = np.zeros(config.action_size)

        # Value head
        self.value_weight = np.random.randn(config.hidden_size, 1) * std
        self.value_bias = np.zeros(1)

    def forward(self, observation: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Forward pass for a single observation or a batch.

        Args:
            observation: Input features, shape (input_size,) or (batch, input_size)

        Returns:
            (policy probabilities, value)
        """
        # Store activations for the backward pass
        self.activations = [observation]

        x = observation
        for w, b in zip(self.hidden_weights, self.hidden_biases):
            x = relu(x @ w + b)
            self.activations.append(x)

        # Policy head
        policy_logits = x @ self.policy_weight + self.policy_bias
        policy = softmax(policy_logits)

        # Value head: squeeze the trailing singleton axis so a single
        # observation yields a scalar and a batch yields shape (batch,)
        value = np.squeeze(tanh(x @ self.value_weight + self.value_bias), axis=-1)

        self.last_policy_logits = policy_logits
        self.last_value = value

        return policy, value

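    # Editor's sketch: the shapes below follow from the matrix dimensions in
    # _init_weights; forward() accepts either a single observation or a batch.
    @staticmethod
    def _check_forward_shapes() -> None:
        net = SimpleNetwork()
        single = np.random.randn(net.config.input_size)
        batch = np.random.randn(4, net.config.input_size)
        p1, v1 = net.forward(single)  # p1: (action_size,), v1: scalar
        p2, v2 = net.forward(batch)  # p2: (4, action_size), v2: (4,)
        assert p1.shape == (net.config.action_size,)
        assert p2.shape == (4, net.config.action_size)
        assert v2.shape == (4,)
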
    def predict(self, state) -> Tuple[np.ndarray, float]:
        """Get policy and value for a game state."""
        obs = state.get_observation()
        policy, value = self.forward(obs)

        # Mask illegal actions
        legal = state.get_legal_actions()
        masked_policy = policy * legal
        if masked_policy.sum() > 0:
            masked_policy /= masked_policy.sum()
        else:
            # Fall back to a uniform distribution over legal actions
            masked_policy = legal.astype(np.float32)
            masked_policy /= masked_policy.sum()

        return masked_policy, value

    def predict_batch(self, states) -> list:
        """Get policy and value for a batch of game states."""
        if not states:
            return []

        obs = np.array([s.get_observation() for s in states])
        policies, values = self.forward(obs)

        results = []
        for i, (policy, value) in enumerate(zip(policies, values)):
            legal = states[i].get_legal_actions()
            masked_policy = policy * legal
            if masked_policy.sum() > 0:
                masked_policy /= masked_policy.sum()
            else:
                # Fall back to a uniform distribution over legal actions
                masked_policy = legal.astype(np.float32)
                masked_policy /= masked_policy.sum()
            results.append((masked_policy, value))

        return results

    def train_step(
        self, observations: np.ndarray, target_policies: np.ndarray, target_values: np.ndarray
    ) -> Tuple[float, float, float]:
        """
        One vectorized training step.

        Args:
            observations: Batch of observations (batch_size, input_size)
            target_policies: Target policy distributions (batch_size, action_size)
            target_values: Target values (batch_size,)

        Returns:
            (total_loss, policy_loss, value_loss)
        """
        batch_size = len(observations)
        config = self.config

        # 1. Forward pass (batched)
        pred_policy, pred_value = self.forward(observations)
        # pred_policy: (B, action_size)
        # pred_value: (B,)

        # 2. Loss calculation
        # Policy loss: cross-entropy, averaged over the batch
        policy_loss = -np.mean(np.sum(target_policies * np.log(pred_policy + 1e-8), axis=1))

        # Value loss: MSE
        value_loss = np.mean((pred_value - target_values) ** 2)

        total_loss = policy_loss + value_loss

        # 3. Backward pass (gradients)
        # For softmax followed by cross-entropy averaged over the batch, the
        # gradient w.r.t. the logits simplifies to dL/dlogits = (pred - target) / B
        d_policy_logits = (pred_policy - target_policies) / batch_size

        # d_value = 2 * (pred - target) / B, then through tanh:
        # tanh'(z) = 1 - tanh(z)^2 = 1 - pred_value^2
        d_value_out = 2 * (pred_value - target_values) / batch_size
        d_value_pre_tanh = d_value_out * (1 - pred_value**2)

        # Gradients for heads
        # hidden_out: (B, hidden_size) (last activation)
        hidden_out = self.activations[-1]

        # d_Weights = Input.T @ Error
        # Policy: (H, B) @ (B, A) -> (H, A)
        grad_policy_w = hidden_out.T @ d_policy_logits
        grad_policy_b = np.sum(d_policy_logits, axis=0)

        # Value: (H, B) @ (B, 1) -> (H, 1)
        # d_value_pre_tanh needs shape (B, 1)
        d_value_pre_tanh = d_value_pre_tanh.reshape(-1, 1)
        grad_value_w = hidden_out.T @ d_value_pre_tanh
        grad_value_b = np.sum(d_value_pre_tanh, axis=0)

        # Backprop through hidden layers
        # d_hidden_last = d_policy @ W_p.T + d_value @ W_v.T
        # (B, A) @ (A, H) + (B, 1) @ (1, H) -> (B, H)
        d_hidden = d_policy_logits @ self.policy_weight.T + d_value_pre_tanh @ self.value_weight.T

        # Store grads to apply later
        grads_w = []
        grads_b = []

        # Iterate backwards through hidden layers
        for layer_idx in range(len(self.hidden_weights) - 1, -1, -1):
            # ReLU derivative: mask where activation > 0.
            # self.activations holds the input at [0], layer 1 output at [1], etc.,
            # so weights[layer_idx] produces activations[layer_idx + 1]
            mask = (self.activations[layer_idx + 1] > 0).astype(np.float32)
            d_hidden = d_hidden * mask

            prev_activation = self.activations[layer_idx]

            # Gradients for this layer
            # (In, B) @ (B, Out) -> (In, Out)
            g_w = prev_activation.T @ d_hidden
            g_b = np.sum(d_hidden, axis=0)

            grads_w.insert(0, g_w)
            grads_b.insert(0, g_b)

            if layer_idx > 0:
                # Propagate to the previous layer
                d_hidden = d_hidden @ self.hidden_weights[layer_idx].T

        # 4. Apply gradients (SGD + L2)
        for i in range(len(self.hidden_weights)):
            # L2: w = w - lr * (grad + l2 * w)
            self.hidden_weights[i] -= config.learning_rate * (grads_w[i] + config.l2_reg * self.hidden_weights[i])
            self.hidden_biases[i] -= config.learning_rate * grads_b[i]

        self.policy_weight -= config.learning_rate * (grad_policy_w + config.l2_reg * self.policy_weight)
        self.policy_bias -= config.learning_rate * grad_policy_b

        self.value_weight -= config.learning_rate * (grad_value_w + config.l2_reg * self.value_weight)
        self.value_bias -= config.learning_rate * grad_value_b

        return total_loss, policy_loss, value_loss

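    # Editor's sketch: spot-check one analytic gradient from train_step
    # against a central finite difference. The loss is recomputed directly
    # (forward + cross-entropy + MSE, mirroring train_step) so no weights are
    # updated; the small layer sizes here are arbitrary test values.
    @staticmethod
    def _finite_difference_check(eps: float = 1e-5) -> None:
        net = SimpleNetwork(NetworkConfig(input_size=8, hidden_size=4, num_hidden_layers=2, action_size=5))
        obs = np.random.randn(3, 8)
        target_p = np.full((3, 5), 0.2)  # rows sum to 1, as train_step assumes
        target_v = np.array([0.5, -0.5, 0.0])

        def loss() -> float:
            p, v = net.forward(obs)
            policy_loss = -np.mean(np.sum(target_p * np.log(p + 1e-8), axis=1))
            return policy_loss + np.mean((v - target_v) ** 2)

        # Analytic gradient of one policy-head weight, as in train_step
        # (the value loss does not depend on the policy head)
        p, _ = net.forward(obs)
        d_logits = (p - target_p) / len(obs)
        analytic = (net.activations[-1].T @ d_logits)[0, 0]

        # Central difference on the same weight entry
        net.policy_weight[0, 0] += eps
        up = loss()
        net.policy_weight[0, 0] -= 2 * eps
        down = loss()
        net.policy_weight[0, 0] += eps
        numeric = (up - down) / (2 * eps)
        assert abs(analytic - numeric) < 1e-5, (analytic, numeric)
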
    def save(self, filepath: str) -> None:
        """Save network weights to file."""
        # Object arrays handle the inhomogeneous layer shapes; loading them
        # back requires allow_pickle=True (see load below)
        np.savez(
            filepath,
            hidden_weights=np.array(self.hidden_weights, dtype=object),
            hidden_biases=np.array(self.hidden_biases, dtype=object),
            policy_weight=self.policy_weight,
            policy_bias=self.policy_bias,
            value_weight=self.value_weight,
            value_bias=self.value_bias,
        )

    def load(self, filepath: str) -> None:
        """Load network weights from file."""
        data = np.load(filepath, allow_pickle=True)
        # Convert object arrays back to lists of arrays
        self.hidden_weights = list(data["hidden_weights"])
        self.hidden_biases = list(data["hidden_biases"])
        self.policy_weight = data["policy_weight"]
        self.policy_bias = data["policy_bias"]
        self.value_weight = data["value_weight"]
        self.value_bias = data["value_bias"]

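# Editor's sketch: weights saved with save() should survive a round trip.
# Note that np.savez appends ".npz" when the path has no extension, so load()
# must use the extended name; the temp path here is purely illustrative.
def _check_save_load_roundtrip() -> None:
    import os
    import tempfile

    net = SimpleNetwork()
    obs = np.random.randn(net.config.input_size)
    policy_before, value_before = net.forward(obs)

    path = os.path.join(tempfile.mkdtemp(), "weights")
    net.save(path)

    restored = SimpleNetwork(net.config)
    restored.load(path + ".npz")
    policy_after, value_after = restored.forward(obs)
    assert np.allclose(policy_before, policy_after)
    assert np.allclose(value_before, value_after)
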
class NeuralMCTS:
    """MCTS that uses a neural network for policy and value, with parallel search."""

    def __init__(
        self, network: SimpleNetwork, num_simulations: int = 100, batch_size: int = 8, virtual_loss: float = 3.0
    ):
        self.network = network
        self.num_simulations = num_simulations
        self.batch_size = batch_size
        self.c_puct = 1.4
        self.virtual_loss = virtual_loss
        self.root = None

    def get_policy_value(self, state) -> Tuple[np.ndarray, float]:
        """Get policy and value from the neural network."""
        return self.network.predict(state)

    def search(self, state) -> np.ndarray:
        """Run MCTS with neural network guidance (parallelized via batching)."""
        from ai.mcts import MCTSNode

        # Initial root expansion (always blocking)
        policy, _ = self.get_policy_value(state)
        self.root = MCTSNode()
        self.root.expand(state, policy)

        # If num_simulations is not divisible by batch_size, round up; running
        # a few extra simulations is an acceptable approximation
        num_batches = (self.num_simulations + self.batch_size - 1) // self.batch_size

        for _ in range(num_batches):
            self._simulate_batch(state, self.batch_size)

        # Return the visit-count distribution as a fixed-size policy target.
        # MCTSNode children are keyed by action index, and the length of the
        # legal-action mask gives the action-space size
        action_size = len(state.get_legal_actions())
        visits = np.zeros(action_size, dtype=np.float32)

        for action, child in self.root.children.items():
            visits[action] = child.visit_count

        if visits.sum() > 0:
            visits /= visits.sum()

        return visits

    def _simulate_batch(self, root_state, batch_size) -> None:
        """Run a batch of MCTS simulations, parallelized via virtual loss."""
        paths = []
        leaf_nodes = []
        request_states = []

        # 1. Selection phase for the K parallel simulations
        for _ in range(batch_size):
            node = self.root
            state = root_state.copy()
            path = [node]

            # Selection
            while node.is_expanded() and not state.is_terminal():
                action, child = node.select_child(self.c_puct)

                # Apply virtual loss immediately so subsequent selections in
                # this batch diverge to different branches
                child.virtual_loss += self.virtual_loss

                state = state.step(action)
                node = child
                path.append(node)

            paths.append((path, state))
            leaf_nodes.append(node)

            if not state.is_terminal():
                request_states.append(state)

        # 2. Evaluation phase (batched)
        responses = []
        if request_states:
            if hasattr(self.network, "predict_batch"):
                responses = self.network.predict_batch(request_states)
            else:
                responses = [self.network.predict(s) for s in request_states]

        # 3. Expansion & backpropagation phase
        resp_idx = 0
        for i in range(batch_size):
            path, state = paths[i]
            leaf = leaf_nodes[i]

            value = 0.0

            if state.is_terminal():
                value = state.get_reward(root_state.current_player)
            else:
                # Retrieve the prediction and expand the leaf
                policy, v = responses[resp_idx]
                resp_idx += 1
                value = v
                leaf.expand(state, policy)

            # Backpropagate
            for node in reversed(path):
                node.visit_count += 1
                node.value_sum += value

                # Virtual loss was added only to the children along the path
                # (path[1:]), never to the root, so skip the root here
                if node != self.root:
                    node.virtual_loss -= self.virtual_loss

                value = -value

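# For reference, an editor's sketch of the PUCT rule that NeuralMCTS assumes
# MCTSNode.select_child(c_puct) applies; the real implementation lives in
# ai.mcts. `child.prior` is an assumed attribute; visit_count, value_sum, and
# virtual_loss mirror the fields this file touches.
def _puct_score_sketch(parent_visit_count: int, child, c_puct: float) -> float:
    q = 0.0
    if child.visit_count > 0:
        # Virtual loss temporarily depresses Q so parallel selections in the
        # same batch spread across different branches
        q = (child.value_sum - child.virtual_loss) / child.visit_count
    u = c_puct * child.prior * np.sqrt(parent_visit_count) / (1 + child.visit_count)
    return q + u
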
def train_network(network: SimpleNetwork, training_data: list, epochs: int = 10, batch_size: int = 32) -> None:
    """
    Train the network on self-play data.

    Args:
        network: Network to train
        training_data: List of (states, policies, winner, reward_p0, reward_p1) tuples
        epochs: Number of training epochs
        batch_size: Batch size for training
    """
    print(f"Training on {len(training_data)} games...")

    # Flatten data with rewards
    all_states = []
    all_policies = []
    all_values = []

    for states, policies, winner, r0, r1 in training_data:
        for i, (s, p) in enumerate(zip(states, policies)):
            all_states.append(s)
            all_policies.append(p)

            # Value from the perspective of the player who made the move
            # (players alternate, so even indices belong to player 0)
            player_idx = i % 2

            # Use the actual calculated reward (with score shaping)
            if player_idx == 0:
                all_values.append(r0)
            else:
                all_values.append(r1)

    all_states = np.array(all_states)
    all_policies = np.array(all_policies)
    all_values = np.array(all_values)

    n_samples = len(all_states)

    for epoch in range(epochs):
        # Shuffle data
        indices = np.random.permutation(n_samples)
        total_loss = 0.0

        for i in range(0, n_samples, batch_size):
            batch_idx = indices[i : i + batch_size]
            loss, p_loss, v_loss = network.train_step(
                all_states[batch_idx], all_policies[batch_idx], all_values[batch_idx]
            )
            total_loss += loss

        num_batches = (n_samples + batch_size - 1) // batch_size
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / num_batches:.4f}")

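# Editor's sketch of the tuple format train_network expects. The observations
# and policies are random stand-ins; in the real pipeline, states are
# observation vectors and policies are MCTS visit distributions from self-play.
def _example_training_call() -> None:
    config = NetworkConfig()
    network = SimpleNetwork(config)

    states = [np.random.randn(config.input_size) for _ in range(4)]
    policies = [np.full(config.action_size, 1.0 / config.action_size) for _ in range(4)]
    winner = 0
    reward_p0, reward_p1 = 1.0, -1.0  # shaped rewards for each player

    training_data = [(states, policies, winner, reward_p0, reward_p1)]
    train_network(network, training_data, epochs=1, batch_size=2)
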
if __name__ == "__main__":
    # Test network
    from engine.game.game_state import initialize_game

    print("Testing neural network...")
    config = NetworkConfig()
    network = SimpleNetwork(config)

    # Test forward pass
    state = initialize_game()
    policy, value = network.predict(state)

    print(f"Policy shape: {policy.shape}")
    print(f"Policy sum: {policy.sum():.4f}")
    print(f"Value: {value:.4f}")

    # Test training step
    obs = state.get_observation()
    target_p = np.zeros(config.action_size)
    target_p[0] = 0.8
    target_p[1] = 0.2
    target_v = 0.5

    loss, p_loss, v_loss = network.train_step(obs.reshape(1, -1), target_p.reshape(1, -1), np.array([target_v]))
    print(f"Training loss: {loss:.4f} (policy: {p_loss:.4f}, value: {v_loss:.4f})")