Gruhit Patel committed on
Commit 1fab54b · verified · 1 Parent(s): 5b133f3

init-backend

Files changed (12)
  1. agent.py +72 -0
  2. arena.py +83 -0
  3. buffer.py +50 -0
  4. config.py +47 -0
  5. game.py +131 -0
  6. main.py +63 -0
  7. main2.py +92 -0
  8. mcts.py +282 -0
  9. model.py +106 -0
  10. requirement.txt +7 -0
  11. trainer.py +85 -0
  12. view_board.py +60 -0
agent.py ADDED
@@ -0,0 +1,72 @@
+ from model import Model
+ from buffer import Buffer
+ from game import Connect4
+ from mcts import MCTS_NN
+
+ import numpy as np
+ from typing import Tuple, List
+
+ class Agent:
+     def __init__(self, row: int, col: int, n_action: int, obs_shape: Tuple[int, int, int],
+                  model: Model, iteration: int, temperature: float):
+
+         self.row = row
+         self.col = col
+         self.n_action = n_action
+         self.obs_shape = obs_shape
+         self.iteration = iteration
+         self.temperature = temperature
+
+         # Create a buffer instance
+         self.buffer = Buffer(n_action=self.n_action, obs_shape=self.obs_shape)
+
+         # Target model instance
+         self.target_model = model
+
+     # Reset the MCTS class instance
+     def reset(self, state: Connect4, reset_buffer: bool = False) -> None:
+         # Reset the state of the Monte Carlo tree search instance
+         self.mcts = MCTS_NN(state=state, model=self.target_model)
+
+     # Reset the buffer
+     def reset_buffer(self) -> None:
+         self.buffer.reset()
+
+     # Get the policy from the MCTS simulation
+     def perform_mcts(self) -> np.ndarray:
+         for _ in range(self.iteration):
+             self.mcts.selection(self.mcts.root, add_dirichlet=True)
+
+         policy = self.mcts.get_policy_pie(self.temperature)
+
+         return policy
+
+     # Get an action for the current state
+     def get_action(self) -> Tuple[int, np.ndarray]:
+         policy = self.perform_mcts()
+         action = np.random.choice(self.n_action, p=policy)
+         return action, policy
+
+     # Backfill the episode values and send the experience to the buffer object
+     def update_buffer(self, episodic_buffer: List) -> None:
+         # Get the last index of the episodic buffer
+         idx = len(episodic_buffer) - 1
+
+         # The last state always gets value 1, as it corresponds to the winning move
+         value = 1
+         while idx >= 0:
+             episodic_buffer[idx][1] = value
+             value *= -1  # For the parent the value is negated
+             idx -= 1     # Go to the previous experience tuple
+
+         for state, value, policy in episodic_buffer:
+             self.buffer.store_experience(
+                 state = state,
+                 value = value,
+                 policy = policy
+             )
+
+     # Update the root to one of its child nodes
+     # based on the action taken in `get_action()` above
+     def update(self, action: int) -> None:
+         self.mcts.update_root(action)
arena.py ADDED
@@ -0,0 +1,83 @@
+ from game import Connect4
+ from agent import Agent
+ from model import Model
+ from mcts import MCTS_NN
+
+ from typing import Union
+ from tqdm import tqdm
+ import numpy as np
+
+ def play_selfgames(agent: Agent, training_games: int):
+
+     for _ in tqdm(range(training_games)):
+         board = Connect4(row=agent.row, col=agent.col)
+         agent.reset(state = board)
+
+         # A buffer list to store the transitions of the current episode
+         episodic_buffer = []
+
+         while not board.is_win() and not board.is_draw():
+             # While getting the action the tree search is performed
+             # and the experience is stored as well
+             action, policy = agent.get_action()
+             episodic_buffer.append([
+                 board.get_state(),
+                 board.player_1,
+                 policy
+             ])
+
+             board, _ = board.drop_piece(action)
+
+             # Update the root node of the MCTS to one of its child nodes
+             agent.update(action)
+
+         # When the episode is completed, update the buffer
+         agent.update_buffer(episodic_buffer)
+
+
+ def get_move_for_bot(state: Connect4, model: Model, tree_iters: int, random_move: bool = False) -> int:
+     mcts = MCTS_NN(state = state, model = model)
+
+     for _ in range(tree_iters):
+         mcts.selection(mcts.root, random_move)
+
+     policy = mcts.get_policy_pie()
+     act = np.argmax(policy)
+
+     return act
+
+ def play_game_against_bot(bot1: Model, bot2: Model, tree_iters: int) -> Union[None, int]:
+     board = Connect4()
+     player_1 = True
+
+     # In this function bot1 is always the datagen model, making the 1st move,
+     # and bot2 is the main_model, making the 2nd move.
+     # We randomly swap which bot makes the first move 50% of the time.
+     flip = False
+     if np.random.uniform() < 0.5:
+         flip = True
+         (bot1, bot2) = (bot2, bot1)
+         print("Bot has been flipped")
+
+     while not board.is_win() and not board.is_draw():
+         if player_1:
+             act = get_move_for_bot(board, model=bot1, tree_iters=tree_iters)
+             player_1 = False
+         else:
+             act = get_move_for_bot(board, model=bot2, tree_iters=tree_iters)
+             player_1 = True
+
+         board, win = board.drop_piece(act)
+         print(board)
+
+     # Return values:
+     #  0 - draw
+     #  1 - datagen won
+     # -1 - main_model won
+     # Hence when the bots are flipped we have to adjust the values accordingly.
+     if flip:
+         # If we have flipped, the main_model is player 1; if it has won
+         # we want to return -1 for it, and vice-versa
+         return 0 if win is None else win * -1
+     else:
+         return 0 if win is None else win
buffer.py ADDED
@@ -0,0 +1,50 @@
+ from typing import Iterator, Tuple
+ import numpy as np
+
+ class Buffer:
+     def __init__(self, n_action: int, obs_shape: Tuple[int, int, int]):
+         self.n_action = n_action
+         self.obs_shape = obs_shape
+         self.mem_size = 0
+
+         # Use plain lists for storage so the buffer can grow dynamically
+         self.state = []
+         self.value = []
+         self.policy = []
+
+     def store_experience(self, state: np.ndarray, value: float, policy: np.ndarray):
+         self.state.append(state)
+         self.value.append(value)
+         self.policy.append(policy)
+
+         self.mem_size += 1
+
+     def sample(self, batch_size: int) -> Iterator[Tuple[
+         np.ndarray,
+         np.ndarray,
+         np.ndarray
+     ]]:
+         # Shuffle the memory with a single permutation so that the
+         # state, value and policy entries stay aligned with each other
+         perm = np.random.permutation(self.mem_size)
+         self.state = [self.state[i] for i in perm]
+         self.value = [self.value[i] for i in perm]
+         self.policy = [self.policy[i] for i in perm]
+
+         for start_idx in range(0, self.mem_size, batch_size):
+             end_idx = min(start_idx + batch_size, self.mem_size)
+             s = self.state[start_idx:end_idx]
+             v = self.value[start_idx:end_idx]
+             p = self.policy[start_idx:end_idx]
+
+             yield (np.array(s), np.array(v), np.array(p))
+
+     # Reset the buffer to store new experience
+     def reset(self) -> None:
+         self.state = []
+         self.value = []
+         self.policy = []
+
+         self.mem_size = 0
+
+     # Return the length of the buffer
+     def __len__(self):
+         return self.mem_size
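For context, a minimal usage sketch (not part of the commit) of how this Buffer is expected to be used: store a few (state, value, policy) tuples and iterate over mini-batches via sample(). The shapes below assume the 4x6x7 encoded state and 7 actions from config.py; the dummy values are placeholders.

import numpy as np
from buffer import Buffer

buf = Buffer(n_action=7, obs_shape=(4, 6, 7))
for _ in range(10):
    buf.store_experience(
        state=np.zeros((4, 6, 7), dtype=np.int32),   # encoded board planes
        value=1.0,                                   # game outcome from this player's view
        policy=np.full(7, 1 / 7)                     # visit-count policy over the 7 columns
    )

for states, values, policies in buf.sample(batch_size=4):
    print(states.shape, values.shape, policies.shape)  # up to (4, 4, 6, 7), (4,), (4, 7)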
config.py ADDED
@@ -0,0 +1,47 @@
+ from typing import Tuple
+ import torch
+
+ class Config:
+     # Board
+     row: int = 6
+     col: int = 7
+
+     # Neural network
+     num_hidden: int = 64
+     num_res_block: int = 4
+     rate: float = 0.3
+     obs_shape: Tuple[int, int, int] = (4, row, col)
+     n_action: int = col
+     device: str = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+     checkpoint_path: str = "../Models/azv3.pt"
+
+     # Optimizer
+     base_lr: float = 0.01
+     weight_decay: float = 1e-4
+
+     # Monte Carlo tree search
+     temperature: float = 1.0
+     tree_iter: int = 100
+
+     # Training
+     selfplay_games: int = 50
+     epoch: int = 10
+     batch_size: int = 128
+
+     # Tournament
+     eval_games: int = 10
+
+     # How many Elo rating points are awarded per win
+     k: int = 10
+
+     # Model update threshold
+     threshold: float = 0.55
+
+     # How many times to run self-play games and train the model
+     total_iters: int = 40
+
+     # Parallel games
+     parallel_run: int = 4
+
+     DIRICHLET_ALPHA: float = 0.3  # Avg legal moves / 75% of total moves
+     EPSILON: float = 0.25
game.py ADDED
@@ -0,0 +1,131 @@
+ from copy import deepcopy
+ import numpy as np
+ from typing import Tuple, Union
+
+ class Connect4:
+     def __init__(self, board: 'Connect4' = None, row: int = 6, col: int = 7):
+         self.row = row
+         self.col = col
+         self.player_1 = 1
+         self.player_2 = -1
+
+         self.board = np.zeros((self.row, self.col))
+
+         self.winning_start = None
+         self.winning_end = None
+
+         if board is not None:
+             self.__dict__ = deepcopy(board.__dict__)
+
+     def drop_piece(self, action: int) -> Tuple['Connect4', Union[None, int]]:
+
+         board = Connect4(board=self)
+
+         # Find the lowest empty row in that column where a piece can be dropped
+         valid_row_idx = sum(board.board[:, action] == 0) - 1
+         board.board[valid_row_idx, action] = self.player_1
+         (board.player_1, board.player_2) = (self.player_2, self.player_1)
+
+         return board, board.is_win()
+
+     # Get the encoded state for the board
+     def get_state(self) -> np.ndarray:
+         # Create a layer that encodes whose turn it is
+         turn = np.ones_like(self.board) if self.player_1 == 1 else np.zeros_like(self.board)
+         enc_state = np.stack(
+             (self.board == -1, self.board == 0, self.board == 1, turn)
+         ).astype(np.int32)
+
+         return enc_state
+
+     # Check if the board is in a draw state
+     def is_draw(self):
+         return (self.board != 0).all()
+
+     def is_win(self) -> Union[None, int]:
+         # Initially no one is the winner
+         winner = None
+
+         # Check the columns
+         if self.col_win():
+             winner = self.player_2
+         # Check the rows
+         elif self.row_win():
+             winner = self.player_2
+         # Check the diagonals
+         elif self.diag_win():
+             winner = self.player_2
+         return winner
+
+     # Check for a column win
+     def col_win(self) -> bool:
+         # Iterate over each column
+         for c in range(self.col):
+             # For 4 consecutive rows
+             for r in range(self.row - 3):
+                 # If all 4 elements belong to the player who just moved, it's a win
+                 if sum(self.board[r:r+4, c] == self.player_2) == 4:
+                     self.winning_start = (c, r)
+                     self.winning_end = (c, r+3)
+                     return True
+
+         return False
+
+     # Check for a row win
+     def row_win(self) -> bool:
+         # Iterate over each row
+         for r in range(self.row):
+             # For 4 consecutive columns
+             for c in range(self.col - 3):
+                 # If all 4 elements belong to the player who just moved, it's a win
+                 if sum(self.board[r, c:c+4] == self.player_2) == 4:
+                     self.winning_start = (c, r)
+                     self.winning_end = (c+3, r)
+                     return True
+
+         return False
+
+     # Check for a diagonal win
+     def diag_win(self) -> bool:
+         # For every 4x4 window, if the main diagonal or the anti-diagonal holds
+         # four discs of the player who just moved, it is a win
+         for r in range(self.row - 3):
+             for c in range(self.col - 3):
+                 # Get a window of size 4x4
+                 window = self.board[r:r+4, c:c+4]
+
+                 # If all 4 elements of the main diagonal belong to the player who just moved, it's a win
+                 if sum(np.diag(window) == self.player_2) == 4:
+                     self.winning_start = (c+3, r)
+                     self.winning_end = (c, r+3)
+                     # print("WinningMain Diag: ", self.winning_start, " - ", self.winning_end)
+                     return True
+
+                 # If all 4 elements of the anti-diagonal belong to the player who just moved, it's a win
+                 if sum(np.diag(window[:, ::-1]) == self.player_2) == 4:
+                     self.winning_start = (c, r)
+                     self.winning_end = (c+3, r+3)
+                     # print("WinningMain Diag: ", self.winning_start, " - ", self.winning_end)
+                     return True
+
+         return False
+
+     # Get a boolean mask of the valid moves for the current player
+     def get_valid_moves(self) -> np.ndarray:
+         valid_cols = [False] * self.col
+         for c in range(self.col):
+             if self.board[0, c] == 0:
+                 valid_cols[c] = True
+
+         return np.array(valid_cols, dtype=bool)
+
+     def __str__(self) -> str:
+         print_str = ""
+         for r in range(self.row):
+             for c in range(self.col):
+                 print_str += f"{self.board[r, c]:>3.0f}"
+
+             print_str += "\n"
+
+         return print_str
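As a quick illustration (not part of the commit), a short sketch of the Connect4 API: drop_piece returns a fresh board copy together with the winner (or None), and the player fields swap on every move, so the object can be used immutably inside the tree search.

from game import Connect4

board = Connect4(row=6, col=7)
board, win = board.drop_piece(3)   # player 1 drops in column 3, a new board is returned
board, win = board.drop_piece(3)   # now player -1 moves in the same column
print(board)                       # text rendering via __str__
print(board.get_valid_moves())     # boolean mask over the 7 columns
print(win)                         # None until someone connects four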
main.py ADDED
@@ -0,0 +1,63 @@
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ from game import Connect4
+ from model import Model
+ from config import Config
+ from pydantic import BaseModel
+ from typing import List, Union
+ import numpy as np
+ from arena import get_move_for_bot
+ import torch
+
+ class Request(BaseModel):
+     board: List[List[int]]
+     currentPlayer: str
+     randomMoves: Union[None, bool]
+     mctsIterations: Union[None, int]
+
+ # Create an application instance
+ app = FastAPI()
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["GET", "POST"],
+     allow_headers=["*"]
+ )
+
+ # Create the model
+ model = Model(
+     n_action = Config.n_action,
+     num_hidden = Config.num_hidden,
+     num_resblock = Config.num_res_block,
+     rate = Config.rate,
+     row = Config.row,
+     col = Config.col,
+     device = Config.device
+ )
+ # map_location keeps checkpoint loading working on CPU-only machines
+ model.load_state_dict(torch.load(Config.checkpoint_path, map_location=Config.device))
+ model.eval()
+
+ @app.get("/")
+ def root():
+     return {"message": "This is a temporary response"}
+
+ @app.post("/get_move")
+ def get_move(req: Request):
+     global model
+     board_arr = np.array(req.board)
+     board = Connect4()
+     board.board = board_arr
+
+     if req.currentPlayer == "yellow":
+         (board.player_1, board.player_2) = (board.player_2, board.player_1)
+
+     # TODO: change the tree_iter to req.parameters
+     act = get_move_for_bot(
+         state = board,
+         model = model,
+         tree_iters = req.mctsIterations,
+         random_move = req.randomMoves
+     )
+
+     return {'move': int(act)}
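For reference, a hedged client-side sketch (not in the commit) of what a request to the /get_move endpoint could look like, assuming the app is served locally with uvicorn on port 8000. The field names match the Request model above; the requests library is an assumed client dependency and is not listed in requirement.txt.

import requests  # assumed client-side dependency

payload = {
    "board": [[0] * 7 for _ in range(6)],  # empty 6x7 board
    "currentPlayer": "red",                # "yellow" swaps the player encoding server-side
    "randomMoves": False,
    "mctsIterations": 100,
}
resp = requests.post("http://localhost:8000/get_move", json=payload)
print(resp.json())  # e.g. {"move": 3}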
main2.py ADDED
@@ -0,0 +1,92 @@
+ from model import Model
+ from config import Config
+ from arena import get_move_for_bot
+ from game import Connect4
+ import pygame
+ from view_board import draw_board, draw_winning_line
+ import sys
+ import torch
+
+ def play_game(model: Model):
+     board = Connect4(
+         row = Config.row,
+         col = Config.col
+     )
+
+     pygame.init()
+     screen = pygame.display.set_mode((Config.col*100, (Config.row+1)*100))
+
+     ai_turn = True
+     game_end = False
+     while True:
+         draw_board(screen, board.board)
+         draw_winning_line(screen, board.winning_start, board.winning_end)
+
+         # render(board.board)
+         if ai_turn and not game_end:
+             # print("Getting move from AI...")
+             act = get_move_for_bot(board, model, Config.tree_iter)
+             # print(f"AI moved in column {act}")
+             board, win = board.drop_piece(act)
+
+             if win is not None:
+                 print("AI has WON")
+                 print("Board \n")
+                 print(board)
+
+                 print("Winner is...", win)
+                 game_end = True
+
+             ai_turn = False
+             draw_board(screen, board.board)
+             pygame.display.update()
+
+         for event in pygame.event.get():
+             if event.type == pygame.QUIT:
+                 sys.exit()
+
+             if event.type == pygame.MOUSEBUTTONDOWN and not game_end:
+                 posx = event.pos[0]
+                 act = posx // 100
+                 board, win = board.drop_piece(act)
+                 ai_turn = True
+
+                 if win is not None:
+                     print("Human has Won")
+                     print("Board \n")
+                     print(board)
+                     game_end = True
+
+             if event.type == pygame.MOUSEMOTION and not game_end:
+                 pygame.draw.rect(screen, (0, 0, 0), (0, 0, 700, 100))
+                 posx = event.pos[0]
+
+                 # If the AI moves first, the human plays second (yellow)
+                 if board.player_1 == -1:
+                     pygame.draw.circle(screen, (230, 230, 20), (posx, int(100//2)), 50)
+                 else:
+                     pygame.draw.circle(screen, (52, 186, 235), (posx, int(100//2)), 50)
+
+                 pygame.display.update()
+
+ if __name__ == "__main__":
+     model = Model(
+         n_action = Config.n_action,
+         num_hidden = Config.num_hidden,
+         num_resblock = Config.num_res_block,
+         rate = Config.rate,
+         row = Config.row,
+         col = Config.col,
+         device = Config.device
+     )
+
+     # This is the LR = .01 model
+     # model_path = './Models/C4GruhitSPatel/FullBuffer5x5V1/TargetModel_500.pt'
+
+     # This is the LR = .001 model
+     model_path = "./Models/C4GruhitML/C4CyclicLRV3/TargetModel_500.pt"
+     model.load_state_dict(torch.load(model_path))
+     model.eval()
+
+     play_game(model)
+     # print("Model Loaded")
mcts.py ADDED
@@ -0,0 +1,282 @@
+ from model import Model
+ from typing import Union, Tuple
+ from game import Connect4
+ from config import Config
+
+ import torch
+ from torch import Tensor
+
+ import numpy as np
+
+ class Node:
+     def __init__(self, state: Union[Connect4, None], model: Model, name: str):
+
+         # Current state that the node represents
+         self.state = state
+
+         # Name of the node, used for tracing
+         self.name = name
+
+         # A model instance that the node uses to get its value and policy
+         self.model = model
+
+         # Visit count
+         self.N = 0
+
+         # Accumulated reward value
+         self.W = 0
+
+         # Value of the node
+         self.value = None
+
+         # Prior policy over the actions from this node
+         self.policy = None
+
+         # Winner of the current node.
+         # None by default, indicating no one has won
+         self.win = None
+
+         # Children of the current node
+         self.children = {}
+
+         # Valid and invalid actions that can be taken from this node
+         self.valid_actions = None
+         self.invalid_actions = None
+
+         # Set the valid and invalid actions
+         self.set_valid_actions()
+
+         # Initialize the branches to the children
+         self.initialize_edges()
+
+     # Set the valid actions that can be taken from the state
+     # that this node represents
+     def set_valid_actions(self) -> None:
+         if self.state is not None:
+             self.valid_actions = self.state.get_valid_moves()
+             self.invalid_actions = ~self.valid_actions
+
+     # Initialize the edges from this node to its potential children
+     def initialize_edges(self) -> None:
+         if self.state is not None:
+             self.children = {}
+             for act, valid_move in enumerate(self.valid_actions):
+                 if valid_move:
+                     # Set state as None for the children, as we do not have it yet
+                     self.children[act] = Node(
+                         state=None,
+                         model=self.model,
+                         name=self.name + '_' + str(act)
+                     )
+
+     def preprocess_state(self, x: np.ndarray) -> Tensor:
+         x = torch.tensor(x, dtype=torch.float32, device=Config.device)
+         x = x.unsqueeze(0)
+         return x
+
+     # Run the network forward pass for the current node
+     def forward(self) -> None:
+         with torch.no_grad():
+             value, policy = self.model(self.preprocess_state(self.state.get_state()))
+
+         value = value[0, 0]
+         policy = policy[0]
+
+         # Mask the invalid actions
+         policy[self.invalid_actions] = 0.
+
+         # Prevent all probabilities from turning to 0
+         if policy.sum() == 0:
+             policy[self.valid_actions] = 1.
+
+         policy = policy.softmax(dim=-1)
+
+         self.value = value.detach().cpu().numpy()
+         self.policy = policy.detach().cpu().numpy()
+
+
+     # Get the policy for the current node
+     def get_policy(self) -> np.ndarray:
+         if self.policy is None:
+             self.forward()
+
+         return self.policy
+
+     # Get the value associated with the node
+     def get_value(self) -> float:
+         if self.value is None:
+             self.forward()
+
+         return self.value
+
+ class MCTS_NN:
+     def __init__(self, state: Connect4, model: Model, log=None):
+         self.root = Node(state=state, model=model, name='root')
+
+         if log is not None:
+             self.log = log
+
+     # Run one simulation on the Monte Carlo tree
+     def selection(self, node: Node, add_dirichlet: bool = False, iter: int = 0) -> float:
+         # Get the best child of the current node
+         # self.log.write(f'\nSelecting Best child of {node.name}')
+         best_child, best_action = self.get_best_child(node, add_dirichlet, iter)
+         # self.log.write(f"Iteration {iter} - Best Action - {best_action} - Node: {node.name}")
+
+         # If the child is a leaf node (i.e. it is terminal or not yet expanded),
+         # expand that node
+         if best_child.state is None:
+             # self.log.write(f'\nExpanding node {best_child.name}')
+             val = self.expolore_and_expand(parent=node, child=best_child, action=best_action, iter=iter)
+
+         # If the node is already expanded, traverse it further
+         else:
+             # As per the paper, Dirichlet noise is only added when selecting the
+             # root node's children, not deeper in the tree
+             # self.log.write(f'\nSelecting node further on {best_child.name}')
+             val = self.selection(node=best_child, add_dirichlet=False, iter=iter)
+
+         node.N += 1
+         node.W += val
+
+         return -val
+
+     # Explore and expand the tree
+     def expolore_and_expand(self, parent: Node, child: Node, action: int, iter=0) -> float:
+         # self.log.write(f'\n<========== Explore or Expand Iteration {iter} ==========>')
+         # Check if the current state is a terminal state
+         if child.win is None:
+             # It is neither expanded nor terminal.
+             # Perform the action on the parent state to get the next state
+             next_state, win = parent.state.drop_piece(action)
+
+             # First check if someone won in this next state
+             if win is not None:
+                 val = -1 if win == parent.state.player_1 else 1
+                 child.win = win
+                 # self.log.write(f'\nPlayer Turn for child is {next_state.player_1} | [Winner Found]')
+                 # self.log.write(f'\nWinner in that state {win} - child.Value is {val}')
+                 # self.log.write(f'\nWinning Child in state {child.name}: state\n{next_state}\n')
+                 # self.log.write('='*100)
+                 # self.log.write('\n')
+
+             # Else check if the next state results in a draw
+             elif next_state.is_draw():
+                 # Value 0 if no one has won in the state
+                 val = 0
+
+                 # A win of 0 means no one won
+                 child.win = 0
+                 # self.log.write(f'\nPlayer Turn for child is {next_state.player_1}')
+                 # self.log.write(f'\nDraw Child in state {child.name}: state\n{next_state}\n')
+                 # self.log.write('='*100)
+                 # self.log.write('\n')
+
+             # If the next_state is neither winning nor a draw,
+             # expand it normally
+             else:
+                 # If no one is winning yet, get the value for the current
+                 # state from the child's model and set it
+                 child.state = next_state
+                 child.set_valid_actions()
+                 child.initialize_edges()
+                 val = child.get_value()
+                 # self.log.write(f'\nPlayer Turn for child is {next_state.player_1} | [No Winner]')
+                 # self.log.write(f'\nLeaf node expanded for "{child.name}" with val {val:.5f}\n')
+                 # self.log.write('='*100)
+                 # self.log.write('\n')
+
+         else:
+             # If the current child represents a draw state, give it value 0
+             if child.win == 0:
+                 # self.log.write(f'\nTerminal DRAW state reached for child {child.name}\n')
+                 # self.log.write('='*100)
+                 # self.log.write('\n')
+                 val = 0
+
+             # If the winner of the child node is the player who moved
+             # in the parent node, the value is -1, because it means
+             # the player to move in the child node has lost
+             elif child.win == parent.state.player_1:
+                 # self.log.write(f'\nTerminal Parent Winning state reached for child {child.name}\n')
+                 # self.log.write('='*100)
+                 # self.log.write('\n')
+                 val = -1
+
+             # If the winner of the child node is the same as the player
+             # of the child node, the value is +1
+             else:
+                 # self.log.write(f'\nTerminal child Winning state reached for child {child.name}\n')
+                 # self.log.write('='*100)
+                 # self.log.write('\n')
+                 val = 1
+
+         # Update the visit count and accumulated reward of the child node
+         child.N += 1
+         child.W += val
+
+         # Return the negation of val because the player in the parent node is
+         # the opponent of the player in the current node. Hence what is good
+         # for the current node's player is bad for the parent node's player.
+         return -val
+
+
+     # Calculate the PUCT score for one of a node's children
+     def get_puct_score(self, parent: Node, child: Node, prior: float) -> float:
+         # PUCT is the sum of the Q-value of the child and U(s, a)
+         q_value = 0
+         if child.N == 0:
+             q_value = 0
+         else:
+             # q_value = 1 - ((child.W/child.N) + 1)/2
+             q_value = -child.W / child.N
+
+         # c_puct is the exploration constant
+         c_puct = 1
+         u_sa = c_puct * prior * (np.sqrt(parent.N)) / (1 + child.N)
+         return q_value + u_sa
+
+     def get_dirichlet_noise(self, node: Node) -> np.ndarray:
+         num_valid_action = node.valid_actions.sum()
+         noise_vec = np.random.dirichlet([Config.DIRICHLET_ALPHA] * num_valid_action)
+         noise_arr = np.zeros((len(node.valid_actions),), dtype=noise_vec.dtype)
+         noise_arr[node.valid_actions] = noise_vec
+         return noise_arr
+
+     # Get the best child for any node
+     def get_best_child(self, node: Node, add_dirichlet: bool, iter=0) -> Tuple[Node, int]:
+         # The best child is simply the one with the highest PUCT value
+         policy = node.get_policy()
+
+         if add_dirichlet:
+             noise_arr = self.get_dirichlet_noise(node)
+             policy = (1 - Config.EPSILON) * policy + Config.EPSILON * noise_arr
+
+         best_puct = float('-inf')
+         best_child = None
+         best_action = None
+         # self.log.write(f'\n\n==================== Iteration {iter} ====================\n')
+         for action, child in node.children.items():
+             puct = self.get_puct_score(parent=node, child=child, prior=policy[action])
+             # self.log.write(f'{action} - PUCT: {puct:.4f} | N = {child.N} | W = {child.W:.4f} | P = {policy[action]:.4f}\n')
+             if puct > best_puct:
+                 best_puct = puct
+                 best_child = child
+                 best_action = action
+
+         return best_child, best_action
+
+     # Return the policy pi for the root node based on the visit counts
+     def get_policy_pie(self, temperature: float = 1):
+         actions = np.zeros((len(self.root.valid_actions),))
+
+         for action, child in self.root.children.items():
+             actions[action] = (child.N) ** (1 / temperature)
+
+         actions /= actions.sum()
+
+         return actions
+
+     # Traverse the tree by stepping to one of the root node's children
+     def update_root(self, action: int) -> None:
+         self.root = self.root.children[action]
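For orientation (not part of the commit), a minimal sketch of driving MCTS_NN directly: run a number of selection passes from the root, then read off the visit-count policy. This mirrors what arena.get_move_for_bot does; the untrained network here is for illustration only, whereas main.py loads a checkpoint.

import numpy as np
from game import Connect4
from mcts import MCTS_NN
from model import Model
from config import Config

# Untrained network just for illustration
model = Model(
    n_action=Config.n_action, num_hidden=Config.num_hidden,
    num_resblock=Config.num_res_block, rate=Config.rate,
    row=Config.row, col=Config.col, device=Config.device
)
model.eval()

board = Connect4(row=Config.row, col=Config.col)
mcts = MCTS_NN(state=board, model=model)

for _ in range(Config.tree_iter):               # run the tree-search simulations
    mcts.selection(mcts.root, add_dirichlet=False)

pi = mcts.get_policy_pie(temperature=Config.temperature)  # visit-count policy over columns
print(pi, int(np.argmax(pi)))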
model.py ADDED
@@ -0,0 +1,106 @@
+ import torch
+ from torch import Tensor
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class ResNetBlock(nn.Module):
+     def __init__(self, num_hidden: int):
+         super(ResNetBlock, self).__init__()
+
+         self.conv1 = nn.Conv2d(num_hidden, num_hidden, kernel_size=3, padding=1, bias=False)
+         self.bn1 = nn.BatchNorm2d(num_hidden)
+         self.conv2 = nn.Conv2d(num_hidden, num_hidden, kernel_size=3, padding=1, bias=False)
+         self.bn2 = nn.BatchNorm2d(num_hidden)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         residual = x
+         x = F.relu(self.bn1(self.conv1(x)))
+         x = self.bn2(self.conv2(x))
+         x += residual
+         x = F.relu(x)
+
+         return x
+
+ class DropoutBlock(nn.Module):
+     def __init__(self, in_units: int, out_units: int, rate: float):
+         super(DropoutBlock, self).__init__()
+         self.model = nn.Sequential(
+             nn.Linear(in_units, out_units),
+             nn.BatchNorm1d(out_units),
+             nn.ReLU(),
+             nn.Dropout(rate)
+         )
+
+     def forward(self, x: Tensor) -> Tensor:
+         return self.model(x)
+
+ class Model(nn.Module):
+     def __init__(self, n_action: int, num_hidden: int, num_resblock: int,
+                  rate: float, row: int, col: int, device: str):
+         super(Model, self).__init__()
+
+         # Input layer
+         self.initial_block = nn.Sequential(
+             nn.Conv2d(4, num_hidden, kernel_size=3, padding=1),
+             nn.BatchNorm2d(num_hidden),
+             nn.ReLU()
+         ).to(device)
+
+         self.res_blocks = nn.Sequential(
+             *[ResNetBlock(num_hidden) for _ in range(num_resblock)]
+         ).to(device)
+
+         self.dropout_model = nn.Sequential(
+             DropoutBlock(num_hidden*row*col, 200, rate),
+             DropoutBlock(200, 100, rate)
+         )
+
+         self.model = nn.Sequential(
+             self.initial_block,
+             self.res_blocks,
+             nn.Flatten(),
+             self.dropout_model
+         )
+
+         self.policy_head = nn.Sequential(
+             nn.Linear(100, 100),
+             nn.ReLU(),
+             nn.Linear(100, n_action),
+         ).to(device)
+
+         self.value_head = nn.Sequential(
+             nn.Linear(100, 100),
+             nn.ReLU(),
+             nn.Linear(100, 1),
+             nn.Tanh()
+         ).to(device)
+
+         self.to(device)
+
+         self.device = device
+
+         # Losses
+         # Mean squared error to minimize the difference between the estimated value and the target value
+         self.mse_loss = nn.MSELoss()
+
+         # Cross-entropy loss to compare the predicted policy with the target policy
+         self.ce_loss = nn.CrossEntropyLoss()
+
+     def forward(self, x):
+         x = self.model(x)
+         value = self.value_head(x)
+         policy = self.policy_head(x)
+
+         return value, policy
+
+     # Perform the loss calculation
+     def get_loss(self, pred_val, pred_policy, true_val, true_policy):
+         val_loss = self.mse_loss(pred_val, true_val)
+         policy_loss = self.ce_loss(pred_policy, true_policy)
+
+         final_loss = val_loss + policy_loss
+         return {
+             'total_loss': final_loss,
+             'value_loss': val_loss,
+             'policy_loss': policy_loss
+         }
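A small sanity-check sketch (not part of the commit) of the network's input and output shapes, using the defaults from config.py: a batch of 4x6x7 encoded states maps to one value in [-1, 1] and 7 policy logits per sample.

import torch
from model import Model
from config import Config

net = Model(
    n_action=Config.n_action, num_hidden=Config.num_hidden,
    num_resblock=Config.num_res_block, rate=Config.rate,
    row=Config.row, col=Config.col, device=Config.device
)
net.eval()

x = torch.zeros((2, 4, Config.row, Config.col), device=Config.device)  # dummy batch of 2 encoded states
with torch.no_grad():
    value, policy = net(x)
print(value.shape, policy.shape)  # torch.Size([2, 1]) torch.Size([2, 7])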
requirement.txt ADDED
@@ -0,0 +1,7 @@
+ torch==2.2.2
+ tqdm==4.66.2
+ pygame==2.5.2
+ fastapi==0.110.1
+ pydantic==2.6.4
+ uvicorn==0.29.0
+ numpy==1.26.4
trainer.py ADDED
@@ -0,0 +1,85 @@
+ import torch
+ from torch import optim
+ from torch.utils.tensorboard import SummaryWriter
+
+ from tqdm import tqdm
+ import numpy as np
+
+ from model import Model
+ from buffer import Buffer
+
+
+ class Trainer:
+     def __init__(self, model: Model, buffer: Buffer, base_lr: float = 0.001,
+                  weight_decay=1e-4, device: str = 'cpu'):
+         self.main_model = model
+         self.main_buffer = buffer
+         self.global_step = 0
+         self.device = device
+
+         # Optimizer
+         self.optimizer = optim.SGD(
+             self.main_model.parameters(),
+             lr = base_lr,
+             weight_decay = weight_decay,
+             momentum = 0.9
+         )
+
+         # self.scheduler = optim.lr_scheduler.CyclicLR(
+         #     self.optimizer,
+         #     base_lr = base_lr,
+         #     max_lr = 0.1
+         # )
+
+         # TensorBoard summary writer
+         self.writer = SummaryWriter()
+
+     def transfer_buffer(self, buffer) -> None:
+         for state, value, policy in zip(buffer.state, buffer.value, buffer.policy):
+             self.main_buffer.store_experience(
+                 state = state,
+                 value = value,
+                 policy = policy
+             )
+
+     def reset_buffer(self) -> None:
+         self.main_buffer.reset()
+
+     # Learn from one batch of the buffer
+     def learn(self, state: np.ndarray, value: np.ndarray, policy: np.ndarray) -> float:
+
+         state = torch.tensor(state, dtype=torch.float32, device=self.device)
+         value = torch.tensor(value, dtype=torch.float32, device=self.device).unsqueeze(-1)
+         policy = torch.tensor(policy, dtype=torch.float32, device=self.device)
+
+         pred_val, pred_policy = self.main_model(state)
+
+         self.optimizer.zero_grad()
+         # get_loss returns a dict of losses; backpropagate through the total loss
+         losses = self.main_model.get_loss(pred_val, pred_policy, value, policy)
+         loss = losses['total_loss']
+         loss.backward()
+         self.optimizer.step()
+
+         return loss.detach().cpu().numpy()
+
+     # Training loop for the model
+     def train_model(self, epochs: int, batch_size: int):
+
+         train_steps = np.ceil(len(self.main_buffer) / batch_size).astype(np.int32)
+
+         # Perform the training
+         for epoch in range(epochs):
+             for state, value, policy in tqdm(self.main_buffer.sample(batch_size), total=train_steps, desc=f'Epoch:{epoch+1}'):
+                 loss = self.learn(state, value, policy)
+                 self.writer.add_scalar("loss", loss, self.global_step)
+                 self.global_step += 1
+
+         self.writer.flush()
+
+     # Close the writer
+     def close_writer(self):
+         self.writer.close()
+
+     # Save the model
+     def save_model(self, step: int):
+         torch.save(self.main_model.state_dict(), f'TargetModel_{step}.pt')
+         torch.save(self.optimizer.state_dict(), f'Optimizer_{step}.pt')
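A brief sketch (not part of the commit) of how Trainer and Buffer are expected to fit together for one training round; the self-play data collection itself lives in arena.play_selfgames and is omitted here.

from buffer import Buffer
from config import Config
from model import Model
from trainer import Trainer

model = Model(
    n_action=Config.n_action, num_hidden=Config.num_hidden,
    num_resblock=Config.num_res_block, rate=Config.rate,
    row=Config.row, col=Config.col, device=Config.device
)
buffer = Buffer(n_action=Config.n_action, obs_shape=Config.obs_shape)

trainer = Trainer(model, buffer, base_lr=Config.base_lr,
                  weight_decay=Config.weight_decay, device=Config.device)

# ... fill `buffer` with self-play experience (see arena.play_selfgames), then:
trainer.train_model(epochs=Config.epoch, batch_size=Config.batch_size)
trainer.save_model(step=1)
trainer.close_writer()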
view_board.py ADDED
@@ -0,0 +1,60 @@
+ import pygame
+ import numpy as np
+ import sys
+ from typing import Tuple, Union
+ from config import Config
+
+ SQUARESIZE = 100
+
+ def draw_board(screen, board):
+     COLUMN_COUNT = Config.col
+     ROW_COUNT = Config.row
+     SQUARESIZE = 100
+     RADIUS = int(SQUARESIZE/2 - 5)
+
+     BLUE = (52, 186, 235)
+     GREY = (70, 71, 70)
+     WHITE = (255, 255, 255)
+     YELLOW = (230, 230, 20)
+
+     width = COLUMN_COUNT * SQUARESIZE
+     height = (ROW_COUNT+1) * SQUARESIZE
+
+     size = (width, height)
+     board = np.flip(board, 0)
+     for c in range(COLUMN_COUNT):
+         for r in range(ROW_COUNT):
+             pygame.draw.rect(screen, GREY, (c*SQUARESIZE, r*SQUARESIZE+SQUARESIZE, SQUARESIZE, SQUARESIZE))
+             pygame.draw.circle(screen, WHITE, (int(c*SQUARESIZE+SQUARESIZE/2), int(r*SQUARESIZE+SQUARESIZE+SQUARESIZE/2)), RADIUS)
+
+     for c in range(COLUMN_COUNT):
+         for r in range(ROW_COUNT):
+             if board[r][c] == 1:
+                 pygame.draw.circle(screen, BLUE, (int(c*SQUARESIZE+SQUARESIZE/2), height-int(r*SQUARESIZE+SQUARESIZE/2)), RADIUS)
+             elif board[r][c] == -1:
+                 pygame.draw.circle(screen, YELLOW, (int(c*SQUARESIZE+SQUARESIZE/2), height-int(r*SQUARESIZE+SQUARESIZE/2)), RADIUS)
+
+ def draw_winning_line(screen, start_pos: Union[None, Tuple[int, int]], end_pos: Union[None, Tuple[int, int]]):
+     if start_pos is None or end_pos is None:
+         return
+
+     offset = SQUARESIZE // 2
+     start_line = (SQUARESIZE*start_pos[0]+1+offset, SQUARESIZE*(start_pos[1]+1)+offset)
+     end_line = (SQUARESIZE*end_pos[0]+offset, SQUARESIZE*(end_pos[1]+1)+offset)
+
+     # print("Start pos: ", start_pos)
+     # print("End pos: ", end_pos)
+     # print("Start line: ", start_line)
+     # print("End Line: ", end_line)
+     pygame.draw.line(screen, (255, 0, 0), start_line, end_line, 10)
+
+ def render(board):
+     pygame.init()
+     screen = pygame.display.set_mode((700, 700))
+
+     while True:
+         for event in pygame.event.get():
+             if event.type == pygame.QUIT:
+                 sys.exit()
+         draw_board(screen, board)
+         pygame.display.update()