from game import Connect4 from agent import Agent from model import Model from mcts import MCTS_NN from typing import Union import tqdm import numpy as np def play_selfgames(agent: Agent, training_games: int): for _ in tqdm(range(training_games)): board = Connect4(row=agent.row, col=agent.col) agent.reset(state = board) # a buffer list to store the transition of current episode episodic_buffer = [] while not board.is_win() and not board.is_draw(): # While getting the action the search is performed # also the experience is stored in it action, policy = agent.get_action() episodic_buffer.append([ board.get_state(), board.player_1, policy ]) board, _ = board.drop_piece(action) # Update the root node of MCTS to one of its child node agent.update(action) # When the episode is compelted update the buffer agent.update_buffer(episodic_buffer) def get_move_for_bot(state: Connect4, model: Model, tree_iters: int, random_move: bool = False) -> int: mcts = MCTS_NN(state = state, model = model) for _ in range(tree_iters): mcts.selection(mcts.root, random_move) policy = mcts.get_policy_pie() act = np.argmax(policy) return act def play_game_against_bot(bot1: Model, bot2: Model, tree_iters:int) -> Union[None, int]: board = Connect4() player_1 = True # In function bot1 will be always datagen model to make 1st move # bot2 will be main_model to make 2nd move # We randomly allow them to make first move based for 50% of time flip = False if np.random.uniform() < 0.5: flip = True (bot1, bot2) = (bot2, bot1) print("Bot has been flipped") while not board.is_win() and not board.is_draw(): if player_1: act = get_move_for_bot(board, model=bot1, tree_iters=tree_iters) player_1 = False else: act = get_move_for_bot(board, model=bot2, tree_iters=tree_iters) player_1 = True board, win = board.drop_piece(act) print(board) # Here returning # 0 - draw # 1 - datagen won # -1 - main_model won # Hence when flipped we have to handle the values accordingly if flip: # Thus if we have flipped then main_model who is player 1 if its has won # then we want to return -1 for it and vice-versa return 0 if win == None else win*-1 else: return 0 if win == None else win