# AlphaQuoridor / self_play.py
# doraking's picture
# Upload 10 files
# 2437f34 verified
# ====================
# Self-Play Part
# ====================
# Importing packages
from game import State
from pv_mcts import pv_mcts_scores
from dual_network import DN_OUTPUT_SIZE
from datetime import datetime
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from pathlib import Path
import numpy as np
import pickle
import os
from copy import deepcopy
# Self-play parameters
SP_GAME_COUNT = 50 # Number of self-play games per run (25000 in the original version)
SP_TEMPERATURE = 1.0 # Temperature parameter for the Boltzmann distribution (passed to pv_mcts_scores)
# Outcome of a finished game, seen from the first player
def first_player_value(ended_state):
    """Return the game result from the first player's point of view.

    Args:
        ended_state: A state object exposing ``is_lose()`` and
            ``is_first_player()``.

    Returns:
        1 if the first player won, -1 if the first player lost, and 0 when
        ``is_lose()`` is false (treated as a draw).
    """
    # No loser reported: count the game as a draw.
    if not ended_state.is_lose():
        return 0
    # A loss reported while is_first_player() is true is the first player's
    # loss; otherwise the loss is the second player's, so the first player won.
    if ended_state.is_first_player():
        return -1
    return 1
# Persisting the collected training data
def write_data(history):
    """Pickle ``history`` into a timestamped file under ``./data/``.

    The file is named ``YYYYMMDDhhmmss.history`` after the local time at
    which the function is called. The ``./data/`` folder is created when it
    does not exist yet.

    Args:
        history: The object to serialize (the accumulated training samples).
    """
    stamp = datetime.now()
    os.makedirs('./data/', exist_ok=True)  # Create the folder if it is missing
    stamp_fields = (
        stamp.year, stamp.month, stamp.day,
        stamp.hour, stamp.minute, stamp.second,
    )
    target = './data/{:04}{:02}{:02}{:02}{:02}{:02}.history'.format(*stamp_fields)
    with open(target, mode='wb') as out_file:
        pickle.dump(history, out_file)
# Playing a single self-play game
def play(model):
    """Play one game against itself and return the training samples.

    Each turn, ``pv_mcts_scores`` produces a probability distribution over
    the legal actions of the current state. That distribution is recorded
    and also used to sample the action that is actually played.

    Args:
        model: The model handed to ``pv_mcts_scores``.

    Returns:
        A list with one ``[pieces_array, policies, value]`` entry per move.
        ``policies`` has length ``DN_OUTPUT_SIZE`` and is zero for every
        action that was not legal. ``value`` is the final result, starting
        with ``first_player_value`` of the ended state for the first entry
        and flipping sign on each following entry.
    """
    samples = []
    state = State()

    # Keep moving until the game reports it is finished
    while not state.is_done():
        # Search works on a copy so the live state is left untouched
        scores = pv_mcts_scores(model, deepcopy(state), SP_TEMPERATURE)

        # Spread the per-legal-action scores over the full policy vector
        policies = [0] * DN_OUTPUT_SIZE
        for legal_action, probability in zip(state.legal_actions(), scores):
            policies[legal_action] = probability
        # The value slot is filled in once the outcome is known
        samples.append([state.pieces_array(), policies, None])

        # Sample the move from the search distribution and advance
        chosen = np.random.choice(state.legal_actions(), p=scores)
        state = state.next(chosen)

    # Back-fill the outcome, alternating perspective between consecutive moves
    value = first_player_value(state)
    for sample in samples:
        sample[2] = value
        value = -value
    return samples
# Self-play driver
def self_play():
    """Generate training data by playing ``SP_GAME_COUNT`` games.

    Loads the model stored at ``./model/best.keras``, plays the games one
    after another with ``play``, prints a single-line progress counter, and
    saves all collected samples with ``write_data``. The Keras session is
    cleared and the model reference dropped before returning.
    """
    collected = []

    # Load the current best player's model
    model = load_model('./model/best.keras')

    # Play the games one by one, pooling their samples
    for game_number in range(1, SP_GAME_COUNT + 1):
        game_samples = play(model)
        collected.extend(game_samples)

        # '\r' rewrites the same console line on every iteration
        print('\rSelfPlay {}/{}'.format(game_number, SP_GAME_COUNT), end='')
    print('')

    # Save everything gathered in this run
    write_data(collected)

    # Release the Keras session state and drop the model reference
    K.clear_session()
    del model
# Run self-play only when this file is executed as a script (not on import)
if __name__ == '__main__':
    self_play()