# Scraped file-view header (status: Sleeping; file size: 4,596 bytes; id c6d22b8); line-number gutter removed.
import glob
import os
import sys
import numpy as np
from sb3_contrib import MaskablePPO
# Ensure project root is in path
sys.path.append(os.getcwd())
from ai.vector_env import VectorGameState
def _classify_finished(scores_p0, scores_p1, active, turn_limit_hit, win_score=3):
    """Classify the games that finished on the current step.

    A still-active game finishes when either score reaches ``win_score`` or
    when ``turn_limit_hit`` is true. A finished game is a win for the side
    that reached ``win_score`` with the strictly higher score; every other
    finished game is a draw.

    Args:
        scores_p0: Per-game scores of player 0 (1-D, same length as ``active``).
        scores_p1: Per-game scores of player 1 (1-D, same length as ``active``).
        active: Boolean array, True for games that had not finished before
            this step. Not modified.
        turn_limit_hit: True when the turn cap has been reached.
        win_score: Score at which a game ends.

    Returns:
        Four boolean arrays ``(finished, p0_won, p1_won, drawn)``; the last
        three partition ``finished``.
    """
    scores_p0 = np.asarray(scores_p0)
    scores_p1 = np.asarray(scores_p1)

    if turn_limit_hit:
        finished = active.copy()
    else:
        finished = active & ((scores_p0 >= win_score) | (scores_p1 >= win_score))

    p0_won = finished & (scores_p0 >= win_score) & (scores_p0 > scores_p1)
    p1_won = finished & (scores_p1 >= win_score) & (scores_p1 > scores_p0)
    drawn = finished & ~(p0_won | p1_won)
    return finished, p0_won, p1_won, drawn


def run_self_play(
    *,
    use_latest: bool = True,
    model_path: str = "checkpoints/vector/interrupted_model.zip",
    batch_size: int = 50,
    n_games: int = 100,
    max_steps: int = 150,
    turn_limit: int = 50,
    win_score: int = 3,
    results_path: str = "benchmarks/self_play_results.txt",
) -> None:
    """Play a PPO checkpoint against itself and write a win/draw summary.

    The same model picks actions for both players (from the player-0 and
    player-1 observations and action masks) in a batch of vectorized
    environments. Games are played in whole batches, so the number of games
    actually played is ``n_games`` rounded up to a multiple of ``batch_size``.

    Args:
        use_latest: When True, use the most recently modified ``*.zip`` in
            ``checkpoints/vector/`` if there is one; otherwise ``model_path``.
        model_path: Checkpoint to load when ``use_latest`` is False or no
            checkpoint is found.
        batch_size: Number of environments stepped in parallel.
        n_games: Minimum number of games to play.
        max_steps: Hard cap on simulation steps per batch.
        turn_limit: A game is ended once ``env.turn`` reaches this value.
        win_score: Score at which a game ends.
        results_path: Text file the summary is written to; its parent
            directory is created if missing.
    """
    print("--- PPO Self-Play Verification ---")

    # 1. Config
    if use_latest:
        checkpoints = glob.glob("checkpoints/vector/*.zip")
        if checkpoints:
            model_path = max(checkpoints, key=os.path.getmtime)
    print(f"Model: {model_path}")

    # 2. Load Model
    print("Loading PPO Model...")
    model = MaskablePPO.load(model_path, device="cpu")

    # 3. Init Raw VectorGameState (no adapter; stepped manually for self-play)
    print(f"Initializing {batch_size} environments...")
    env = VectorGameState(num_envs=batch_size)
    env.reset()

    total_wins_p0 = 0
    total_wins_p1 = 0
    total_draws = 0
    games_played = 0

    num_batches = (n_games + batch_size - 1) // batch_size  # ceiling division
    for b in range(num_batches):
        env.reset()
        active = np.ones(batch_size, dtype=bool)

        # Outcome flags for the games of this batch.
        batch_wins_p0 = np.zeros(batch_size, dtype=bool)
        batch_wins_p1 = np.zeros(batch_size, dtype=bool)
        batch_draws = np.zeros(batch_size, dtype=bool)

        step_count = 0
        while step_count < max_steps and active.any():
            # Player 0 turn (perspective 0)
            obs0 = env.get_observations(player_id=0)
            masks0 = env.get_action_masks(player_id=0)
            if step_count == 0:
                num_legal = np.sum(masks0[0])
                print(f" Step {step_count}: {num_legal} legal actions.")
            act0_raw, _ = model.predict(obs0, action_masks=masks0, deterministic=True)
            act0 = act0_raw.astype(np.int32)

            # Player 1 turn (perspective 1)
            obs1 = env.get_observations(player_id=1)
            masks1 = env.get_action_masks(player_id=1)
            act1_raw, _ = model.predict(obs1, action_masks=masks1, deterministic=True)
            act1 = act1_raw.astype(np.int32)

            # Step both players at once.
            env.step(act0, opp_actions=act1)

            # Detailed logging of environment 0 for the first five steps.
            if step_count < 5:
                stg0 = env.batch_stage[0]
                sc0 = env.batch_scores[0]
                # NOTE(review): column 8 of the global context is presumably
                # the phase (printed as "Ph") -- confirm against VectorGameState.
                ph0 = env.batch_global_ctx[0, 8]
                print(f" T{step_count + 1} | P0 Act: {act0[0]} | Stage: {stg0} | Score: {sc0} | Ph: {ph0}")

            # Check for dones (custom logic since there is no adapter).
            # Slicing assumes the first `batch_size` entries are the envs.
            finished, p0_won, p1_won, drawn = _classify_finished(
                np.asarray(env.batch_scores)[:batch_size],
                np.asarray(env.opp_scores)[:batch_size],
                active,
                bool(env.turn >= turn_limit),  # shortened for speed in debug
                win_score,
            )
            batch_wins_p0 |= p0_won
            batch_wins_p1 |= p1_won
            batch_draws |= drawn
            active &= ~finished

            step_count += 1

        # Games cut off by the step cap have no winner; count them as draws
        # so that wins + draws always equals the number of games played.
        batch_draws |= active

        total_wins_p0 += int(np.sum(batch_wins_p0))
        total_wins_p1 += int(np.sum(batch_wins_p1))
        total_draws += int(np.sum(batch_draws))
        games_played += batch_size
        print(f"Batch {b + 1} done. P0 Wins: {total_wins_p0}, P1 Wins: {total_wins_p1}, Draws: {total_draws}")

    # Create the output directory up front so a finished run is never lost to
    # a missing folder.
    results_dir = os.path.dirname(results_path)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    with open(results_path, "w", encoding="utf-8") as f:
        f.write("\n--- Final Self-Play Results ---\n")
        f.write(f"Total Games: {games_played}\n")
        f.write(f"Player 0 Wins: {total_wins_p0}\n")
        f.write(f"Player 1 Wins: {total_wins_p1}\n")
        f.write(f"Draws: {total_draws}\n")
        # Analyze: any decisive game shows the policy can reach a win.
        if total_wins_p0 + total_wins_p1 > 0:
            f.write("RESULT: Agent CAN win games when playing against itself.\n")
        else:
            f.write("RESULT: Agent fails to win even in self-play. Policy likely broken.\n")
    print(f"Results saved to {results_path}")
# Run the self-play verification only when executed as a script, not on import.
if __name__ == "__main__":
    run_self_play()