"""PPO self-play verification script.

Loads a MaskablePPO checkpoint, lets the same policy control both players of a
batched ``VectorGameState`` and writes a win/draw summary to a text file.
"""

import glob
import os
import sys

import numpy as np
from sb3_contrib import MaskablePPO

# Ensure project root is in path
sys.path.append(os.getcwd())

from ai.vector_env import VectorGameState  # noqa: E402  (needs the sys.path tweak above)

# Score at which a game is considered decided.
_WIN_SCORE = 3
# Turn cap used by the custom done check ("Shorten for speed in debug").
_TURN_LIMIT = 50
# Hard cap on env.step() calls per batch (slightly longer for self-play).
_MAX_STEPS = 150
# Number of initial steps for which env 0 is logged in detail.
_VERBOSE_STEPS = 5


def _resolve_model_path(default_path, use_latest):
    """Return the newest checkpoint in checkpoints/vector/, else *default_path*."""
    if use_latest:
        candidates = glob.glob("checkpoints/vector/*.zip")
        if candidates:
            return max(candidates, key=os.path.getmtime)
    return default_path


def _play_batch(model, env, batch_size):
    """Play one batch of self-play games and return (p0_wins, p1_wins, draws).

    The same ``model`` picks actions for both perspectives on every step.
    Environments that are still running when ``_MAX_STEPS`` is reached are
    counted as draws so that the three counts always sum to ``batch_size``.
    """
    env.reset()
    active = np.ones(batch_size, dtype=bool)

    # Per-environment outcome flags for this batch.
    wins_p0 = np.zeros(batch_size, dtype=bool)
    wins_p1 = np.zeros(batch_size, dtype=bool)
    draws = np.zeros(batch_size, dtype=bool)

    step_count = 0
    while np.any(active) and step_count < _MAX_STEPS:
        # Player 0 Turn (Perspective 0)
        obs0 = env.get_observations(player_id=0)
        masks0 = env.get_action_masks(player_id=0)
        if step_count == 0:
            num_legal = np.sum(masks0[0])
            print(f" Step {step_count}: {num_legal} legal actions.")
        act0_raw, _ = model.predict(obs0, action_masks=masks0, deterministic=True)
        act0 = act0_raw.astype(np.int32)

        # Player 1 Turn (Perspective 1)
        obs1 = env.get_observations(player_id=1)
        masks1 = env.get_action_masks(player_id=1)
        act1_raw, _ = model.predict(obs1, action_masks=masks1, deterministic=True)
        act1 = act1_raw.astype(np.int32)

        # Step both players at once.
        env.step(act0, opp_actions=act1)

        # Detailed logging for the first few turns (environment 0 only).
        if step_count < _VERBOSE_STEPS:
            stage = env.batch_stage[0]
            score = env.batch_scores[0]
            # NOTE(review): index 8 of the global context is logged as "Ph"
            # (presumably the phase) -- confirm against VectorGameState.
            phase = env.batch_global_ctx[0, 8]
            print(f" T{step_count + 1} | P0 Act: {act0[0]} | Stage: {stage} | Score: {score} | Ph: {phase}")

        # Check for dones (custom logic since no adapter is used). Finished
        # environments keep being stepped, but their result is frozen here.
        for i in np.flatnonzero(active):
            sc0 = env.batch_scores[i]
            sc1 = env.opp_scores[i]
            score_reached = sc0 >= _WIN_SCORE or sc1 >= _WIN_SCORE
            if not (score_reached or env.turn >= _TURN_LIMIT):
                continue

            active[i] = False
            if sc0 >= _WIN_SCORE and sc0 > sc1:
                wins_p0[i] = True
            elif sc1 >= _WIN_SCORE and sc1 > sc0:
                wins_p1[i] = True
            else:
                draws[i] = True

        step_count += 1

    # Games cut off by the step cap have no winner; record them as draws
    # instead of silently dropping them from the totals.
    draws |= active

    return int(wins_p0.sum()), int(wins_p1.sum()), int(draws.sum())


def _write_results(results_path, games_played, wins_p0, wins_p1, draws):
    """Write the final summary to *results_path*, creating its directory."""
    parent = os.path.dirname(results_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(results_path, "w", encoding="utf-8") as f:
        f.write("\n--- Final Self-Play Results ---\n")
        f.write(f"Total Games: {games_played}\n")
        f.write(f"Player 0 Wins: {wins_p0}\n")
        f.write(f"Player 1 Wins: {wins_p1}\n")
        f.write(f"Draws: {draws}\n")
        # Analyze
        if wins_p0 + wins_p1 > 0:
            f.write("RESULT: Agent CAN win games when playing against itself.\n")
        else:
            f.write("RESULT: Agent fails to win even in self-play. Policy likely broken.\n")


def run_self_play(
    *,
    use_latest=True,
    model_path="checkpoints/vector/interrupted_model.zip",
    batch_size=50,
    n_games=100,
    results_path="benchmarks/self_play_results.txt",
):
    """Run PPO self-play and save a win/draw summary.

    Args:
        use_latest: If true, prefer the most recently modified ``*.zip`` in
            ``checkpoints/vector/`` over ``model_path``.
        model_path: Checkpoint to load when ``use_latest`` is false or no
            checkpoint is found.
        batch_size: Number of environments stepped in parallel per batch.
        n_games: Minimum number of games to play; rounded up to a whole
            number of batches.
        results_path: Text file the summary is written to. Its parent
            directory is created if missing.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    print("--- PPO Self-Play Verification ---")

    # 1. Config
    model_path = _resolve_model_path(model_path, use_latest)
    print(f"Model: {model_path}")

    # 2. Load Model
    print("Loading PPO Model...")
    model = MaskablePPO.load(model_path, device="cpu")

    # 3. Init raw VectorGameState (no adapter, manually stepping for self-play).
    #    Each batch resets the environment itself, so no reset is needed here.
    print(f"Initializing {batch_size} environments...")
    env = VectorGameState(num_envs=batch_size)

    total_wins_p0 = 0
    total_wins_p1 = 0
    total_draws = 0
    games_played = 0

    num_batches = (n_games + batch_size - 1) // batch_size  # ceiling division
    for b in range(num_batches):
        wins_p0, wins_p1, draws = _play_batch(model, env, batch_size)
        total_wins_p0 += wins_p0
        total_wins_p1 += wins_p1
        total_draws += draws
        games_played += batch_size
        print(f"Batch {b + 1} done. P0 Wins: {total_wins_p0}, P1 Wins: {total_wins_p1}, Draws: {total_draws}")

    _write_results(results_path, games_played, total_wins_p0, total_wins_p1, total_draws)
    print(f"Results saved to {results_path}")


if __name__ == "__main__":
    run_self_play()