""" SPIRAL: Interactive Reasoning Game Simulator Demonstrates key concepts from "Self-Play in Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning" This simplified demo shows how strategic reasoning emerges from self-play in zero-sum games like TicTacToe. """ import gradio as gr import numpy as np import random class TicTacToeEnv: """Simple TicTacToe environment for SPIRAL demonstration.""" def __init__(self): self.reset() def reset(self): """Reset the game to initial state.""" self.board = np.zeros((3, 3), dtype=np.int8) self.current_player = 1 # Player 1 starts (X) self.game_over = False self.winner = None self.move_count = 0 return self.board.copy() def step(self, action): """Execute one step in the environment.""" if self.game_over: return self.board.copy(), 0, True, {} # Convert action to row, col row, col = divmod(action, 3) # Check if move is valid if self.board[row, col] != 0: return self.board.copy(), -1, True, {"invalid_move": True} # Make the move self.board[row, col] = self.current_player self.move_count += 1 # Check for win winner = self._check_winner() if winner is not None: self.game_over = True self.winner = winner reward = 1 if winner == self.current_player else -1 return self.board.copy(), reward, True, {} elif self.move_count >= 9: # Draw self.game_over = True return self.board.copy(), 0, True, {} else: # Game continues self.current_player *= -1 # Switch player return self.board.copy(), 0, False, {} def _check_winner(self): """Check if there's a winner.""" # Check rows for row in range(3): if abs(self.board[row, :].sum()) == 3: return self.board[row, 0] # Check columns for col in range(3): if abs(self.board[:, col].sum()) == 3: return self.board[0, col] # Check diagonals if abs(self.board.diagonal().sum()) == 3: return self.board[0, 0] if abs(np.fliplr(self.board).diagonal().sum()) == 3: return self.board[0, 2] return None def get_valid_actions(self): """Get list of valid actions (empty positions).""" valid_actions = [] for i in range(9): row, col = divmod(i, 3) if self.board[row, col] == 0: valid_actions.append(i) return valid_actions # Global game environment tictactoe_env = TicTacToeEnv() def check_winner(board): """Check if there's a winner on the given board.""" # Check rows for row in range(3): if abs(board[row, :].sum()) == 3: return board[row, 0] # Check columns for col in range(3): if abs(board[:, col].sum()) == 3: return board[0, col] # Check diagonals if abs(board.diagonal().sum()) == 3: return board[0, 0] if abs(np.fliplr(board).diagonal().sum()) == 3: return board[0, 2] return None def get_valid_moves(board): """Get valid moves for the given board.""" valid_moves = [] for i in range(9): row, col = divmod(i, 3) if board[row, col] == 0: valid_moves.append(i) return valid_moves def minimax(board, player, depth=0): """Minimax algorithm - demonstrates strategic reasoning.""" # Base cases winner = check_winner(board) if winner == 1: # Human wins return -10 + depth, None elif winner == -1: # AI wins return 10 - depth, None elif len(get_valid_moves(board)) == 0: # Draw return 0, None best_move = None if player == -1: # AI is maximizing player best_score = -float('inf') for move in get_valid_moves(board): row, col = divmod(move, 3) board[row, col] = -1 score, _ = minimax(board.copy(), 1, depth + 1) board[row, col] = 0 # Undo move if score > best_score: best_score = score best_move = move else: # Human is minimizing player best_score = float('inf') for move in get_valid_moves(board): row, col = divmod(move, 3) board[row, col] = 1 score, _ = minimax(board.copy(), -1, depth + 1) board[row, col] = 0 # Undo move if score < best_score: best_score = score best_move = move return best_score, best_move def generate_reasoning(board_state, human_move, ai_move): """Generate reasoning explanation based on game state.""" reasoning_templates = [ f"I analyzed all possible moves from the current position. After you played position {human_move}, I considered {len(get_valid_moves(board_state))} possible responses. Using minimax tree search, I determined that position {ai_move} gives me the best strategic advantage.", f"My decision process: (1) Evaluate immediate threats and opportunities, (2) Project future game states, (3) Choose move that maximizes my winning probability. Position {ai_move} emerged as optimal after analyzing the full game tree.", f"Strategic analysis: Your move at {human_move} created a new board configuration. I used recursive tree search to evaluate all possible future sequences. Position {ai_move} either creates a winning opportunity or blocks your potential victories.", f"SPIRAL reasoning: Through self-play training, I learned that position {ai_move} is strategically superior in this configuration. This demonstrates how strategic reasoning emerges from multi-agent interaction in zero-sum games." ] return random.choice(reasoning_templates) def create_interface(): """Create the main Gradio interface.""" # Custom CSS to style the TicTacToe board css = """ .ttt-board { display: flex; flex-direction: column; align-items: center; max-width: 300px; margin: 0 auto; } .ttt-board > div { display: flex; flex-direction: row; justify-content: center; gap: 8px; margin: 4px 0; } .ttt-board button { width: 80px !important; height: 80px !important; min-width: 80px !important; min-height: 80px !important; max-width: 80px !important; max-height: 80px !important; font-size: 24px !important; font-weight: bold !important; border: 2px solid #374151 !important; border-radius: 8px !important; background: #1f2937 !important; color: white !important; display: flex !important; align-items: center !important; justify-content: center !important; } .ttt-board button:hover { background: #374151 !important; border-color: #6b7280 !important; } .ttt-board button:disabled { opacity: 0.8 !important; cursor: not-allowed !important; } .ttt-stats { text-align: center !important; margin: 20px 0 !important; font-size: 16px !important; } .ttt-stats p { margin: 0 !important; color: #9ca3af !important; } """ with gr.Blocks(title="SPIRAL: Self-Play Reasoning Demo", theme=gr.themes.Soft(), css=css) as demo: gr.Markdown("# 🎮 SPIRAL: Self-Play Reasoning Demo") gr.Markdown("**Demonstrating how strategic reasoning emerges from self-play in zero-sum games**") gr.Markdown("*Based on: \"Self-Play in Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning\"*") def update_board_buttons(): """Create a list of gr.Button updates from the current board state.""" updates = [] for i in range(9): row, col = divmod(i, 3) cell = tictactoe_env.board[row, col] val = "" interactive = True if cell == 1: val = '❌' interactive = False elif cell == -1: val = '⭕' interactive = False if tictactoe_env.game_over: interactive = False updates.append(gr.Button(value=val, interactive=interactive)) return updates ttt_stats = gr.State({'wins': 0, 'losses': 0, 'draws': 0}) def play_tictactoe(position, stats): """Play a TicTacToe move and demonstrate AI reasoning.""" if tictactoe_env.game_over: yield *update_board_buttons(), "Game is over! Click 'New Game' to start again.", "", stats return try: position = int(position) # Human move board_state, reward, done, info = tictactoe_env.step(position) if done: if info.get("invalid_move"): yield *update_board_buttons(), "Invalid move! Try again.", "", stats return winner = "You" if tictactoe_env.winner == 1 else "AI" if tictactoe_env.winner == -1 else "Draw" if winner == "You": stats['wins'] += 1 elif winner == "AI": stats['losses'] += 1 else: stats['draws'] += 1 yield *update_board_buttons(), f"Game Over! {winner} won!", "", stats return # Show AI thinking yield *update_board_buttons(), "AI is analyzing the game tree...", "🧠 Strategic reasoning in progress...", stats # AI move using minimax _, ai_action = minimax(tictactoe_env.board.copy(), -1) if ai_action is None: valid_actions = tictactoe_env.get_valid_actions() if not valid_actions: yield *update_board_buttons(), "Game is a draw!", "", stats return ai_action = random.choice(valid_actions) # Generate reasoning explanation reasoning = generate_reasoning(tictactoe_env.board.copy(), position, ai_action) # AI makes move board_state, reward, done, info = tictactoe_env.step(ai_action) if done: winner = "You" if tictactoe_env.winner == 1 else "AI" if tictactoe_env.winner == -1 else "Draw" if winner == "You": stats['wins'] += 1 elif winner == "AI": stats['losses'] += 1 else: stats['draws'] += 1 yield *update_board_buttons(), f"Game Over! {winner} won! AI played position {ai_action}.", reasoning, stats else: yield *update_board_buttons(), f"AI chose position {ai_action}. Your turn!", reasoning, stats except Exception as e: yield *update_board_buttons(), f"Error: {str(e)}", "", stats def reset_tictactoe(stats): """Reset TicTacToe game.""" tictactoe_env.reset() return *update_board_buttons(), "New game started! You are ❌ (X). Click a square to demonstrate strategic reasoning.", "The AI will explain its strategic decision-making process...", stats # Initialize the board tictactoe_env.reset() # Game interface with gr.Row(): gr.Markdown("### Strategic TicTacToe") gr.Markdown("") # spacer ttt_reset_btn = gr.Button("🔄 New Game", variant="secondary", size="sm") gr.Markdown("**You are ❌ (X)** - The AI uses minimax tree search to demonstrate strategic reasoning") # Game board with gr.Column(elem_classes=["ttt-board"]): board_buttons = [] for i in range(3): with gr.Row(elem_classes=["ttt-row"]): for j in range(3): pos = i * 3 + j button = gr.Button("", elem_id=f"ttt-cell-{pos}", size="lg", value="") board_buttons.append(button) # Stats display with gr.Row(): ttt_stats_display = gr.Markdown(value="**Wins: 0 | Losses: 0 | Draws: 0**", elem_classes=["ttt-stats"]) # Game status and AI reasoning ttt_message = gr.Textbox( label="🎯 Game Status", value="Click a square to start! Watch how the AI reasons strategically.", lines=2, interactive=False ) ttt_reasoning = gr.Textbox( label="🧠 AI Strategic Reasoning", value="The AI will explain its strategic decision-making process here, demonstrating how reasoning emerges from self-play training in zero-sum games.", lines=4, interactive=False ) # Event handlers def on_board_click(pos, stats): yield from play_tictactoe(pos, stats) for i in range(9): board_buttons[i].click( fn=on_board_click, inputs=[gr.State(i), ttt_stats], outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats] ) ttt_reset_btn.click( fn=reset_tictactoe, inputs=[ttt_stats], outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats] ) # Update stats display ttt_stats.change( fn=lambda s: f"**Wins: {s['wins']} | Losses: {s['losses']} | Draws: {s['draws']}**", inputs=ttt_stats, outputs=ttt_stats_display ) # Initialize board display on load demo.load( fn=lambda stats: (*update_board_buttons(), "Click a square to start! Watch how the AI reasons strategically.", "The AI will explain its strategic decision-making process here, demonstrating how reasoning emerges from self-play training in zero-sum games.", stats), inputs=[ttt_stats], outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats] ) # Key concepts section gr.Markdown("---") gr.Markdown("## 🧠 Key SPIRAL Concepts Demonstrated") with gr.Row(): with gr.Column(): gr.Markdown(""" **🎯 Strategic Reasoning** - AI uses minimax tree search - Evaluates all possible future moves - Chooses optimal strategic actions """) with gr.Column(): gr.Markdown(""" **🔄 Self-Play Learning** - Strategic patterns emerge from competition - Zero-sum games incentivize reasoning - Multi-agent interactions develop intelligence """) gr.Markdown(""" ### About SPIRAL This demo illustrates key findings from the SPIRAL research: - **Zero-sum games** like TicTacToe create competitive pressure that incentivizes strategic thinking - **Self-play training** allows AI agents to discover optimal strategies through repeated interaction - **Multi-turn reasoning** emerges naturally from the need to plan ahead in strategic environments - **Tree search algorithms** like minimax demonstrate how strategic reasoning can be formalized and executed The AI's explanations show how it evaluates different moves, considers future possibilities, and makes strategic decisions - core capabilities that transfer to general reasoning tasks. """) return demo if __name__ == "__main__": demo = create_interface() demo.launch()