File size: 16,561 Bytes
b3a9ec4
 
185e9d2
842d62b
 
 
b3a9ec4
185e9d2
c59d6c7
 
 
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c59d6c7
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4420646
842d62b
6be63cd
842d62b
 
 
 
 
 
ee800d8
 
c59d6c7
 
 
a3e1550
 
a530f7b
feb1933
 
 
a3e1550
feb1933
a3e1550
feb1933
 
 
 
 
 
 
 
 
a530f7b
 
feb1933
 
 
 
 
 
 
 
 
 
 
 
a530f7b
feb1933
 
 
a530f7b
feb1933
 
 
a3e1550
47b257f
 
 
 
 
 
 
 
 
a3e1550
 
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c7fb25
842d62b
 
5c7fb25
842d62b
 
 
 
 
 
 
 
 
 
b1670f3
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
b1670f3
 
842d62b
 
 
 
 
 
 
 
 
b1670f3
842d62b
b1670f3
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
4420646
842d62b
 
5c7fb25
842d62b
 
a530f7b
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c7fb25
842d62b
 
 
5c7fb25
842d62b
 
 
 
5c7fb25
b1670f3
85310d8
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85310d8
842d62b
 
 
 
 
 
 
c59d6c7
842d62b
 
c59d6c7
842d62b
 
 
 
 
 
 
 
 
6be63cd
842d62b
6be63cd
 
842d62b
b3a9ec4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
"""
SPIRAL: Interactive Reasoning Game Simulator

Demonstrates key concepts from "Self-Play in Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning"

This simplified demo shows how strategic reasoning emerges from self-play in zero-sum games like TicTacToe.
"""

import gradio as gr
import numpy as np
import random


class TicTacToeEnv:
    """Minimal TicTacToe environment used by the SPIRAL demo.

    Board cells hold 1 (X, the human), -1 (O, the AI) or 0 (empty).
    The class follows a gym-like ``reset``/``step`` API and tracks whose
    turn it is, whether the game has ended, and who won.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Start a fresh game and return a copy of the empty board."""
        self.board = np.zeros((3, 3), dtype=np.int8)
        self.current_player = 1  # X (human) always moves first
        self.game_over = False
        self.winner = None
        self.move_count = 0
        return self.board.copy()

    def step(self, action):
        """Apply *action* (flat index 0-8, row-major) for the current player.

        Returns ``(board_copy, reward, done, info)``.  Playing an occupied
        square terminates the episode with reward -1 and sets
        ``info["invalid_move"]`` so the caller can distinguish it from a
        real game-over.
        """
        if self.game_over:
            # Ignore moves after the game has ended.
            return self.board.copy(), 0, True, {}

        row, col = divmod(action, 3)

        if self.board[row, col] != 0:
            # Occupied square: penalise and signal termination via info,
            # without flipping game_over (matches the original contract).
            return self.board.copy(), -1, True, {"invalid_move": True}

        self.board[row, col] = self.current_player
        self.move_count += 1

        winner = self._check_winner()
        if winner is not None:
            self.game_over = True
            self.winner = winner
            # Only the player who just moved can have won, so from the
            # mover's perspective this is always +1.
            reward = 1 if winner == self.current_player else -1
            return self.board.copy(), reward, True, {}
        if self.move_count >= 9:
            # Board is full with no winner: draw.
            self.game_over = True
            return self.board.copy(), 0, True, {}

        # Game continues; hand the turn to the other player.
        self.current_player *= -1
        return self.board.copy(), 0, False, {}

    def _check_winner(self):
        """Return 1 or -1 if that player holds a full line, else None."""
        b = self.board
        # All eight winning lines: three rows, three columns, two diagonals.
        lines = [b[r, :] for r in range(3)]
        lines += [b[:, c] for c in range(3)]
        lines.append(b.diagonal())
        lines.append(np.fliplr(b).diagonal())
        for line in lines:
            # A sum of +/-3 means all three cells belong to one player.
            if abs(int(line.sum())) == 3:
                return line[0]
        return None

    def get_valid_actions(self):
        """Return the flat indices (0-8) of all empty cells, ascending."""
        flat = self.board.reshape(-1)
        return [i for i in range(9) if flat[i] == 0]


# Global game environment shared by every Gradio callback below.  This is a
# single-session demo: concurrent visitors would share this one board.
tictactoe_env = TicTacToeEnv()


def check_winner(board):
    """Return 1 or -1 if that player owns a complete line on *board*, else None.

    Module-level twin of ``TicTacToeEnv._check_winner`` so the minimax
    search can evaluate arbitrary board arrays.
    """
    # Pair each of the eight winning lines with the cell whose value is
    # reported as the winner, preserving the original check order.
    candidates = (
        [(board[r, :], board[r, 0]) for r in range(3)]
        + [(board[:, c], board[0, c]) for c in range(3)]
        + [(board.diagonal(), board[0, 0]),
           (np.fliplr(board).diagonal(), board[0, 2])]
    )
    for line, owner in candidates:
        # |sum| == 3 means the same player fills all three cells.
        if abs(int(line.sum())) == 3:
            return owner
    return None


def get_valid_moves(board):
    """Return the flat indices (0-8) of all empty cells on *board*, ascending."""
    flat = board.reshape(-1)
    return [idx for idx in range(9) if flat[idx] == 0]


def minimax(board, player, depth=0):
    """Exhaustive minimax search over the TicTacToe game tree.

    Args:
        board: 3x3 numpy int array (1 = human/X, -1 = AI/O, 0 = empty).
            Used as in-place scratch space during the search, but every
            trial move is undone, so the array is unchanged on return.
        player: whose turn it is (-1 = AI, maximizing; 1 = human, minimizing).
        depth: current recursion depth; shades terminal scores so the AI
            prefers the quickest win and the latest possible loss.

    Returns:
        (best_score, best_move) where best_move is a flat index 0-8, or
        None at a terminal position (win/loss/draw already decided).
    """
    # Base cases: score terminal positions from the AI's perspective.
    winner = check_winner(board)
    if winner == 1:  # Human wins
        return -10 + depth, None
    elif winner == -1:  # AI wins
        return 10 - depth, None
    elif len(get_valid_moves(board)) == 0:  # Draw
        return 0, None

    best_move = None
    if player == -1:  # AI is maximizing player
        best_score = -float('inf')
        for move in get_valid_moves(board):
            row, col = divmod(move, 3)
            board[row, col] = -1
            # Recurse on the same array: the undo below restores it, so the
            # per-node board.copy() the original made was pure overhead.
            score, _ = minimax(board, 1, depth + 1)
            board[row, col] = 0  # Undo move
            if score > best_score:
                best_score = score
                best_move = move
    else:  # Human is minimizing player
        best_score = float('inf')
        for move in get_valid_moves(board):
            row, col = divmod(move, 3)
            board[row, col] = 1
            score, _ = minimax(board, -1, depth + 1)
            board[row, col] = 0  # Undo move
            if score < best_score:
                best_score = score
                best_move = move

    return best_score, best_move


def generate_reasoning(board_state, human_move, ai_move):
    """Pick one of four canned natural-language explanations for *ai_move*.

    The texts are templated on the human's last move, the AI's chosen
    move, and the number of legal replies on *board_state*; one is
    selected uniformly at random.
    """
    n_responses = len(get_valid_moves(board_state))
    options = (
        f"I analyzed all possible moves from the current position. After you played position {human_move}, I considered {n_responses} possible responses. Using minimax tree search, I determined that position {ai_move} gives me the best strategic advantage.",

        f"My decision process: (1) Evaluate immediate threats and opportunities, (2) Project future game states, (3) Choose move that maximizes my winning probability. Position {ai_move} emerged as optimal after analyzing the full game tree.",

        f"Strategic analysis: Your move at {human_move} created a new board configuration. I used recursive tree search to evaluate all possible future sequences. Position {ai_move} either creates a winning opportunity or blocks your potential victories.",

        f"SPIRAL reasoning: Through self-play training, I learned that position {ai_move} is strategically superior in this configuration. This demonstrates how strategic reasoning emerges from multi-agent interaction in zero-sum games.",
    )
    return random.choice(options)


def create_interface():
    """Create the main Gradio interface.

    Builds the full demo page: a 3x3 grid of buttons backed by the global
    ``tictactoe_env``, a per-session win/loss/draw tally kept in
    ``gr.State``, and two read-only textboxes for the game status and the
    AI's reasoning text.  Returns the assembled ``gr.Blocks`` app; the
    caller is responsible for ``launch()``.
    """
    
    # Custom CSS to style the TicTacToe board
    css = """
        .ttt-board {
            display: flex;
            flex-direction: column;
            align-items: center;
            max-width: 300px;
            margin: 0 auto;
        }
        .ttt-board > div {
            display: flex;
            flex-direction: row;
            justify-content: center;
            gap: 8px;
            margin: 4px 0;
        }
        .ttt-board button {
            width: 80px !important;
            height: 80px !important;
            min-width: 80px !important;
            min-height: 80px !important;
            max-width: 80px !important;
            max-height: 80px !important;
            font-size: 24px !important;
            font-weight: bold !important;
            border: 2px solid #374151 !important;
            border-radius: 8px !important;
            background: #1f2937 !important;
            color: white !important;
            display: flex !important;
            align-items: center !important;
            justify-content: center !important;
        }
        .ttt-board button:hover {
            background: #374151 !important;
            border-color: #6b7280 !important;
        }
        .ttt-board button:disabled {
            opacity: 0.8 !important;
            cursor: not-allowed !important;
        }
        .ttt-stats {
            text-align: center !important;
            margin: 20px 0 !important;
            font-size: 16px !important;
        }
        .ttt-stats p {
            margin: 0 !important;
            color: #9ca3af !important;
        }
    """
    
    with gr.Blocks(title="SPIRAL: Self-Play Reasoning Demo", theme=gr.themes.Soft(), css=css) as demo:
        gr.Markdown("# ๐ŸŽฎ SPIRAL: Self-Play Reasoning Demo")
        gr.Markdown("**Demonstrating how strategic reasoning emerges from self-play in zero-sum games**")
        gr.Markdown("*Based on: \"Self-Play in Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning\"*")
        
        def update_board_buttons():
            """Create a list of gr.Button updates from the current board state.

            Cell value 1 renders as the X mark, -1 as the O mark, 0 as an
            empty, clickable button.  Occupied cells — and every cell once
            the game is over — are made non-interactive.
            """
            updates = []
            for i in range(9):
                row, col = divmod(i, 3)
                cell = tictactoe_env.board[row, col]
                val = ""
                interactive = True
                if cell == 1:
                    val = 'โŒ'
                    interactive = False
                elif cell == -1:
                    val = 'โญ•'
                    interactive = False
                
                if tictactoe_env.game_over:
                    interactive = False

                updates.append(gr.Button(value=val, interactive=interactive))
            return updates

        # Per-session win/loss/draw tally, threaded through every callback.
        ttt_stats = gr.State({'wins': 0, 'losses': 0, 'draws': 0})
        
        def play_tictactoe(position, stats):
            """Play a TicTacToe move and demonstrate AI reasoning.

            Generator callback: yields intermediate UI states so the
            "AI is analyzing..." message appears before the (possibly slow)
            minimax search completes.  Each yield emits 9 button updates,
            a status string, a reasoning string, and the stats dict.

            NOTE(review): *stats* is mutated in place and re-emitted as an
            output; presumably Gradio's State change event still fires on
            re-emission even though it is the same dict object — confirm.
            """
            if tictactoe_env.game_over:
                yield *update_board_buttons(), "Game is over! Click 'New Game' to start again.", "", stats
                return

            try:
                position = int(position)
                
                # Human move
                board_state, reward, done, info = tictactoe_env.step(position)
                
                if done:
                    # done=True covers both a real game end and an invalid
                    # move, so check the info flag first.
                    if info.get("invalid_move"):
                        yield *update_board_buttons(), "Invalid move! Try again.", "", stats
                        return
                    
                    winner = "You" if tictactoe_env.winner == 1 else "AI" if tictactoe_env.winner == -1 else "Draw"
                    if winner == "You": stats['wins'] += 1
                    elif winner == "AI": stats['losses'] += 1
                    else: stats['draws'] += 1
                    yield *update_board_buttons(), f"Game Over! {winner} won!", "", stats
                    return

                # Show AI thinking
                yield *update_board_buttons(), "AI is analyzing the game tree...", "๐Ÿง  Strategic reasoning in progress...", stats

                # AI move using minimax
                _, ai_action = minimax(tictactoe_env.board.copy(), -1)
                if ai_action is None:
                    # Fallback: minimax found no move (e.g. terminal board);
                    # pick randomly among whatever squares remain.
                    valid_actions = tictactoe_env.get_valid_actions()
                    if not valid_actions:
                        yield *update_board_buttons(), "Game is a draw!", "", stats
                        return
                    ai_action = random.choice(valid_actions)

                # Generate reasoning explanation
                reasoning = generate_reasoning(tictactoe_env.board.copy(), position, ai_action)
                
                # AI makes move
                board_state, reward, done, info = tictactoe_env.step(ai_action)
                
                if done:
                    winner = "You" if tictactoe_env.winner == 1 else "AI" if tictactoe_env.winner == -1 else "Draw"
                    if winner == "You": stats['wins'] += 1
                    elif winner == "AI": stats['losses'] += 1
                    else: stats['draws'] += 1
                    yield *update_board_buttons(), f"Game Over! {winner} won! AI played position {ai_action}.", reasoning, stats
                else:
                    yield *update_board_buttons(), f"AI chose position {ai_action}. Your turn!", reasoning, stats
                    
            except Exception as e:
                # Surface any unexpected error in the status box rather than
                # crashing the Gradio event loop.
                yield *update_board_buttons(), f"Error: {str(e)}", "", stats

        def reset_tictactoe(stats):
            """Reset TicTacToe game.

            Clears the global environment; the stats tally is intentionally
            preserved across games.
            """
            tictactoe_env.reset()
            return *update_board_buttons(), "New game started! You are โŒ (X). Click a square to demonstrate strategic reasoning.", "The AI will explain its strategic decision-making process...", stats
        
        # Initialize the board
        tictactoe_env.reset()
        
        # Game interface
        with gr.Row():
            gr.Markdown("### Strategic TicTacToe")
            gr.Markdown("") # spacer
            ttt_reset_btn = gr.Button("๐Ÿ”„ New Game", variant="secondary", size="sm")
        
        gr.Markdown("**You are โŒ (X)** - The AI uses minimax tree search to demonstrate strategic reasoning")

        # Game board: buttons are appended row-major, so board_buttons[i]
        # corresponds to flat position i used by the environment.
        with gr.Column(elem_classes=["ttt-board"]):
            board_buttons = []
            for i in range(3):
                with gr.Row(elem_classes=["ttt-row"]):
                    for j in range(3):
                        pos = i * 3 + j
                        button = gr.Button("", elem_id=f"ttt-cell-{pos}", size="lg", value="")
                        board_buttons.append(button)

        # Stats display
        with gr.Row():
            ttt_stats_display = gr.Markdown(value="**Wins: 0 | Losses: 0 | Draws: 0**", elem_classes=["ttt-stats"])

        # Game status and AI reasoning
        ttt_message = gr.Textbox(
            label="๐ŸŽฏ Game Status",
            value="Click a square to start! Watch how the AI reasons strategically.",
            lines=2,
            interactive=False
        )
        
        ttt_reasoning = gr.Textbox(
            label="๐Ÿง  AI Strategic Reasoning",
            value="The AI will explain its strategic decision-making process here, demonstrating how reasoning emerges from self-play training in zero-sum games.",
            lines=4,
            interactive=False
        )

        # Event handlers: each cell binds its own position index via a
        # hidden gr.State(i) input, then delegates to play_tictactoe.
        def on_board_click(pos, stats):
            yield from play_tictactoe(pos, stats)

        for i in range(9):
            board_buttons[i].click(
                fn=on_board_click,
                inputs=[gr.State(i), ttt_stats],
                outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
            )
        
        ttt_reset_btn.click(
            fn=reset_tictactoe,
            inputs=[ttt_stats],
            outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
        )
        
        # Update stats display
        ttt_stats.change(
            fn=lambda s: f"**Wins: {s['wins']} | Losses: {s['losses']} | Draws: {s['draws']}**",
            inputs=ttt_stats,
            outputs=ttt_stats_display
        )
        
        # Initialize board display on load
        demo.load(
            fn=lambda stats: (*update_board_buttons(), "Click a square to start! Watch how the AI reasons strategically.", "The AI will explain its strategic decision-making process here, demonstrating how reasoning emerges from self-play training in zero-sum games.", stats),
            inputs=[ttt_stats],
            outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
        )
        
        # Key concepts section
        gr.Markdown("---")
        gr.Markdown("## ๐Ÿง  Key SPIRAL Concepts Demonstrated")
        
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                **๐ŸŽฏ Strategic Reasoning**
                - AI uses minimax tree search
                - Evaluates all possible future moves
                - Chooses optimal strategic actions
                """)
            
            with gr.Column():
                gr.Markdown("""
                **๐Ÿ”„ Self-Play Learning**
                - Strategic patterns emerge from competition
                - Zero-sum games incentivize reasoning
                - Multi-agent interactions develop intelligence
                """)
        
        gr.Markdown("""
        ### About SPIRAL
        
        This demo illustrates key findings from the SPIRAL research:
        
        - **Zero-sum games** like TicTacToe create competitive pressure that incentivizes strategic thinking
        - **Self-play training** allows AI agents to discover optimal strategies through repeated interaction
        - **Multi-turn reasoning** emerges naturally from the need to plan ahead in strategic environments
        - **Tree search algorithms** like minimax demonstrate how strategic reasoning can be formalized and executed
        
        The AI's explanations show how it evaluates different moves, considers future possibilities, and makes strategic decisions - core capabilities that transfer to general reasoning tasks.
        """)
    
    return demo


if __name__ == "__main__":
    # Build the Gradio app and serve it with default launch settings.
    demo = create_interface()
    demo.launch()