# Author: Kaushik Rajan
# Commit: Simplify codebase: focused SPIRAL TicTacToe demo with key research concepts
# Revision: 842d62b
# (repository page residue: raw / history / blame links, file size 16.6 kB)
"""
SPIRAL: Interactive Reasoning Game Simulator
Demonstrates key concepts from "Self-Play in Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning"
This simplified demo shows how strategic reasoning emerges from self-play in zero-sum games like TicTacToe.
"""
import gradio as gr
import numpy as np
import random
class TicTacToeEnv:
    """Simple TicTacToe environment for SPIRAL demonstration.

    The board is a 3x3 ``int8`` array where ``0`` marks an empty cell, ``1``
    marks player 1 (X) and ``-1`` marks the opponent (O). Actions are flat
    cell indices ``0``-``8`` in row-major order (``row, col = divmod(a, 3)``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Reset the game to its initial state and return a copy of the board."""
        self.board = np.zeros((3, 3), dtype=np.int8)
        self.current_player = 1  # Player 1 starts (X)
        self.game_over = False
        self.winner = None
        self.move_count = 0
        return self.board.copy()

    def step(self, action):
        """Execute one move for the current player.

        Args:
            action: Flat cell index in ``range(9)``.

        Returns:
            A ``(board_copy, reward, done, info)`` tuple.

            * Game already over: ``(board, 0, True, {})``; nothing changes.
            * Invalid move (index outside ``0``-``8`` or cell occupied):
              ``(board, -1, True, {"invalid_move": True})``. The board,
              the player to move and ``game_over`` are left untouched, so
              callers must inspect ``info`` to tell this apart from a real
              end of game.
            * Winning move: reward ``1`` and ``done=True``.
            * Ninth move without a winner (draw): reward ``0``, ``done=True``.
            * Otherwise: reward ``0``, ``done=False`` and the turn passes to
              the other player.
        """
        if self.game_over:
            return self.board.copy(), 0, True, {}

        # Reject out-of-range indices explicitly: a negative action would
        # otherwise wrap around through negative indexing and silently play
        # a different cell, and an action >= 9 would raise IndexError.
        if not 0 <= action < 9:
            return self.board.copy(), -1, True, {"invalid_move": True}

        row, col = divmod(action, 3)
        if self.board[row, col] != 0:
            return self.board.copy(), -1, True, {"invalid_move": True}

        # Make the move
        self.board[row, col] = self.current_player
        self.move_count += 1

        winner = self._check_winner()
        if winner is not None:
            self.game_over = True
            self.winner = winner
            reward = 1 if winner == self.current_player else -1
            return self.board.copy(), reward, True, {}

        if self.move_count >= 9:
            # Board is full and nobody completed a line: draw.
            self.game_over = True
            return self.board.copy(), 0, True, {}

        # Game continues with the other player.
        self.current_player *= -1
        return self.board.copy(), 0, False, {}

    def _check_winner(self):
        """Return ``1`` or ``-1`` if that player owns a full line, else ``None``.

        A line is complete when its three cells sum to ``+3`` or ``-3``.
        Rows are checked first, then columns, then the two diagonals.
        """
        lines = [self.board[i, :] for i in range(3)]
        lines += [self.board[:, i] for i in range(3)]
        lines.append(self.board.diagonal())
        lines.append(np.fliplr(self.board).diagonal())
        for line in lines:
            if abs(line.sum()) == 3:
                # Plain int so no NumPy scalar leaks into ``self.winner``.
                return int(line[0])
        return None

    def get_valid_actions(self):
        """Get list of valid actions (flat indices of the empty cells)."""
        return [i for i in range(9) if self.board[i // 3, i % 3] == 0]
# Global game environment: one module-level TicTacToeEnv instance that the
# Gradio callbacks built in create_interface() read and mutate directly.
tictactoe_env = TicTacToeEnv()
def check_winner(board):
    """Return the mark owning a completed line on ``board``, or ``None``.

    A line counts as completed when its three cells sum to +3 or -3. Lines
    are examined in a fixed order: the three rows, the three columns, the
    main diagonal and finally the anti-diagonal. The value returned is the
    first cell of the first completed line found.
    """
    def _candidate_lines():
        # Produced lazily so the scan stops at the first completed line.
        for idx in range(3):
            yield board[idx, :]
        for idx in range(3):
            yield board[:, idx]
        yield board.diagonal()
        yield np.fliplr(board).diagonal()

    for line in _candidate_lines():
        if abs(line.sum()) == 3:
            return line[0]
    return None
def get_valid_moves(board):
    """Return the flat indices (0-8, row-major) of every empty cell on ``board``."""
    return [cell for cell in range(9) if board[cell // 3, cell % 3] == 0]
def minimax(board, player, depth=0):
    """Minimax algorithm - demonstrates strategic reasoning.

    Exhaustively searches the game tree below ``board``.

    Args:
        board: 3x3 array with ``1`` for the human, ``-1`` for the AI and
            ``0`` for empty cells. It is modified in place while searching,
            but every trial move is undone before returning.
        player: Side to move. ``-1`` (the AI) maximizes the score; any other
            value is treated as the human, who minimizes it.
        depth: Number of plies already played below the search root. It is
            used to prefer faster wins and slower losses.

    Returns:
        ``(score, move)``. ``score`` is ``10 - depth`` for an AI win,
        ``-10 + depth`` for a human win and ``0`` for a draw. ``move`` is
        the best flat cell index for ``player``, or ``None`` when the
        position is already terminal. Ties are resolved in favour of the
        lowest cell index.
    """
    # Base cases
    winner = check_winner(board)
    if winner == 1:  # Human wins
        return -10 + depth, None
    if winner == -1:  # AI wins
        return 10 - depth, None

    moves = get_valid_moves(board)
    if not moves:  # Draw
        return 0, None

    maximizing = player == -1
    mark = -1 if maximizing else 1
    opponent = 1 if maximizing else -1
    best_score = -float('inf') if maximizing else float('inf')
    best_move = None

    for move in moves:
        row, col = divmod(move, 3)
        # Play the move on the shared board and undo it afterwards. No copy
        # is needed because every level restores the cell it touched.
        board[row, col] = mark
        score, _ = minimax(board, opponent, depth + 1)
        board[row, col] = 0  # Undo move

        improved = score > best_score if maximizing else score < best_score
        if improved:
            best_score = score
            best_move = move

    return best_score, best_move
def generate_reasoning(board_state, human_move, ai_move):
    """Return one randomly chosen canned explanation for the AI's move.

    Args:
        board_state: Board used to count the empty cells quoted in the text.
        human_move: Cell index the human just played.
        ai_move: Cell index the AI selected.
    """
    open_cells = len(get_valid_moves(board_state))
    explanations = (
        f"I analyzed all possible moves from the current position. After you played position {human_move}, I considered {open_cells} possible responses. Using minimax tree search, I determined that position {ai_move} gives me the best strategic advantage.",
        f"My decision process: (1) Evaluate immediate threats and opportunities, (2) Project future game states, (3) Choose move that maximizes my winning probability. Position {ai_move} emerged as optimal after analyzing the full game tree.",
        f"Strategic analysis: Your move at {human_move} created a new board configuration. I used recursive tree search to evaluate all possible future sequences. Position {ai_move} either creates a winning opportunity or blocks your potential victories.",
        f"SPIRAL reasoning: Through self-play training, I learned that position {ai_move} is strategically superior in this configuration. This demonstrates how strategic reasoning emerges from multi-agent interaction in zero-sum games.",
    )
    return random.choice(explanations)
def create_interface():
    """Create the main Gradio interface.

    Builds a ``gr.Blocks`` app containing the 3x3 board of buttons, the
    win/loss/draw line, the status and reasoning text boxes and the
    explanatory Markdown, and wires the click handlers. All handlers read
    and mutate the module-level ``tictactoe_env``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    # Custom CSS to style the TicTacToe board
    css = """
.ttt-board {
display: flex;
flex-direction: column;
align-items: center;
max-width: 300px;
margin: 0 auto;
}
.ttt-board > div {
display: flex;
flex-direction: row;
justify-content: center;
gap: 8px;
margin: 4px 0;
}
.ttt-board button {
width: 80px !important;
height: 80px !important;
min-width: 80px !important;
min-height: 80px !important;
max-width: 80px !important;
max-height: 80px !important;
font-size: 24px !important;
font-weight: bold !important;
border: 2px solid #374151 !important;
border-radius: 8px !important;
background: #1f2937 !important;
color: white !important;
display: flex !important;
align-items: center !important;
justify-content: center !important;
}
.ttt-board button:hover {
background: #374151 !important;
border-color: #6b7280 !important;
}
.ttt-board button:disabled {
opacity: 0.8 !important;
cursor: not-allowed !important;
}
.ttt-stats {
text-align: center !important;
margin: 20px 0 !important;
font-size: 16px !important;
}
.ttt-stats p {
margin: 0 !important;
color: #9ca3af !important;
}
"""
    # Placeholder texts shown both when the widgets are created and on load.
    start_message = "Click a square to start! Watch how the AI reasons strategically."
    reasoning_placeholder = "The AI will explain its strategic decision-making process here, demonstrating how reasoning emerges from self-play training in zero-sum games."

    with gr.Blocks(title="SPIRAL: Self-Play Reasoning Demo", theme=gr.themes.Soft(), css=css) as demo:
        gr.Markdown("# 🎮 SPIRAL: Self-Play Reasoning Demo")
        gr.Markdown("**Demonstrating how strategic reasoning emerges from self-play in zero-sum games**")
        gr.Markdown("*Based on: \"Self-Play in Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning\"*")

        def update_board_buttons():
            """Create a list of gr.Button updates from the current board state."""
            symbols = {1: '❌', -1: '⭕'}
            updates = []
            for i in range(9):
                row, col = divmod(i, 3)
                cell = int(tictactoe_env.board[row, col])
                # A cell is clickable only while it is empty and the game
                # is still running.
                interactive = cell not in symbols and not tictactoe_env.game_over
                updates.append(gr.Button(value=symbols.get(cell, ""), interactive=interactive))
            return updates

        ttt_stats = gr.State({'wins': 0, 'losses': 0, 'draws': 0})

        def record_outcome(stats):
            """Count the finished game in ``stats`` and return its headline.

            Reads ``tictactoe_env.winner``: ``1`` is the human, ``-1`` the
            AI and anything else is counted as a draw.
            """
            if tictactoe_env.winner == 1:
                stats['wins'] += 1
                return "Game Over! You won!"
            if tictactoe_env.winner == -1:
                stats['losses'] += 1
                return "Game Over! AI won!"
            stats['draws'] += 1
            # A draw has no winner, so do not phrase it as "<x> won!".
            return "Game Over! It's a draw!"

        def play_tictactoe(position, stats):
            """Play a TicTacToe move and demonstrate AI reasoning.

            Generator handler: each yield is ``(*9 button updates, status
            message, reasoning text, stats)``. An intermediate "thinking"
            update is yielded before the AI's reply.
            """
            if tictactoe_env.game_over:
                yield *update_board_buttons(), "Game is over! Click 'New Game' to start again.", "", stats
                return

            # Broad catch on purpose: this is the UI boundary, and any
            # failure is reported in the status box instead of being lost.
            try:
                position = int(position)

                # Human move
                _, _, done, info = tictactoe_env.step(position)
                if done:
                    if info.get("invalid_move"):
                        yield *update_board_buttons(), "Invalid move! Try again.", "", stats
                        return
                    headline = record_outcome(stats)
                    yield *update_board_buttons(), headline, "", stats
                    return

                # Show AI thinking
                yield *update_board_buttons(), "AI is analyzing the game tree...", "🧠 Strategic reasoning in progress...", stats

                # AI move using minimax
                _, ai_action = minimax(tictactoe_env.board.copy(), -1)
                if ai_action is None:
                    # Defensive fallback in case the search returns no move.
                    valid_actions = tictactoe_env.get_valid_actions()
                    if not valid_actions:
                        yield *update_board_buttons(), "Game is a draw!", "", stats
                        return
                    ai_action = random.choice(valid_actions)

                # Generate reasoning explanation (before the AI move is
                # applied, so it describes the position the AI faced).
                reasoning = generate_reasoning(tictactoe_env.board.copy(), position, ai_action)

                # AI makes move
                _, _, done, info = tictactoe_env.step(ai_action)
                if done:
                    headline = record_outcome(stats)
                    yield *update_board_buttons(), f"{headline} AI played position {ai_action}.", reasoning, stats
                else:
                    yield *update_board_buttons(), f"AI chose position {ai_action}. Your turn!", reasoning, stats
            except Exception as e:
                yield *update_board_buttons(), f"Error: {str(e)}", "", stats

        def reset_tictactoe(stats):
            """Reset TicTacToe game; the win/loss/draw counters are kept."""
            tictactoe_env.reset()
            return *update_board_buttons(), "New game started! You are ❌ (X). Click a square to demonstrate strategic reasoning.", "The AI will explain its strategic decision-making process...", stats

        # Initialize the board
        tictactoe_env.reset()

        # Game interface
        with gr.Row():
            gr.Markdown("### Strategic TicTacToe")
            gr.Markdown("")  # spacer
            ttt_reset_btn = gr.Button("🔄 New Game", variant="secondary", size="sm")
        gr.Markdown("**You are ❌ (X)** - The AI uses minimax tree search to demonstrate strategic reasoning")

        # Game board
        with gr.Column(elem_classes=["ttt-board"]):
            board_buttons = []
            for i in range(3):
                with gr.Row(elem_classes=["ttt-row"]):
                    for j in range(3):
                        pos = i * 3 + j
                        # ``value`` is passed exactly once; it used to be
                        # given both positionally and by keyword.
                        button = gr.Button(value="", elem_id=f"ttt-cell-{pos}", size="lg")
                        board_buttons.append(button)

        # Stats display
        with gr.Row():
            ttt_stats_display = gr.Markdown(value="**Wins: 0 | Losses: 0 | Draws: 0**", elem_classes=["ttt-stats"])

        # Game status and AI reasoning
        ttt_message = gr.Textbox(
            label="🎯 Game Status",
            value=start_message,
            lines=2,
            interactive=False
        )
        ttt_reasoning = gr.Textbox(
            label="🧠 AI Strategic Reasoning",
            value=reasoning_placeholder,
            lines=4,
            interactive=False
        )

        # Event handlers
        def on_board_click(pos, stats):
            yield from play_tictactoe(pos, stats)

        for i in range(9):
            # Each button passes its own cell index through a constant State.
            board_buttons[i].click(
                fn=on_board_click,
                inputs=[gr.State(i), ttt_stats],
                outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
            )

        ttt_reset_btn.click(
            fn=reset_tictactoe,
            inputs=[ttt_stats],
            outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
        )

        # Update stats display
        ttt_stats.change(
            fn=lambda s: f"**Wins: {s['wins']} | Losses: {s['losses']} | Draws: {s['draws']}**",
            inputs=ttt_stats,
            outputs=ttt_stats_display
        )

        # Initialize board display on load
        demo.load(
            fn=lambda stats: (*update_board_buttons(), start_message, reasoning_placeholder, stats),
            inputs=[ttt_stats],
            outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
        )

        # Key concepts section
        gr.Markdown("---")
        gr.Markdown("## 🧠 Key SPIRAL Concepts Demonstrated")
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
**🎯 Strategic Reasoning**
- AI uses minimax tree search
- Evaluates all possible future moves
- Chooses optimal strategic actions
""")
            with gr.Column():
                gr.Markdown("""
**🔄 Self-Play Learning**
- Strategic patterns emerge from competition
- Zero-sum games incentivize reasoning
- Multi-agent interactions develop intelligence
""")
        gr.Markdown("""
### About SPIRAL
This demo illustrates key findings from the SPIRAL research:
- **Zero-sum games** like TicTacToe create competitive pressure that incentivizes strategic thinking
- **Self-play training** allows AI agents to discover optimal strategies through repeated interaction
- **Multi-turn reasoning** emerges naturally from the need to plan ahead in strategic environments
- **Tree search algorithms** like minimax demonstrate how strategic reasoning can be formalized and executed
The AI's explanations show how it evaluates different moves, considers future possibilities, and makes strategic decisions - core capabilities that transfer to general reasoning tasks.
""")
    return demo
# Script entry point: build the interface and launch it only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()