Spaces:
Build error
Build error
File size: 16,561 Bytes
b3a9ec4 185e9d2 842d62b b3a9ec4 185e9d2 c59d6c7 842d62b c59d6c7 842d62b 4420646 842d62b 6be63cd 842d62b ee800d8 c59d6c7 a3e1550 a530f7b feb1933 a3e1550 feb1933 a3e1550 feb1933 a530f7b feb1933 a530f7b feb1933 a530f7b feb1933 a3e1550 47b257f a3e1550 842d62b 5c7fb25 842d62b 5c7fb25 842d62b b1670f3 842d62b b1670f3 842d62b b1670f3 842d62b b1670f3 842d62b 4420646 842d62b 5c7fb25 842d62b a530f7b 842d62b 5c7fb25 842d62b 5c7fb25 842d62b 5c7fb25 b1670f3 85310d8 842d62b 85310d8 842d62b c59d6c7 842d62b c59d6c7 842d62b 6be63cd 842d62b 6be63cd 842d62b b3a9ec4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 |
"""
SPIRAL: Interactive Reasoning Game Simulator
Demonstrates key concepts from "Self-Play in Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning"
This simplified demo shows how strategic reasoning emerges from self-play in zero-sum games like TicTacToe.
"""
import gradio as gr
import numpy as np
import random
class TicTacToeEnv:
    """Minimal TicTacToe environment for the SPIRAL demonstration.

    The 3x3 board holds 1 (X, the human), -1 (O, the AI) or 0 (empty);
    actions are flat indices 0-8 in row-major order.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the board to the initial state and return a copy of it."""
        self.board = np.zeros((3, 3), dtype=np.int8)
        self.current_player = 1  # Player 1 (X) always moves first
        self.game_over = False
        self.winner = None
        self.move_count = 0
        return self.board.copy()

    def step(self, action):
        """Apply `action` for the side to move.

        Returns (board_copy, reward, done, info) where reward is from the
        mover's perspective: +1 for a win, -1 for an illegal move, 0
        otherwise. An illegal move reports done=True with
        info["invalid_move"] set, but leaves the game state untouched.
        """
        if self.game_over:
            return self.board.copy(), 0, True, {}
        r, c = divmod(action, 3)
        if self.board[r, c] != 0:
            # Occupied square: flag it and change nothing.
            return self.board.copy(), -1, True, {"invalid_move": True}
        self.board[r, c] = self.current_player
        self.move_count += 1
        victor = self._check_winner()
        if victor is not None:
            self.game_over = True
            self.winner = victor
            reward = 1 if victor == self.current_player else -1
            return self.board.copy(), reward, True, {}
        if self.move_count >= 9:
            # Board full with no winner: draw.
            self.game_over = True
            return self.board.copy(), 0, True, {}
        self.current_player *= -1  # hand the turn to the other side
        return self.board.copy(), 0, False, {}

    def _check_winner(self):
        """Return the winning mark (1 or -1) if any line is complete, else None."""
        b = self.board
        # Three identical non-zero marks sum to +/-3, so |sum| == 3 on any
        # row, column or diagonal identifies a winner.
        lines = [b[i, :] for i in range(3)]
        lines += [b[:, j] for j in range(3)]
        lines.append(b.diagonal())
        lines.append(np.fliplr(b).diagonal())
        for line in lines:
            if abs(line.sum()) == 3:
                return line[0]
        return None

    def get_valid_actions(self):
        """Return the flat indices of all currently empty squares."""
        return [a for a in range(9) if self.board[a // 3, a % 3] == 0]
# Global game environment
# Module-level singleton read and mutated by all the Gradio callbacks below.
# NOTE(review): state is process-global, so concurrent visitors would share
# one board — presumably acceptable for a single-user demo; confirm.
tictactoe_env = TicTacToeEnv()
def check_winner(board):
    """Return the winning mark (1 or -1) on `board`, or None if no line is complete.

    Three identical non-zero marks on a line sum to +/-3, so the absolute
    row/column/diagonal sums identify a winner.
    """
    # Check rows
    for row in range(3):
        if abs(board[row, :].sum()) == 3:
            return board[row, 0]
    # Check columns
    for col in range(3):
        if abs(board[:, col].sum()) == 3:
            return board[0, col]
    # Check diagonals
    if abs(board.diagonal().sum()) == 3:
        return board[0, 0]
    if abs(np.fliplr(board).diagonal().sum()) == 3:
        return board[0, 2]
    return None


def get_valid_moves(board):
    """Return flat indices (0-8, row-major) of all empty cells on `board`."""
    return [i for i in range(9) if board[i // 3, i % 3] == 0]


def minimax(board, player, depth=0):
    """Minimax tree search with `player` to move; returns (score, best_move).

    Scores are from the AI's (-1) perspective: 10 - depth for an AI win,
    -10 + depth for a human win, 0 for a draw — the depth term prefers
    faster wins and slower losses. `best_move` is None at terminal nodes.

    The board is searched in place via make/undo, so it is returned to the
    caller unchanged.
    """
    # Terminal positions.
    winner = check_winner(board)
    if winner == 1:  # Human wins
        return -10 + depth, None
    elif winner == -1:  # AI wins
        return 10 - depth, None
    moves = get_valid_moves(board)
    if not moves:  # Board full, nobody won: draw
        return 0, None
    best_move = None
    if player == -1:  # AI is the maximizing player
        best_score = -float('inf')
        for move in moves:
            row, col = divmod(move, 3)
            board[row, col] = -1
            # Fix: recurse on the same array and undo afterwards. The
            # original both copied the board AND undid the move — double
            # work (a fresh 3x3 copy per node) for the same result.
            score, _ = minimax(board, 1, depth + 1)
            board[row, col] = 0  # Undo move
            if score > best_score:
                best_score, best_move = score, move
    else:  # Human is the minimizing player
        best_score = float('inf')
        for move in moves:
            row, col = divmod(move, 3)
            board[row, col] = 1
            score, _ = minimax(board, -1, depth + 1)
            board[row, col] = 0  # Undo move
            if score < best_score:
                best_score, best_move = score, move
    return best_score, best_move
def generate_reasoning(board_state, human_move, ai_move):
    """Build a natural-language explanation for the AI's chosen move.

    Picks one of four canned narratives at random, filled in with the
    human's move, the AI's move, and the current branching factor.
    """
    n_responses = len(get_valid_moves(board_state))
    templates = (
        f"I analyzed all possible moves from the current position. After you played position {human_move}, I considered {n_responses} possible responses. Using minimax tree search, I determined that position {ai_move} gives me the best strategic advantage.",
        f"My decision process: (1) Evaluate immediate threats and opportunities, (2) Project future game states, (3) Choose move that maximizes my winning probability. Position {ai_move} emerged as optimal after analyzing the full game tree.",
        f"Strategic analysis: Your move at {human_move} created a new board configuration. I used recursive tree search to evaluate all possible future sequences. Position {ai_move} either creates a winning opportunity or blocks your potential victories.",
        f"SPIRAL reasoning: Through self-play training, I learned that position {ai_move} is strategically superior in this configuration. This demonstrates how strategic reasoning emerges from multi-agent interaction in zero-sum games.",
    )
    return random.choice(templates)
def create_interface():
    """Create the main Gradio interface.

    Builds a gr.Blocks app: a 3x3 grid of buttons for the board, a status
    textbox, an AI-reasoning textbox, a per-session win/loss/draw counter
    held in gr.State, and explanatory markdown. Returns the (un-launched)
    Blocks instance.
    """
    # Custom CSS to style the TicTacToe board
    css = """
    .ttt-board {
        display: flex;
        flex-direction: column;
        align-items: center;
        max-width: 300px;
        margin: 0 auto;
    }
    .ttt-board > div {
        display: flex;
        flex-direction: row;
        justify-content: center;
        gap: 8px;
        margin: 4px 0;
    }
    .ttt-board button {
        width: 80px !important;
        height: 80px !important;
        min-width: 80px !important;
        min-height: 80px !important;
        max-width: 80px !important;
        max-height: 80px !important;
        font-size: 24px !important;
        font-weight: bold !important;
        border: 2px solid #374151 !important;
        border-radius: 8px !important;
        background: #1f2937 !important;
        color: white !important;
        display: flex !important;
        align-items: center !important;
        justify-content: center !important;
    }
    .ttt-board button:hover {
        background: #374151 !important;
        border-color: #6b7280 !important;
    }
    .ttt-board button:disabled {
        opacity: 0.8 !important;
        cursor: not-allowed !important;
    }
    .ttt-stats {
        text-align: center !important;
        margin: 20px 0 !important;
        font-size: 16px !important;
    }
    .ttt-stats p {
        margin: 0 !important;
        color: #9ca3af !important;
    }
    """
    with gr.Blocks(title="SPIRAL: Self-Play Reasoning Demo", theme=gr.themes.Soft(), css=css) as demo:
        gr.Markdown("# ๐ฎ SPIRAL: Self-Play Reasoning Demo")
        gr.Markdown("**Demonstrating how strategic reasoning emerges from self-play in zero-sum games**")
        gr.Markdown("*Based on: \"Self-Play in Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning\"*")

        def update_board_buttons():
            """Create a list of gr.Button updates from the current board state.

            One update per cell, in row-major order; occupied cells (and all
            cells once the game is over) are made non-interactive.
            """
            updates = []
            for i in range(9):
                row, col = divmod(i, 3)
                cell = tictactoe_env.board[row, col]
                val = ""
                interactive = True
                if cell == 1:
                    val = 'โ'
                    interactive = False
                elif cell == -1:
                    val = 'โญ'
                    interactive = False
                if tictactoe_env.game_over:
                    interactive = False
                updates.append(gr.Button(value=val, interactive=interactive))
            return updates

        # Per-session scoreboard; mutated in place by play_tictactoe.
        ttt_stats = gr.State({'wins': 0, 'losses': 0, 'draws': 0})

        def play_tictactoe(position, stats):
            """Play a TicTacToe move and demonstrate AI reasoning.

            Generator event handler: each yield pushes an intermediate UI
            state (9 button updates, status text, reasoning text, stats).
            """
            if tictactoe_env.game_over:
                yield *update_board_buttons(), "Game is over! Click 'New Game' to start again.", "", stats
                return
            try:
                position = int(position)
                # Human move
                board_state, reward, done, info = tictactoe_env.step(position)
                if done:
                    if info.get("invalid_move"):
                        # Invalid move: env reports done=True but the game
                        # actually continues, so just re-render and bail out.
                        yield *update_board_buttons(), "Invalid move! Try again.", "", stats
                        return
                    # NOTE(review): a drawn game renders as "Draw won!" —
                    # cosmetic only.
                    winner = "You" if tictactoe_env.winner == 1 else "AI" if tictactoe_env.winner == -1 else "Draw"
                    if winner == "You": stats['wins'] += 1
                    elif winner == "AI": stats['losses'] += 1
                    else: stats['draws'] += 1
                    yield *update_board_buttons(), f"Game Over! {winner} won!", "", stats
                    return
                # Show AI thinking (intermediate yield before the search runs)
                yield *update_board_buttons(), "AI is analyzing the game tree...", "๐ง Strategic reasoning in progress...", stats
                # AI move using minimax
                _, ai_action = minimax(tictactoe_env.board.copy(), -1)
                if ai_action is None:
                    # Minimax found no move; fall back to a random legal one.
                    valid_actions = tictactoe_env.get_valid_actions()
                    if not valid_actions:
                        yield *update_board_buttons(), "Game is a draw!", "", stats
                        return
                    ai_action = random.choice(valid_actions)
                # Generate reasoning explanation
                reasoning = generate_reasoning(tictactoe_env.board.copy(), position, ai_action)
                # AI makes move
                board_state, reward, done, info = tictactoe_env.step(ai_action)
                if done:
                    winner = "You" if tictactoe_env.winner == 1 else "AI" if tictactoe_env.winner == -1 else "Draw"
                    if winner == "You": stats['wins'] += 1
                    elif winner == "AI": stats['losses'] += 1
                    else: stats['draws'] += 1
                    yield *update_board_buttons(), f"Game Over! {winner} won! AI played position {ai_action}.", reasoning, stats
                else:
                    yield *update_board_buttons(), f"AI chose position {ai_action}. Your turn!", reasoning, stats
            except Exception as e:
                # Catch-all so a handler error surfaces in the UI instead of
                # silently killing the event.
                yield *update_board_buttons(), f"Error: {str(e)}", "", stats

        def reset_tictactoe(stats):
            """Reset TicTacToe game and re-render an empty board."""
            tictactoe_env.reset()
            return *update_board_buttons(), "New game started! You are โ (X). Click a square to demonstrate strategic reasoning.", "The AI will explain its strategic decision-making process...", stats

        # Initialize the board
        tictactoe_env.reset()
        # Game interface: header row with title and the New Game button.
        with gr.Row():
            gr.Markdown("### Strategic TicTacToe")
            gr.Markdown("") # spacer
            ttt_reset_btn = gr.Button("๐ New Game", variant="secondary", size="sm")
        gr.Markdown("**You are โ (X)** - The AI uses minimax tree search to demonstrate strategic reasoning")
        # Game board: 3 rows x 3 buttons, flattened into board_buttons[0..8].
        with gr.Column(elem_classes=["ttt-board"]):
            board_buttons = []
            for i in range(3):
                with gr.Row(elem_classes=["ttt-row"]):
                    for j in range(3):
                        pos = i * 3 + j
                        button = gr.Button("", elem_id=f"ttt-cell-{pos}", size="lg", value="")
                        board_buttons.append(button)
        # Stats display
        with gr.Row():
            ttt_stats_display = gr.Markdown(value="**Wins: 0 | Losses: 0 | Draws: 0**", elem_classes=["ttt-stats"])
        # Game status and AI reasoning
        ttt_message = gr.Textbox(
            label="๐ฏ Game Status",
            value="Click a square to start! Watch how the AI reasons strategically.",
            lines=2,
            interactive=False
        )
        ttt_reasoning = gr.Textbox(
            label="๐ง AI Strategic Reasoning",
            value="The AI will explain its strategic decision-making process here, demonstrating how reasoning emerges from self-play training in zero-sum games.",
            lines=4,
            interactive=False
        )
        # Event handlers: every cell click routes through play_tictactoe with
        # its fixed position baked in via gr.State(i).
        def on_board_click(pos, stats):
            yield from play_tictactoe(pos, stats)
        for i in range(9):
            board_buttons[i].click(
                fn=on_board_click,
                inputs=[gr.State(i), ttt_stats],
                outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
            )
        ttt_reset_btn.click(
            fn=reset_tictactoe,
            inputs=[ttt_stats],
            outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
        )
        # Update stats display whenever the stats State changes.
        ttt_stats.change(
            fn=lambda s: f"**Wins: {s['wins']} | Losses: {s['losses']} | Draws: {s['draws']}**",
            inputs=ttt_stats,
            outputs=ttt_stats_display
        )
        # Initialize board display on load
        demo.load(
            fn=lambda stats: (*update_board_buttons(), "Click a square to start! Watch how the AI reasons strategically.", "The AI will explain its strategic decision-making process here, demonstrating how reasoning emerges from self-play training in zero-sum games.", stats),
            inputs=[ttt_stats],
            outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
        )
        # Key concepts section
        gr.Markdown("---")
        gr.Markdown("## ๐ง Key SPIRAL Concepts Demonstrated")
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                **๐ฏ Strategic Reasoning**
                - AI uses minimax tree search
                - Evaluates all possible future moves
                - Chooses optimal strategic actions
                """)
            with gr.Column():
                gr.Markdown("""
                **๐ Self-Play Learning**
                - Strategic patterns emerge from competition
                - Zero-sum games incentivize reasoning
                - Multi-agent interactions develop intelligence
                """)
        gr.Markdown("""
        ### About SPIRAL
        This demo illustrates key findings from the SPIRAL research:
        - **Zero-sum games** like TicTacToe create competitive pressure that incentivizes strategic thinking
        - **Self-play training** allows AI agents to discover optimal strategies through repeated interaction
        - **Multi-turn reasoning** emerges naturally from the need to plan ahead in strategic environments
        - **Tree search algorithms** like minimax demonstrate how strategic reasoning can be formalized and executed
        The AI's explanations show how it evaluates different moves, considers future possibilities, and makes strategic decisions - core capabilities that transfer to general reasoning tasks.
        """)
    return demo
if __name__ == "__main__":
    # Build the Gradio app and start the local web server (blocking call).
    demo = create_interface()
    demo.launch()
|