File size: 16,561 Bytes
b3a9ec4
 
185e9d2
842d62b
 
 
b3a9ec4
185e9d2
c59d6c7
 
 
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c59d6c7
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4420646
842d62b
6be63cd
842d62b
 
 
 
 
 
ee800d8
 
c59d6c7
 
 
a3e1550
 
a530f7b
feb1933
 
 
a3e1550
feb1933
a3e1550
feb1933
 
 
 
 
 
 
 
 
a530f7b
 
feb1933
 
 
 
 
 
 
 
 
 
 
 
a530f7b
feb1933
 
 
a530f7b
feb1933
 
 
a3e1550
47b257f
 
 
 
 
 
 
 
 
a3e1550
 
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c7fb25
842d62b
 
5c7fb25
842d62b
 
 
 
 
 
 
 
 
 
b1670f3
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
b1670f3
 
842d62b
 
 
 
 
 
 
 
 
b1670f3
842d62b
b1670f3
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
4420646
842d62b
 
5c7fb25
842d62b
 
a530f7b
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c7fb25
842d62b
 
 
5c7fb25
842d62b
 
 
 
5c7fb25
b1670f3
85310d8
842d62b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85310d8
842d62b
 
 
 
 
 
 
c59d6c7
842d62b
 
c59d6c7
842d62b
 
 
 
 
 
 
 
 
6be63cd
842d62b
6be63cd
 
842d62b
b3a9ec4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
"""
SPIRAL: Interactive Reasoning Game Simulator

Demonstrates key concepts from "Self-Play in Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning"

This simplified demo shows how strategic reasoning emerges from self-play in zero-sum games like TicTacToe.
"""

import gradio as gr
import numpy as np
import random


class TicTacToeEnv:
    """Minimal TicTacToe environment used by the SPIRAL demo.

    Board cells hold 1 (X, the human), -1 (O, the AI) or 0 (empty).
    The class follows a gym-like ``reset``/``step`` API and tracks whose
    turn it is, whether the game has ended, and who won.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Start a fresh game and return a copy of the empty board."""
        self.board = np.zeros((3, 3), dtype=np.int8)
        self.current_player = 1  # X (human) always moves first
        self.game_over = False
        self.winner = None
        self.move_count = 0
        return self.board.copy()

    def step(self, action):
        """Apply *action* (flat index 0-8, row-major) for the current player.

        Returns ``(board_copy, reward, done, info)``.  Playing an occupied
        square terminates the episode with reward -1 and sets
        ``info["invalid_move"]`` so the caller can distinguish it from a
        real game-over.
        """
        if self.game_over:
            # Ignore moves after the game has ended.
            return self.board.copy(), 0, True, {}

        row, col = divmod(action, 3)

        if self.board[row, col] != 0:
            # Occupied square: penalise and signal termination via info,
            # without flipping game_over (matches the original contract).
            return self.board.copy(), -1, True, {"invalid_move": True}

        self.board[row, col] = self.current_player
        self.move_count += 1

        winner = self._check_winner()
        if winner is not None:
            self.game_over = True
            self.winner = winner
            # Only the player who just moved can have won, so from the
            # mover's perspective this is always +1.
            reward = 1 if winner == self.current_player else -1
            return self.board.copy(), reward, True, {}
        if self.move_count >= 9:
            # Board is full with no winner: draw.
            self.game_over = True
            return self.board.copy(), 0, True, {}

        # Game continues; hand the turn to the other player.
        self.current_player *= -1
        return self.board.copy(), 0, False, {}

    def _check_winner(self):
        """Return 1 or -1 if that player holds a full line, else None."""
        b = self.board
        # All eight winning lines: three rows, three columns, two diagonals.
        lines = [b[r, :] for r in range(3)]
        lines += [b[:, c] for c in range(3)]
        lines.append(b.diagonal())
        lines.append(np.fliplr(b).diagonal())
        for line in lines:
            # A sum of +/-3 means all three cells belong to one player.
            if abs(int(line.sum())) == 3:
                return line[0]
        return None

    def get_valid_actions(self):
        """Return the flat indices (0-8) of all empty cells, ascending."""
        flat = self.board.reshape(-1)
        return [i for i in range(9) if flat[i] == 0]


# Global game environment shared by every Gradio callback below.  This is a
# single-session demo: concurrent visitors would share this one board.
tictactoe_env = TicTacToeEnv()


def check_winner(board):
    """Return 1 or -1 if that player owns a complete line on *board*, else None.

    Module-level twin of ``TicTacToeEnv._check_winner`` so the minimax
    search can evaluate arbitrary board arrays.
    """
    # Pair each of the eight winning lines with the cell whose value is
    # reported as the winner, preserving the original check order.
    candidates = (
        [(board[r, :], board[r, 0]) for r in range(3)]
        + [(board[:, c], board[0, c]) for c in range(3)]
        + [(board.diagonal(), board[0, 0]),
           (np.fliplr(board).diagonal(), board[0, 2])]
    )
    for line, owner in candidates:
        # |sum| == 3 means the same player fills all three cells.
        if abs(int(line.sum())) == 3:
            return owner
    return None


def get_valid_moves(board):
    """Return the flat indices (0-8) of all empty cells on *board*, ascending."""
    flat = board.reshape(-1)
    return [idx for idx in range(9) if flat[idx] == 0]


def minimax(board, player, depth=0):
    """Exhaustive minimax search over the TicTacToe game tree.

    Args:
        board: 3x3 numpy int array (1 = human/X, -1 = AI/O, 0 = empty).
            Used as in-place scratch space during the search, but every
            trial move is undone, so the array is unchanged on return.
        player: whose turn it is (-1 = AI, maximizing; 1 = human, minimizing).
        depth: current recursion depth; shades terminal scores so the AI
            prefers the quickest win and the latest possible loss.

    Returns:
        (best_score, best_move) where best_move is a flat index 0-8, or
        None at a terminal position (win/loss/draw already decided).
    """
    # Base cases: score terminal positions from the AI's perspective.
    winner = check_winner(board)
    if winner == 1:  # Human wins
        return -10 + depth, None
    elif winner == -1:  # AI wins
        return 10 - depth, None
    elif len(get_valid_moves(board)) == 0:  # Draw
        return 0, None

    best_move = None
    if player == -1:  # AI is maximizing player
        best_score = -float('inf')
        for move in get_valid_moves(board):
            row, col = divmod(move, 3)
            board[row, col] = -1
            # Recurse on the same array: the undo below restores it, so the
            # per-node board.copy() the original made was pure overhead.
            score, _ = minimax(board, 1, depth + 1)
            board[row, col] = 0  # Undo move
            if score > best_score:
                best_score = score
                best_move = move
    else:  # Human is minimizing player
        best_score = float('inf')
        for move in get_valid_moves(board):
            row, col = divmod(move, 3)
            board[row, col] = 1
            score, _ = minimax(board, -1, depth + 1)
            board[row, col] = 0  # Undo move
            if score < best_score:
                best_score = score
                best_move = move

    return best_score, best_move


def generate_reasoning(board_state, human_move, ai_move):
    """Pick one of four canned natural-language explanations for *ai_move*.

    The texts are templated on the human's last move, the AI's chosen
    move, and the number of legal replies on *board_state*; one is
    selected uniformly at random.
    """
    n_responses = len(get_valid_moves(board_state))
    options = (
        f"I analyzed all possible moves from the current position. After you played position {human_move}, I considered {n_responses} possible responses. Using minimax tree search, I determined that position {ai_move} gives me the best strategic advantage.",

        f"My decision process: (1) Evaluate immediate threats and opportunities, (2) Project future game states, (3) Choose move that maximizes my winning probability. Position {ai_move} emerged as optimal after analyzing the full game tree.",

        f"Strategic analysis: Your move at {human_move} created a new board configuration. I used recursive tree search to evaluate all possible future sequences. Position {ai_move} either creates a winning opportunity or blocks your potential victories.",

        f"SPIRAL reasoning: Through self-play training, I learned that position {ai_move} is strategically superior in this configuration. This demonstrates how strategic reasoning emerges from multi-agent interaction in zero-sum games.",
    )
    return random.choice(options)


def create_interface():
    """Create the main Gradio interface.

    Builds the full demo page: a 3x3 grid of buttons backed by the global
    ``tictactoe_env``, a per-session win/loss/draw tally kept in
    ``gr.State``, and two read-only textboxes for the game status and the
    AI's reasoning text.  Returns the assembled ``gr.Blocks`` app; the
    caller is responsible for ``launch()``.
    """
    
    # Custom CSS to style the TicTacToe board
    css = """
        .ttt-board {
            display: flex;
            flex-direction: column;
            align-items: center;
            max-width: 300px;
            margin: 0 auto;
        }
        .ttt-board > div {
            display: flex;
            flex-direction: row;
            justify-content: center;
            gap: 8px;
            margin: 4px 0;
        }
        .ttt-board button {
            width: 80px !important;
            height: 80px !important;
            min-width: 80px !important;
            min-height: 80px !important;
            max-width: 80px !important;
            max-height: 80px !important;
            font-size: 24px !important;
            font-weight: bold !important;
            border: 2px solid #374151 !important;
            border-radius: 8px !important;
            background: #1f2937 !important;
            color: white !important;
            display: flex !important;
            align-items: center !important;
            justify-content: center !important;
        }
        .ttt-board button:hover {
            background: #374151 !important;
            border-color: #6b7280 !important;
        }
        .ttt-board button:disabled {
            opacity: 0.8 !important;
            cursor: not-allowed !important;
        }
        .ttt-stats {
            text-align: center !important;
            margin: 20px 0 !important;
            font-size: 16px !important;
        }
        .ttt-stats p {
            margin: 0 !important;
            color: #9ca3af !important;
        }
    """
    
    with gr.Blocks(title="SPIRAL: Self-Play Reasoning Demo", theme=gr.themes.Soft(), css=css) as demo:
        gr.Markdown("# ๐ŸŽฎ SPIRAL: Self-Play Reasoning Demo")
        gr.Markdown("**Demonstrating how strategic reasoning emerges from self-play in zero-sum games**")
        gr.Markdown("*Based on: \"Self-Play in Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning\"*")
        
        def update_board_buttons():
            """Create a list of gr.Button updates from the current board state.

            Cell value 1 renders as the X mark, -1 as the O mark, 0 as an
            empty, clickable button.  Occupied cells — and every cell once
            the game is over — are made non-interactive.
            """
            updates = []
            for i in range(9):
                row, col = divmod(i, 3)
                cell = tictactoe_env.board[row, col]
                val = ""
                interactive = True
                if cell == 1:
                    val = 'โŒ'
                    interactive = False
                elif cell == -1:
                    val = 'โญ•'
                    interactive = False
                
                if tictactoe_env.game_over:
                    interactive = False

                updates.append(gr.Button(value=val, interactive=interactive))
            return updates

        # Per-session win/loss/draw tally, threaded through every callback.
        ttt_stats = gr.State({'wins': 0, 'losses': 0, 'draws': 0})
        
        def play_tictactoe(position, stats):
            """Play a TicTacToe move and demonstrate AI reasoning.

            Generator callback: yields intermediate UI states so the
            "AI is analyzing..." message appears before the (possibly slow)
            minimax search completes.  Each yield emits 9 button updates,
            a status string, a reasoning string, and the stats dict.

            NOTE(review): *stats* is mutated in place and re-emitted as an
            output; presumably Gradio's State change event still fires on
            re-emission even though it is the same dict object — confirm.
            """
            if tictactoe_env.game_over:
                yield *update_board_buttons(), "Game is over! Click 'New Game' to start again.", "", stats
                return

            try:
                position = int(position)
                
                # Human move
                board_state, reward, done, info = tictactoe_env.step(position)
                
                if done:
                    # done=True covers both a real game end and an invalid
                    # move, so check the info flag first.
                    if info.get("invalid_move"):
                        yield *update_board_buttons(), "Invalid move! Try again.", "", stats
                        return
                    
                    winner = "You" if tictactoe_env.winner == 1 else "AI" if tictactoe_env.winner == -1 else "Draw"
                    if winner == "You": stats['wins'] += 1
                    elif winner == "AI": stats['losses'] += 1
                    else: stats['draws'] += 1
                    yield *update_board_buttons(), f"Game Over! {winner} won!", "", stats
                    return

                # Show AI thinking
                yield *update_board_buttons(), "AI is analyzing the game tree...", "๐Ÿง  Strategic reasoning in progress...", stats

                # AI move using minimax
                _, ai_action = minimax(tictactoe_env.board.copy(), -1)
                if ai_action is None:
                    # Fallback: minimax found no move (e.g. terminal board);
                    # pick randomly among whatever squares remain.
                    valid_actions = tictactoe_env.get_valid_actions()
                    if not valid_actions:
                        yield *update_board_buttons(), "Game is a draw!", "", stats
                        return
                    ai_action = random.choice(valid_actions)

                # Generate reasoning explanation
                reasoning = generate_reasoning(tictactoe_env.board.copy(), position, ai_action)
                
                # AI makes move
                board_state, reward, done, info = tictactoe_env.step(ai_action)
                
                if done:
                    winner = "You" if tictactoe_env.winner == 1 else "AI" if tictactoe_env.winner == -1 else "Draw"
                    if winner == "You": stats['wins'] += 1
                    elif winner == "AI": stats['losses'] += 1
                    else: stats['draws'] += 1
                    yield *update_board_buttons(), f"Game Over! {winner} won! AI played position {ai_action}.", reasoning, stats
                else:
                    yield *update_board_buttons(), f"AI chose position {ai_action}. Your turn!", reasoning, stats
                    
            except Exception as e:
                # Surface any unexpected error in the status box rather than
                # crashing the Gradio event loop.
                yield *update_board_buttons(), f"Error: {str(e)}", "", stats

        def reset_tictactoe(stats):
            """Reset TicTacToe game.

            Clears the global environment; the stats tally is intentionally
            preserved across games.
            """
            tictactoe_env.reset()
            return *update_board_buttons(), "New game started! You are โŒ (X). Click a square to demonstrate strategic reasoning.", "The AI will explain its strategic decision-making process...", stats
        
        # Initialize the board
        tictactoe_env.reset()
        
        # Game interface
        with gr.Row():
            gr.Markdown("### Strategic TicTacToe")
            gr.Markdown("") # spacer
            ttt_reset_btn = gr.Button("๐Ÿ”„ New Game", variant="secondary", size="sm")
        
        gr.Markdown("**You are โŒ (X)** - The AI uses minimax tree search to demonstrate strategic reasoning")

        # Game board: buttons are appended row-major, so board_buttons[i]
        # corresponds to flat position i used by the environment.
        with gr.Column(elem_classes=["ttt-board"]):
            board_buttons = []
            for i in range(3):
                with gr.Row(elem_classes=["ttt-row"]):
                    for j in range(3):
                        pos = i * 3 + j
                        button = gr.Button("", elem_id=f"ttt-cell-{pos}", size="lg", value="")
                        board_buttons.append(button)

        # Stats display
        with gr.Row():
            ttt_stats_display = gr.Markdown(value="**Wins: 0 | Losses: 0 | Draws: 0**", elem_classes=["ttt-stats"])

        # Game status and AI reasoning
        ttt_message = gr.Textbox(
            label="๐ŸŽฏ Game Status",
            value="Click a square to start! Watch how the AI reasons strategically.",
            lines=2,
            interactive=False
        )
        
        ttt_reasoning = gr.Textbox(
            label="๐Ÿง  AI Strategic Reasoning",
            value="The AI will explain its strategic decision-making process here, demonstrating how reasoning emerges from self-play training in zero-sum games.",
            lines=4,
            interactive=False
        )

        # Event handlers: each cell binds its own position index via a
        # hidden gr.State(i) input, then delegates to play_tictactoe.
        def on_board_click(pos, stats):
            yield from play_tictactoe(pos, stats)

        for i in range(9):
            board_buttons[i].click(
                fn=on_board_click,
                inputs=[gr.State(i), ttt_stats],
                outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
            )
        
        ttt_reset_btn.click(
            fn=reset_tictactoe,
            inputs=[ttt_stats],
            outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
        )
        
        # Update stats display
        ttt_stats.change(
            fn=lambda s: f"**Wins: {s['wins']} | Losses: {s['losses']} | Draws: {s['draws']}**",
            inputs=ttt_stats,
            outputs=ttt_stats_display
        )
        
        # Initialize board display on load
        demo.load(
            fn=lambda stats: (*update_board_buttons(), "Click a square to start! Watch how the AI reasons strategically.", "The AI will explain its strategic decision-making process here, demonstrating how reasoning emerges from self-play training in zero-sum games.", stats),
            inputs=[ttt_stats],
            outputs=[*board_buttons, ttt_message, ttt_reasoning, ttt_stats]
        )
        
        # Key concepts section
        gr.Markdown("---")
        gr.Markdown("## ๐Ÿง  Key SPIRAL Concepts Demonstrated")
        
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                **๐ŸŽฏ Strategic Reasoning**
                - AI uses minimax tree search
                - Evaluates all possible future moves
                - Chooses optimal strategic actions
                """)
            
            with gr.Column():
                gr.Markdown("""
                **๐Ÿ”„ Self-Play Learning**
                - Strategic patterns emerge from competition
                - Zero-sum games incentivize reasoning
                - Multi-agent interactions develop intelligence
                """)
        
        gr.Markdown("""
        ### About SPIRAL
        
        This demo illustrates key findings from the SPIRAL research:
        
        - **Zero-sum games** like TicTacToe create competitive pressure that incentivizes strategic thinking
        - **Self-play training** allows AI agents to discover optimal strategies through repeated interaction
        - **Multi-turn reasoning** emerges naturally from the need to plan ahead in strategic environments
        - **Tree search algorithms** like minimax demonstrate how strategic reasoning can be formalized and executed
        
        The AI's explanations show how it evaluates different moves, considers future possibilities, and makes strategic decisions - core capabilities that transfer to general reasoning tasks.
        """)
    
    return demo


if __name__ == "__main__":
    # Build the Gradio app and serve it with default launch settings.
    demo = create_interface()
    demo.launch()