Kaushik Rajan committed on
Commit
e526e6a
·
0 Parent(s):

Phase 1: Initial SPIRAL project setup

Browse files

Complete structure with Gradio interface, config, and documentation

.gitignore ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be added to the global gitignore or merged into this project gitignore. For a PyCharm
158
+ # project, it is recommended to include the template in the project gitignore.
159
+ # https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this project gitignore. For a PyCharm
161
+ # project, it is recommended to include the template in the project gitignore.
162
+ # https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163
+ .idea/
164
+
165
+ # VS Code
166
+ .vscode/
167
+
168
+ # macOS
169
+ .DS_Store
170
+ .AppleDouble
171
+ .LSOverride
172
+
173
+ # Windows
174
+ Thumbs.db
175
+ Thumbs.db:encryptable
176
+ ehthumbs.db
177
+ ehthumbs_vista.db
178
+ *.stackdump
179
+ [Dd]esktop.ini
180
+ $RECYCLE.BIN/
181
+ *.cab
182
+ *.msi
183
+ *.msix
184
+ *.msm
185
+ *.msp
186
+ *.lnk
187
+
188
+ # Model files and large data
189
+ *.bin
190
+ *.safetensors
191
+ *.pt
192
+ *.pth
193
+ *.ckpt
194
+ *.h5
195
+ *.pkl
196
+ *.pickle
197
+ models/*/
198
+
199
+ # Logs and experiments
200
+ logs/
201
+ wandb/
202
+ tensorboard/
203
+ *.log
204
+
205
+ # Temporary files
206
+ tmp/
207
+ temp/
208
+ *.tmp
209
+ *.temp
210
+
211
+ # Data files
212
+ data/*/
213
+ !data/README.md
214
+
215
+ # Hugging Face cache
216
+ .cache/
217
+ transformers_cache/
218
+
219
+ # Local environment variables
220
+ .env.local
221
+ .env.development.local
222
+ .env.test.local
223
+ .env.production.local
224
+
225
+ # Gradio temporary files
226
+ flagged/
227
+ gradio_cached_examples/
README.md ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPIRAL: Interactive Reasoning Game Simulator
2
+
3
+ A practical, interactive tool based on the SPIRAL paper ("Self-Play on Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning") deployed on Hugging Face Spaces.
4
+
5
+ ## Overview
6
+
7
+ This tool demonstrates how self-play training on zero-sum games can improve AI reasoning capabilities. Users can:
8
+
9
+ - **Play Games**: Engage with AI in games like Kuhn Poker and TicTacToe
10
+ - **View Reasoning**: See step-by-step AI reasoning traces during gameplay
11
+ - **Test Transfer**: Evaluate AI's reasoning skills on math problems and logic puzzles
12
+ - **Learn**: Understand AI decision-making through interactive visualizations
13
+
14
+ ## Features
15
+
16
+ ### For Non-Technical Users
17
+ - Simple web interface for playing games
18
+ - Visual reasoning explanations
19
+ - Educational tutorials about AI thinking
20
+ - No setup required - runs in browser
21
+
22
+ ### For Technical Users
23
+ - Access to model weights and training scripts
24
+ - API endpoints for extending the system
25
+ - Custom game integration capabilities
26
+ - Fine-tuning examples and documentation
27
+
28
+ ## Project Structure
29
+
30
+ ```
31
+ SPIRAL/
32
+ ├── src/ # Core implementation
33
+ │ ├── games/ # Game environments
34
+ │ ├── models/ # SPIRAL model implementation
35
+ │ ├── training/ # Self-play training logic
36
+ │ └── reasoning/ # Reasoning trace generation
37
+ ├── models/ # Trained model weights
38
+ ├── data/ # Game datasets and benchmarks
39
+ ├── app/ # Gradio web interface
40
+ ├── tests/ # Unit and integration tests
41
+ └── docs/ # Documentation and tutorials
42
+ ```
43
+
44
+ ## Technology Stack
45
+
46
+ - **Backend**: Python 3.8+
47
+ - **ML Framework**: PyTorch, Transformers
48
+ - **RL Library**: Gymnasium, Stable Baselines3
49
+ - **Web Interface**: Gradio
50
+ - **Base Model**: Qwen-4B from Hugging Face
51
+ - **Deployment**: Hugging Face Spaces
52
+
53
+ ## Development Phases
54
+
55
+ 1. **Research and Planning** ✅
56
+ 2. **Implementation** 🔄
57
+ 3. **Testing and Optimization** 📋
58
+ 4. **Deployment and Documentation** 📋
59
+ 5. **Maintenance and Iteration** 📋
60
+
61
+ ## Getting Started
62
+
63
+ ### Prerequisites
64
+ - Python 3.8+
65
+ - PyTorch
66
+ - Hugging Face account (for model access)
67
+
68
+ ### Installation
69
+ ```bash
70
+ pip install -r requirements.txt
71
+ ```
72
+
73
+ ### Quick Start
74
+ ```bash
75
+ python app/app.py
76
+ ```
77
+
78
+ ## Citation
79
+
80
+ If you use this tool in your research, please cite the original SPIRAL paper:
81
+
82
+ ```bibtex
83
+ @article{spiral2025,
84
+ title={Self-Play on Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning},
85
+ author={[Authors]},
86
+ journal={[Journal]},
87
+ year={2025}
88
+ }
89
+ ```
90
+
91
+ ## License
92
+
93
+ This project is licensed under the MIT License - see the LICENSE file for details.
94
+
95
+ ## Contributing
96
+
97
+ We welcome contributions! Please see CONTRIBUTING.md for guidelines.
98
+
99
+ ## Support
100
+
101
+ For issues and questions, please use GitHub Issues or contact us via Hugging Face Spaces.
app/app.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SPIRAL Interactive Reasoning Game Simulator - Main Gradio App
3
+
4
+ A practical tool demonstrating how self-play training on zero-sum games
5
+ can improve AI reasoning capabilities.
6
+ """
7
+
8
+ import gradio as gr
9
+ import yaml
10
+ import os
11
+ import sys
12
+
13
+ # Add the src directory to the path for imports
14
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src'))
15
+
16
+ from typing import Tuple, Dict, Any, List, Optional
17
+ import logging
18
+
19
+ # Configure logging
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
22
+
23
class SpiralApp:
    """Main application class for the SPIRAL reasoning simulator.

    The class loads a YAML configuration (falling back to built-in defaults),
    applies the configured log level, and builds a three-tab Gradio UI:
    game play, reasoning test, and an "About" page. The game and reasoning
    handlers are placeholders that echo their input; the real model-backed
    implementations are planned for Phase 2.
    """

    def __init__(self, config_path: str = "../config.yaml"):
        """Initialize the SPIRAL app with configuration.

        Args:
            config_path: Path to the YAML configuration file. A relative path
                is tried against the current working directory first and then
                against the directory containing this module.
        """
        self.config = self._load_config(config_path)
        self.setup_logging()

        # Initialize components (will be implemented in Phase 2)
        self.game_interface = None
        self.reasoning_interface = None
        self.transfer_interface = None

        logger.info("SPIRAL App initialized successfully")

    @staticmethod
    def _resolve_config_path(config_path: str) -> str:
        """Return the location of the config file, trying this module's directory too.

        The default "../config.yaml" only points at the repository's config
        when the process is started from inside ``app/``. Starting it as
        ``python app/app.py`` from the repository root would otherwise miss
        the file and silently fall back to defaults.
        """
        if os.path.isabs(config_path) or os.path.exists(config_path):
            return config_path
        module_dir = os.path.dirname(os.path.abspath(__file__))
        candidate = os.path.normpath(os.path.join(module_dir, config_path))
        # Hand back the caller's path when nothing matches so the warning
        # emitted by _load_config names the path that was actually requested.
        return candidate if os.path.exists(candidate) else config_path

    def _load_config(self, config_path: str) -> Dict[str, Any]:
        """Load configuration from YAML file.

        Args:
            config_path: Path to the YAML configuration file.

        Returns:
            The parsed configuration mapping, or the default configuration
            when the file is missing or does not contain a mapping.
        """
        resolved_path = self._resolve_config_path(config_path)
        try:
            with open(resolved_path, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)
        except FileNotFoundError:
            logger.warning(f"Config file not found: {config_path}. Using defaults.")
            return self._get_default_config()

        # safe_load yields None for an empty document and may yield a list or
        # scalar; every consumer below expects a dict.
        if not isinstance(config, dict):
            logger.warning(
                f"Config file {resolved_path} does not contain a mapping. Using defaults."
            )
            return self._get_default_config()
        return config

    def _get_default_config(self) -> Dict[str, Any]:
        """Get default configuration."""
        return {
            'interface': {
                'title': 'SPIRAL: Interactive Reasoning Game Simulator',
                'description': 'Play games against AI and explore reasoning capabilities',
                'theme': 'default'
            },
            'games': {
                'kuhn_poker': {'name': 'Kuhn Poker'},
                'tictactoe': {'name': 'TicTacToe'}
            }
        }

    def setup_logging(self):
        """Set the root logger level from the ``logging.level`` config entry.

        Level names are matched case-insensitively. An unknown name falls
        back to INFO with a warning instead of raising.
        """
        log_config = self.config.get('logging') or {}
        level_name = str(log_config.get('level', 'INFO')).upper()
        level = getattr(logging, level_name, None)
        if not isinstance(level, int):
            logger.warning(f"Unknown log level '{level_name}'. Using INFO.")
            level = logging.INFO
        logging.getLogger().setLevel(level)

    def play_game(self, game_type: str, user_move: str, game_state: str = "") -> Tuple[str, str, str]:
        """
        Handle game play interaction.

        Args:
            game_type: Type of game (kuhn_poker, tictactoe)
            user_move: User's move input
            game_state: Current game state

        Returns:
            Tuple of (updated_game_state, ai_response, reasoning_trace)
        """
        # Placeholder implementation - will be completed in Phase 2
        if not user_move or not user_move.strip():
            return game_state, "Please enter a move!", ""

        # Simulate AI response
        ai_response = f"AI responds to your move: {user_move}"
        reasoning_trace = f"AI thinking: Analyzing move '{user_move}' in {game_type}..."

        # Append this turn to the transcript; the first turn must not start
        # with a blank line (or the text "None") when there is no prior state.
        turn_log = f"User: {user_move}\nAI: {ai_response}"
        updated_state = f"{game_state}\n{turn_log}" if game_state else turn_log

        return updated_state, ai_response, reasoning_trace

    def test_reasoning(self, prompt: str, task_type: str = "math") -> Tuple[str, str]:
        """
        Test AI reasoning on non-game tasks.

        Args:
            prompt: User's reasoning prompt
            task_type: Type of reasoning task (not used by the placeholder yet)

        Returns:
            Tuple of (response, reasoning_trace)
        """
        # Placeholder implementation - will be completed in Phase 2
        if not prompt or not prompt.strip():
            return "Please enter a reasoning prompt!", ""

        response = f"AI response to: {prompt}"
        reasoning_trace = f"Step-by-step reasoning for '{prompt}'..."

        return response, reasoning_trace

    def create_interface(self) -> gr.Blocks:
        """Create the main Gradio interface.

        Returns:
            The assembled ``gr.Blocks`` app with the game, reasoning and
            about tabs wired to this instance's handlers.
        """
        # A user-supplied config may omit some interface keys; fill the gaps
        # from the defaults rather than failing with KeyError.
        interface_cfg = {
            **self._get_default_config()['interface'],
            **(self.config.get('interface') or {}),
        }
        title = interface_cfg['title']
        description = interface_cfg['description']

        with gr.Blocks(title=title, theme=interface_cfg['theme']) as demo:
            gr.Markdown(f"# {title}")
            gr.Markdown(description)

            with gr.Tabs():
                # Game Play Tab
                with gr.TabItem("🎮 Game Play"):
                    gr.Markdown("### Play zero-sum games against AI")

                    with gr.Row():
                        with gr.Column():
                            game_selector = gr.Dropdown(
                                choices=["kuhn_poker", "tictactoe"],
                                value="kuhn_poker",
                                label="Select Game"
                            )
                            user_move = gr.Textbox(
                                label="Your Move",
                                placeholder="Enter your move..."
                            )
                            play_button = gr.Button("Play Move", variant="primary")

                        with gr.Column():
                            game_state = gr.Textbox(
                                label="Game State",
                                lines=10,
                                interactive=False
                            )
                            ai_response = gr.Textbox(
                                label="AI Response",
                                lines=3,
                                interactive=False
                            )

                    reasoning_trace = gr.Textbox(
                        label="AI Reasoning Trace",
                        lines=5,
                        interactive=False
                    )

                    # game_state is both input and output so the transcript
                    # accumulates across clicks.
                    play_button.click(
                        fn=self.play_game,
                        inputs=[game_selector, user_move, game_state],
                        outputs=[game_state, ai_response, reasoning_trace]
                    )

                # Reasoning Test Tab
                with gr.TabItem("🧠 Reasoning Test"):
                    gr.Markdown("### Test AI reasoning on math and logic problems")

                    with gr.Row():
                        with gr.Column():
                            task_type = gr.Dropdown(
                                choices=["math", "logic", "strategic"],
                                value="math",
                                label="Task Type"
                            )
                            reasoning_prompt = gr.Textbox(
                                label="Reasoning Prompt",
                                placeholder="Enter a math problem or logic puzzle...",
                                lines=3
                            )
                            test_button = gr.Button("Test Reasoning", variant="primary")

                        with gr.Column():
                            reasoning_response = gr.Textbox(
                                label="AI Response",
                                lines=8,
                                interactive=False
                            )
                            reasoning_steps = gr.Textbox(
                                label="Step-by-Step Reasoning",
                                lines=8,
                                interactive=False
                            )

                    test_button.click(
                        fn=self.test_reasoning,
                        inputs=[reasoning_prompt, task_type],
                        outputs=[reasoning_response, reasoning_steps]
                    )

                # About Tab
                with gr.TabItem("ℹ️ About"):
                    gr.Markdown("""
                    ### About SPIRAL

                    This tool demonstrates the SPIRAL methodology: "Self-Play on Zero-Sum Games
                    Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning."

                    **Key Features:**
                    - **Game Play**: Interactive games with AI opponents
                    - **Reasoning Traces**: Transparent AI decision-making
                    - **Transfer Learning**: Test reasoning on non-game tasks
                    - **Educational**: Learn about AI reasoning capabilities

                    **How it works:**
                    1. AI agents are trained via self-play on zero-sum games
                    2. Role-conditioned advantage estimation improves learning
                    3. Reasoning skills transfer to mathematical and logical tasks
                    4. Interactive interface shows the AI's thinking process

                    **Games Available:**
                    - **Kuhn Poker**: Simple poker variant with betting
                    - **TicTacToe**: Classic strategy game

                    **Technical Details:**
                    - Base Model: Qwen-4B from Hugging Face
                    - Training: PPO with self-play
                    - Interface: Gradio web app
                    """)

        return demo

    def launch(self, **kwargs):
        """Launch the Gradio app.

        Launch settings come from ``interface.gradio`` in the configuration;
        keyword arguments passed here override them.

        Args:
            **kwargs: Extra keyword arguments forwarded to ``Blocks.launch()``.
                ``enable_queue`` is accepted for backward compatibility and
                translated into a ``Blocks.queue()`` call.
        """
        demo = self.create_interface()

        # Get launch configuration
        gradio_config = (self.config.get('interface') or {}).get('gradio') or {}

        launch_kwargs = {
            'server_name': gradio_config.get('server_name', '0.0.0.0'),
            'server_port': gradio_config.get('server_port', 7860),
            'share': gradio_config.get('share', False),
            'inbrowser': gradio_config.get('inbrowser', True),
            **kwargs
        }

        # Gradio 4 removed the `enable_queue` argument from Blocks.launch(),
        # so forwarding it raises TypeError. Queueing is requested through
        # Blocks.queue() instead.
        enable_queue = launch_kwargs.pop(
            'enable_queue', gradio_config.get('enable_queue', True)
        )
        if enable_queue:
            demo.queue()

        logger.info(f"Launching SPIRAL app with config: {launch_kwargs}")
        demo.launch(**launch_kwargs)
248
+
249
def main():
    """Build the SPIRAL application with its default settings and serve it."""
    SpiralApp().launch()
253
+
254
+ if __name__ == "__main__":
255
+ main()
config.yaml ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPIRAL Interactive Reasoning Game Simulator Configuration
2
+
3
+ # Model Configuration
4
+ model:
5
+ name: "Qwen/Qwen2.5-4B-Instruct"
6
+ max_length: 2048
7
+ temperature: 0.7
8
+ do_sample: true
9
+ quantization:
10
+ load_in_4bit: true
11
+ bnb_4bit_compute_dtype: "float16"
12
+ bnb_4bit_use_double_quant: true
13
+
14
+ # Games Configuration
15
+ games:
16
+ kuhn_poker:
17
+ name: "Kuhn Poker"
18
+ max_rounds: 50
19
+ deck_size: 3
20
+ betting_rounds: 2
21
+
22
+ tictactoe:
23
+ name: "TicTacToe"
24
+ board_size: 3
25
+ max_moves: 9
26
+ win_condition: 3
27
+
28
+ # Training Configuration
29
+ training:
30
+ algorithm: "PPO"
31
+ episodes: 1000
32
+ batch_size: 32
33
+ learning_rate: 0.0003
34
+ gamma: 0.99
35
+ gae_lambda: 0.95
36
+ clip_range: 0.2
37
+ entropy_coef: 0.01
38
+ value_loss_coef: 0.5
39
+ max_grad_norm: 0.5
40
+
41
+ # Self-play specific
42
+ self_play:
43
+ update_opponent_every: 100
44
+ opponent_pool_size: 5
45
+
46
+ # Role-conditioned advantage estimation
47
+ rae:
48
+ enable: true
49
+ role_embedding_dim: 64
50
+ advantage_weighting: 0.5
51
+
52
+ # Reasoning Configuration
53
+ reasoning:
54
+ enable_traces: true
55
+ trace_depth: 3
56
+ chain_of_thought: true
57
+ explanation_length: 150
58
+
59
+ # Transfer learning evaluation
60
+ transfer_tasks:
61
+ - "GSM8K"
62
+ - "Logic Puzzles"
63
+ - "Strategic Reasoning"
64
+
65
+ # Web Interface Configuration
66
+ interface:
67
+ title: "SPIRAL: Interactive Reasoning Game Simulator"
68
+ description: "Play games against AI and explore reasoning capabilities"
69
+ theme: "default"
70
+
71
+ # Gradio settings
72
+ gradio:
73
+ share: false
74
+ inbrowser: true
75
+ server_name: "0.0.0.0"
76
+ server_port: 7860
77
+ enable_queue: true
78
+ max_threads: 4
79
+
80
+ # Logging Configuration
81
+ logging:
82
+ level: "INFO"
83
+ format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
84
+ file: "logs/spiral.log"
85
+
86
+ # Experiment tracking
87
+ wandb:
88
+ enable: false
89
+ project: "spiral-reasoning"
90
+ entity: "your-username"
91
+
92
+ tensorboard:
93
+ enable: true
94
+ log_dir: "logs/tensorboard"
95
+
96
+ # Data Configuration
97
+ data:
98
+ cache_dir: "data/cache"
99
+ datasets_dir: "data/datasets"
100
+ models_dir: "models"
101
+
102
+ # Benchmark datasets
103
+ benchmarks:
104
+ gsm8k: "data/benchmarks/gsm8k.json"
105
+ logic_puzzles: "data/benchmarks/logic_puzzles.json"
106
+
107
+ # Deployment Configuration
108
+ deployment:
109
+ huggingface:
110
+ space_name: "kaushikvr06/reasoning-simulator"
111
+ private: false
112
+
113
+ # Performance settings
114
+ performance:
115
+ max_concurrent_users: 10
116
+ timeout_seconds: 30
117
+ memory_limit: "2GB"
118
+
119
+ # Debug Configuration
120
+ debug:
121
+ enable: false
122
+ verbose_traces: false
123
+ save_game_logs: true
124
+ profile_inference: false
data/README.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPIRAL Data Directory
2
+
3
+ This directory contains datasets, benchmarks, and cached data for the SPIRAL Interactive Reasoning Game Simulator.
4
+
5
+ ## Structure
6
+
7
+ ```
8
+ data/
9
+ ├── cache/ # Cached model outputs and processed data
10
+ ├── datasets/ # Game datasets and training data
11
+ ├── benchmarks/ # Evaluation benchmarks for transfer learning
12
+ │ ├── gsm8k.json # GSM8K math problems
13
+ │ └── logic_puzzles.json # Logic reasoning puzzles
14
+ └── README.md # This file
15
+ ```
16
+
17
+ ## Datasets
18
+
19
+ ### Game Datasets
20
+ - **Kuhn Poker**: Training games and strategies
21
+ - **TicTacToe**: Game states and optimal moves
22
+
23
+ ### Benchmark Datasets
24
+ - **GSM8K**: Grade School Math 8K dataset for mathematical reasoning
25
+ - **Logic Puzzles**: Custom logic and reasoning problems
26
+ - **Strategic Reasoning**: Game-theory based reasoning tasks
27
+
28
+ ## Usage
29
+
30
+ Datasets are automatically downloaded and cached when first used. To manually download:
31
+
32
+ ```python
33
+ from src.data_utils import download_datasets
34
+ download_datasets()
35
+ ```
36
+
37
+ ## Data Sources
38
+
39
+ - GSM8K: [Cobbe et al. 2021](https://arxiv.org/abs/2110.14168)
40
+ - Logic Puzzles: Curated collection from various sources
41
+ - Game Data: Generated through self-play training
42
+
43
+ ## License
44
+
45
+ Please refer to individual dataset licenses for usage rights.
execution-plan.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPIRAL Demo App Execution Plan
2
+
3
+ This execution plan outlines the development of a practical, interactive tool on Hugging Face Spaces based on the SPIRAL paper ("Self-Play on Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning"). The tool will be an **Interactive Reasoning Game Simulator**: Users can play zero-sum games (e.g., Kuhn Poker, TicTacToe) against a self-play trained AI, view step-by-step reasoning traces, and test the AI's transferred reasoning skills on non-game tasks like math problems or logic puzzles.
4
+
5
+ **Utility Focus**:
6
+ - **Non-Technical Users**: Simple web interface to play games, learn about AI reasoning through visualizations, and experiment with prompts for educational fun (e.g., "How does AI think in games?").
7
+ - **Technical Users**: Access to model weights, training scripts, and APIs for extending the self-play system (e.g., custom games or fine-tuning).
8
+ - **Practicality**: Free to use, no setup required; demonstrates real-world AI applications in strategy, education, and decision-making. Aims for broad appeal: 1000+ users via HF community sharing.
9
+
10
+ The plan is divided into phases with checkboxes for sub-tasks. Each phase includes detailed "how" steps.
11
+
12
+ ## Phase 1: Research and Planning
13
+ - [ ] Review SPIRAL Paper and Gather Resources
14
+ - How: Read the full paper (use attached snips as reference). Identify key components: self-play RL on games like Kuhn Poker, role-conditioned advantage estimation (RAE), multi-agent multi-turn training. Download base models (e.g., Qwen-4B from HF) and RL libs (Gym, Stable Baselines). Collect datasets: Simple game rules/implementations from GitHub; math benchmarks like GSM8K for transfer testing.
15
+ - [ ] Define Tool Features
16
+ - How: Brainstorm user flows. Core: Game mode (user vs. AI play), Reasoning Viewer (display traces), Transfer Tester (input math/logic queries). Add tutorials for non-tech users, exportable logs for tech users. Ensure accessibility: Mobile-friendly UI, low-latency inference.
17
+ - [ ] Scope Requirements and Tech Stack
18
+ - How: Choose Python for backend; Gradio for HF Spaces UI (easy interactive elements like buttons for moves). Use Transformers for LLM, Gym for games, PPO from Stable Baselines for RL demo. Estimate: 1-2 weeks dev time, free HF tier for hosting (upgrade to GPU if needed for training demos).
19
+
20
+ ## Phase 2: Implementation
21
+ - [ ] Set Up Project Structure
22
+ - How: Create a Git repo. Folders: `src/` for code, `models/` for weights, `data/` for game datasets, `app/` for Gradio script. Initialize with `requirements.txt`: transformers, torch, gymnasium, stable-baselines3, gradio.
23
+ - [ ] Implement Game Environments
24
+ - How: Code Gym envs for Kuhn Poker/TicTacToe (e.g., class KuhnPokerEnv(gym.Env) with action_space, observation_space, reward for wins). Add multi-turn logic: Track game state, player turns.
25
+ - [ ] Train SPIRAL Model
26
+ - How: Load base LLM (Qwen-4B). Implement self-play: Clone agent, train via PPO with RAE (custom advantage function: advantage = reward + value - baseline, conditioned on roles like 'player' vs. 'opponent'). Train on 1000+ episodes (simulate self-improvement). Save checkpoints to HF Model Hub.
27
+ - [ ] Build Reasoning and Transfer Components
28
+ - How: For games, generate traces (e.g., "Opponent bet high → Likely strong hand → Fold"). For transfer, prompt model with math tasks post-training. Use chain-of-thought prompting for visibility.
29
+ - [ ] Develop User Interface
30
+ - How: Use Gradio Blocks: Tab 1: Game Play (dropdown for game, text input for moves, output panel for AI response/trace). Tab 2: Tester (input prompt, show output). Add buttons for "Explain Reasoning" and "Export Session". Style with CSS for modern UX (e.g., cards, animations).
31
+
32
+ ## Phase 3: Testing and Optimization
33
+ - [ ] Unit and Integration Testing
34
+ - How: Test game logic (e.g., assert win conditions). Run self-play simulations to verify improvements (e.g., win rate >50% after training). Use pytest for automation.
35
+ - [ ] User Testing
36
+ - How: Simulate non-tech users (play games, check intuitiveness). For tech users, test API endpoints. Gather feedback via HF Spaces comments or a built-in form. Measure metrics: Latency <2s per move, accuracy on benchmarks (+8% as per paper).
37
+ - [ ] Optimize for HF Spaces
38
+ - How: Profile for CPU/GPU usage; use model quantization (e.g., bitsandbytes) for faster inference. Ensure no interactive flags needed (e.g., --yes for installs).
39
+
40
+ ## Phase 4: Deployment and Documentation
41
+ - [ ] Deploy to Hugging Face Spaces
42
+ - How: Create Space, upload repo via Git. Set entry point to Gradio app.py. Enable public access, add tags like "AI", "Games", "Reasoning" for discoverability.
43
+ - [ ] Create Documentation and Tutorials
44
+ - How: Write README.md with paper summary, usage guide (screenshots), and code explanations. Add in-app help: Tooltips for buttons, video demo. For tech users: Include training scripts and extension guides.
45
+ - [ ] Launch and Promote
46
+ - How: Share on HF forums, Reddit (r/MachineLearning), Twitter. Monitor usage via HF analytics; iterate based on feedback (e.g., add more games).
47
+
48
+ ## Phase 5: Maintenance and Iteration
49
+ - [ ] Monitor and Update
50
+ - How: Check for issues (e.g., via GitHub Issues). Update model with new games or better RL algos. Aim for v2: Multimodal (add image-based games).
51
+ - [ ] Measure Impact
52
+ - How: Track metrics: User sessions, feedback ratings. Goal: 1000+ interactions in first month, positive reviews highlighting educational value.
53
+
54
+ This plan ensures a useful tool that's easy to use, educational, and extensible.
requirements.txt ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core ML and Deep Learning
2
+ torch>=2.0.0
3
+ transformers>=4.30.0
4
+ accelerate>=0.20.0
5
+ bitsandbytes>=0.41.0
6
+
7
+ # Reinforcement Learning
8
+ gymnasium>=0.28.0
9
+ stable-baselines3>=2.0.0
10
+ sb3-contrib>=2.0.0
11
+
12
+ # Web Interface
13
+ gradio>=4.0.0
14
+
15
+ # Data Processing and Utilities
16
+ numpy>=1.21.0
17
+ pandas>=1.3.0
18
+ matplotlib>=3.5.0
19
+ seaborn>=0.11.0
20
+ plotly>=5.0.0
21
+
22
+ # Game Theory and Math
23
+ scipy>=1.7.0
24
+ networkx>=2.6.0
25
+
26
+ # Model Management
27
+ huggingface-hub>=0.16.0
28
+ datasets>=2.10.0
29
+
30
+ # Testing and Development
31
+ pytest>=7.0.0
32
+ pytest-cov>=4.0.0
33
+ black>=22.0.0
34
+ flake8>=5.0.0
35
+
36
+ # Logging and Monitoring
37
+ wandb>=0.15.0
38
+ tensorboard>=2.10.0
39
+
40
+ # Utilities
41
+ tqdm>=4.64.0
42
+ pyyaml>=6.0.0
43
+ python-dotenv>=1.0.0
44
+ requests>=2.28.0
src/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SPIRAL: Interactive Reasoning Game Simulator
3
+
4
+ A practical tool demonstrating how self-play training on zero-sum games
5
+ can improve AI reasoning capabilities.
6
+ """
7
+
8
+ __version__ = "0.1.0"
9
+ __author__ = "SPIRAL Team"
10
+ __email__ = "contact@spiral-reasoning.com"
11
+
12
+ from .games import *
13
+ from .models import *
14
+ from .training import *
15
+ from .reasoning import *
src/games/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Game environments for SPIRAL reasoning simulator.
3
+
4
+ This module contains implementations of zero-sum games used for self-play training,
5
+ including Kuhn Poker, TicTacToe, and other strategic games.
6
+ """
7
+
8
+ from .kuhn_poker import KuhnPokerEnv
9
+ from .tictactoe import TicTacToeEnv
10
+ from .base_game import BaseGameEnv
11
+
12
+ __all__ = ["KuhnPokerEnv", "TicTacToeEnv", "BaseGameEnv"]
src/models/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model implementations for SPIRAL reasoning simulator.
3
+
4
+ This module contains the SPIRAL model architecture, role-conditioned advantage
5
+ estimation, and other model components for self-play training.
6
+ """
7
+
8
+ from .spiral_model import SpiralModel
9
+ from .rae import RoleConditionedAdvantageEstimator
10
+ from .policy_network import PolicyNetwork
11
+ from .value_network import ValueNetwork
12
+
13
+ __all__ = ["SpiralModel", "RoleConditionedAdvantageEstimator", "PolicyNetwork", "ValueNetwork"]
src/reasoning/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reasoning components for SPIRAL reasoning simulator.
3
+
4
+ This module contains reasoning trace generation, chain-of-thought processing,
5
+ and transfer learning evaluation for testing reasoning capabilities.
6
+ """
7
+
8
+ from .trace_generator import TraceGenerator
9
+ from .chain_of_thought import ChainOfThought
10
+ from .transfer_evaluator import TransferEvaluator
11
+ from .reasoning_utils import ReasoningUtils
12
+
13
+ __all__ = ["TraceGenerator", "ChainOfThought", "TransferEvaluator", "ReasoningUtils"]
src/training/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training components for SPIRAL reasoning simulator.
3
+
4
+ This module contains the self-play training logic, PPO implementation with
5
+ role-conditioned advantage estimation, and training utilities.
6
+ """
7
+
8
+ from .self_play_trainer import SelfPlayTrainer
9
+ from .ppo_trainer import PPOTrainer
10
+ from .opponent_manager import OpponentManager
11
+ from .training_utils import TrainingUtils
12
+
13
+ __all__ = ["SelfPlayTrainer", "PPOTrainer", "OpponentManager", "TrainingUtils"]
tests/test_basic.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Basic tests for SPIRAL Interactive Reasoning Game Simulator.
3
+
4
+ This module contains fundamental tests to verify the core functionality
5
+ of the SPIRAL system components.
6
+ """
7
+
8
+ import pytest
9
+ import os
10
+ import sys
11
+ import yaml
12
+
13
+ # Add the src directory to the path for imports
14
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src'))
15
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'app'))
16
+
17
+ from app import SpiralApp
18
+
19
class TestSpiralApp:
    """Smoke tests for the top-level ``SpiralApp`` object."""

    def test_app_initialization(self):
        """A fresh app exists and exposes its config and both handlers."""
        spiral = SpiralApp()
        assert spiral is not None
        for attribute in ('config', 'play_game', 'test_reasoning'):
            assert hasattr(spiral, attribute)

    def test_config_loading(self):
        """The app's config has the interface and games sections and a title."""
        spiral = SpiralApp()
        for section in ('interface', 'games'):
            assert section in spiral.config
        assert spiral.config['interface']['title'] is not None

    def test_play_game_basic(self):
        """play_game returns three outputs and rejects an empty move."""
        spiral = SpiralApp()

        # A real move: every output is populated and the move text shows up
        # in the returned state.
        state, response, trace = spiral.play_game("kuhn_poker", "bet", "")
        for output in (state, response, trace):
            assert output is not None
        assert "bet" in state

        # An empty move: the response carries the prompt asking for one.
        state, response, trace = spiral.play_game("kuhn_poker", "", "")
        assert "Please enter a move!" in response

    def test_reasoning_basic(self):
        """test_reasoning returns two outputs and rejects an empty prompt."""
        spiral = SpiralApp()

        # A real prompt: both outputs are populated and the response mentions
        # the expression from the prompt.
        response, trace = spiral.test_reasoning("What is 2+2?", "math")
        for output in (response, trace):
            assert output is not None
        assert "2+2" in response

        # An empty prompt: the response carries the prompt asking for one.
        response, trace = spiral.test_reasoning("", "math")
        assert "Please enter a reasoning prompt!" in response

    def test_interface_creation(self):
        """create_interface returns a non-None interface object."""
        spiral = SpiralApp()
        interface = spiral.create_interface()
        assert interface is not None
71
+
72
class TestConfiguration:
    """Test cases for configuration management."""

    def test_config_file_structure(self):
        """Check that config.yaml parses to a mapping with the expected keys.

        The test is reported as skipped, rather than silently passing, when
        config.yaml is absent from the project root.
        """
        config_path = os.path.join(os.path.dirname(__file__), '..', 'config.yaml')

        # Without this guard a missing file would make the test pass while
        # checking nothing at all.
        if not os.path.exists(config_path):
            pytest.skip(f"config.yaml not found at {config_path}")

        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)

        # An empty YAML document loads as None; fail with a readable message
        # instead of a TypeError on the membership checks below.
        assert isinstance(config, dict), "config.yaml must contain a top-level mapping"

        # Check required sections
        for section in ('model', 'games', 'training', 'reasoning', 'interface'):
            assert section in config

        # Check model configuration
        assert 'name' in config['model']
        assert 'max_length' in config['model']

        # Check games configuration
        assert 'kuhn_poker' in config['games']
        assert 'tictactoe' in config['games']
97
+
98
class TestProjectStructure:
    """Checks on the on-disk layout of the project."""

    def test_src_directory_structure(self):
        """Each src subpackage path exists and has an __init__.py."""
        src_path = os.path.join(os.path.dirname(__file__), '..', 'src')
        subpackages = ('games', 'models', 'training', 'reasoning')

        # The subpackage paths themselves.
        for name in subpackages:
            assert os.path.exists(os.path.join(src_path, name))

        # Package markers: src first, then each subpackage.
        assert os.path.exists(os.path.join(src_path, '__init__.py'))
        for name in subpackages:
            assert os.path.exists(os.path.join(src_path, name, '__init__.py'))

    def test_required_files_exist(self):
        """The essential project files exist relative to the project root."""
        project_root = os.path.join(os.path.dirname(__file__), '..')

        # Each entry is the path components below the project root.
        essential_files = (
            ('requirements.txt',),
            ('README.md',),
            ('config.yaml',),
            ('.gitignore',),
            ('app', 'app.py'),
        )
        for components in essential_files:
            assert os.path.exists(os.path.join(project_root, *components))
128
+
129
if __name__ == "__main__":
    # pytest.main() returns the run's exit code; pass it to the interpreter so
    # running this file directly exits non-zero when a test fails.
    sys.exit(pytest.main([__file__]))