nathanael-fijalkow committed on
Commit
f8cdc2f
·
1 Parent(s): d0298ce

First commit

Browse files
Files changed (3) hide show
  1. README.md +29 -8
  2. app.py +601 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,13 +1,34 @@
1
  ---
2
- title: Chess1MChallenge
3
- emoji: 🦀
4
- colorFrom: pink
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.2.0
8
  app_file: app.py
9
- pinned: false
10
- short_description: Train a 1M parameter LLM to play chess!
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Chess Challenge Arena
3
+ emoji: ♟️
4
+ colorFrom: gray
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 4.44.0
8
  app_file: app.py
9
+ pinned: true
10
+ license: mit
11
  ---
12
 
13
+ # Chess Challenge Arena
14
+
15
+ This Space hosts the evaluation arena for the LLM Chess Challenge.
16
+
17
+ ## Features
18
+
19
+ - **Interactive Demo**: Test any submitted model against Stockfish
20
+ - **Leaderboard**: See rankings of all submitted models
21
+ - **Statistics**: View detailed performance metrics
22
+
23
+ ## How to Submit
24
+
25
+ Students should push their trained models to this organization:
26
+
27
+ ```python
28
+ from chess_challenge import ChessForCausalLM, ChessTokenizer
29
+
30
+ model.push_to_hub("your-model-name", organization="LLM-course")
31
+ tokenizer.push_to_hub("your-model-name", organization="LLM-course")
32
+ ```
33
+
34
+ Models will be automatically evaluated and added to the leaderboard.
app.py ADDED
@@ -0,0 +1,601 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chess Challenge Arena - Hugging Face Space
3
+
4
+ This Gradio app provides:
5
+ 1. Interactive demo to test models
6
+ 2. Leaderboard of submitted models
7
+ 3. Live game visualization
8
+ """
9
+
10
+ import json
11
+ import os
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Optional
15
+
16
+ import gradio as gr
17
+
18
# Configuration
# Hub organization whose models are listed; overridable via HF_ORGANIZATION.
ORGANIZATION = os.environ.get("HF_ORGANIZATION", "your-org-name")

# JSON file (relative to the working directory) that stores leaderboard rows.
LEADERBOARD_FILE = "leaderboard.json"

# Dropdown label -> numeric Stockfish level handed to the evaluator.
STOCKFISH_LEVELS = {
    "Beginner (Level 0)": 0,
    "Easy (Level 1)": 1,
    "Medium (Level 3)": 3,
    "Hard (Level 5)": 5,
}
27
+
28
+
29
def load_leaderboard() -> list:
    """Load the leaderboard entries stored in ``LEADERBOARD_FILE``.

    Returns:
        The list stored in the file, or an empty list when the file does not
        exist, is not valid JSON, or does not contain a JSON list.
    """
    try:
        with open(LEADERBOARD_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return []
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A damaged file must not crash the caller: the leaderboard is loaded
        # while the interface is being built, so an exception here would stop
        # the whole app from starting.
        print(f"Error reading {LEADERBOARD_FILE}: {e}")
        return []

    # Callers iterate and append; anything but a list is treated as "empty".
    return data if isinstance(data, list) else []
35
+
36
+
37
def save_leaderboard(data: list):
    """Write the leaderboard entries to ``LEADERBOARD_FILE`` as indented JSON.

    The data is first written to a sibling ``.tmp`` file and then moved over
    the real file, so a failure while serializing (for example a value that
    is not JSON-serializable) leaves the previous leaderboard untouched
    instead of truncating it.

    Args:
        data: List of leaderboard entries (JSON-serializable dicts).

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
        OSError: If the file cannot be written or replaced.
    """
    tmp_path = f"{LEADERBOARD_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, LEADERBOARD_FILE)
    except Exception:
        # Do not leave a half-written temp file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
41
+
42
+
43
def get_available_models() -> list:
    """List the ids of the organization's models whose id contains "chess".

    Returns:
        The matching model ids, in the order the Hub returns them. If the
        lookup fails for any reason, the error is printed and a one-element
        placeholder list is returned instead.
    """
    try:
        from huggingface_hub import list_models

        chess_ids = []
        for info in list_models(author=ORGANIZATION):
            # Case-insensitive match against the full "<org>/<name>" id.
            if "chess" in info.id.lower():
                chess_ids.append(info.id)
        return chess_ids
    except Exception as e:
        print(f"Error fetching models: {e}")
        return ["No models available"]
53
+
54
+
55
def format_leaderboard_html(data: list) -> str:
    """Render leaderboard entries as a styled HTML table, best ELO first.

    Entries may be partially filled (for example, an entry created by the
    legal-move evaluation has no ``elo``/``win_rate`` yet). Missing or
    non-numeric metrics are shown as ``N/A`` (ELO) or ``0`` (rates) instead
    of raising while formatting. All values taken from the entries are
    HTML-escaped before being inserted into the markup.

    Args:
        data: List of entry dicts. Each entry must contain ``model_id``;
            ``elo``, ``legal_rate``, ``win_rate``, ``games_played`` and
            ``last_updated`` are optional.

    Returns:
        An HTML string: a placeholder paragraph when ``data`` is empty,
        otherwise a ``<style>`` block followed by the table.

    Raises:
        KeyError: If an entry has no ``model_id``.
    """
    # Imported locally (and by name) because ``html`` is used below as the
    # name of the output buffer.
    from html import escape

    if not data:
        return "<p>No models evaluated yet. Be the first to submit!</p>"

    def _is_number(value) -> bool:
        """Return True for real numbers (bool is deliberately excluded)."""
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def _number_or_zero(value):
        """Return ``value`` when numeric, else 0."""
        return value if _is_number(value) else 0

    # Sort by ELO; entries without a usable ELO sort as 0.
    sorted_data = sorted(
        data, key=lambda x: _number_or_zero(x.get("elo")), reverse=True
    )

    html = """
    <style>
        .leaderboard-table {
            width: 100%;
            border-collapse: collapse;
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        }
        .leaderboard-table th {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 12px;
            text-align: left;
        }
        .leaderboard-table td {
            padding: 10px 12px;
            border-bottom: 1px solid #ddd;
        }
        .leaderboard-table tr:nth-child(even) {
            background-color: #f8f9fa;
        }
        .leaderboard-table tr:hover {
            background-color: #e9ecef;
        }
        .rank-1 { color: #ffd700; font-weight: bold; }
        .rank-2 { color: #c0c0c0; font-weight: bold; }
        .rank-3 { color: #cd7f32; font-weight: bold; }
        .model-link { color: #667eea; text-decoration: none; }
        .model-link:hover { text-decoration: underline; }
        .legal-good { color: #28a745; }
        .legal-medium { color: #ffc107; }
        .legal-bad { color: #dc3545; }
    </style>
    <table class="leaderboard-table">
        <thead>
            <tr>
                <th>Rank</th>
                <th>Model</th>
                <th>Legal Rate</th>
                <th>ELO</th>
                <th>Win Rate</th>
                <th>Games</th>
                <th>Last Updated</th>
            </tr>
        </thead>
        <tbody>
    """

    medals = ["🥇", "🥈", "🥉"]
    rows = []
    for i, entry in enumerate(sorted_data, 1):
        rank_class = f"rank-{i}" if i <= 3 else ""
        rank_display = medals[i - 1] if i <= 3 else str(i)

        model_id = str(entry["model_id"])
        model_url = escape(f"https://huggingface.co/{model_id}", quote=True)
        model_name = escape(model_id.split("/")[-1])

        # Color code legal rate
        legal_rate = _number_or_zero(entry.get("legal_rate"))
        if legal_rate >= 0.9:
            legal_class = "legal-good"
        elif legal_rate >= 0.7:
            legal_class = "legal-medium"
        else:
            legal_class = "legal-bad"

        # ":.0f" only works on numbers, so the "N/A" fallback is formatted
        # separately instead of being passed through the float format spec.
        elo = entry.get("elo")
        elo_display = f"{elo:.0f}" if _is_number(elo) else "N/A"

        win_rate = _number_or_zero(entry.get("win_rate"))
        games_played = escape(str(entry.get("games_played", 0)))
        last_updated = escape(str(entry.get("last_updated", "N/A")))

        rows.append(f"""
            <tr>
                <td class="{rank_class}">{rank_display}</td>
                <td><a href="{model_url}" target="_blank" class="model-link">{model_name}</a></td>
                <td class="{legal_class}">{legal_rate*100:.1f}%</td>
                <td><strong>{elo_display}</strong></td>
                <td>{win_rate*100:.1f}%</td>
                <td>{games_played}</td>
                <td>{last_updated}</td>
            </tr>
        """)

    html += "".join(rows)
    html += "</tbody></table>"
    return html
139
+
140
+
141
def render_board_svg(fen: Optional[str] = "startpos") -> str:
    """Render a chess position as an SVG string.

    Args:
        fen: FEN of the position to draw. ``"startpos"``, ``None`` and the
            empty string all mean the standard starting position (passing
            ``None`` straight to ``chess.Board`` would draw an empty board).

    Returns:
        The SVG markup of a 400px board, or a short HTML hint when
        python-chess is not installed.

    Raises:
        ValueError: If ``fen`` is a non-empty string that python-chess
            cannot parse.
    """
    try:
        import chess
        import chess.svg
    except ImportError:
        return "<p>Install python-chess to see the board</p>"

    if not fen or fen == "startpos":
        board = chess.Board()
    else:
        board = chess.Board(fen)

    return chess.svg.board(board, size=400)
155
+
156
+
157
def play_move(
    model_id: str,
    current_fen: str,
    move_history: str,
    temperature: float,
) -> tuple:
    """Sample one move token from the selected model and play it if legal.

    Args:
        model_id: Hub id of the model/tokenizer to load.
        current_fen: FEN of the current position, or ``"startpos"``.
        move_history: Space-separated move tokens played so far; used as the
            model prompt (the tokenizer's BOS token is used when empty).
        temperature: Sampling temperature. A non-positive value selects the
            most likely token instead of dividing by zero.

    Returns:
        A 4-tuple ``(board_html, fen, move_history, status_message)``.
        When the move is illegal or no model is selected, the position and
        history are returned unchanged. On an unexpected error the game is
        reset to the starting position and the error is reported in the
        status message.
    """
    try:
        if not model_id:
            return (
                render_board_svg(current_fen),
                current_fen,
                move_history,
                "⚠️ Please select a model first",
            )

        import chess
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import torch

        # Load model
        # NOTE: the model is reloaded on every call; this keeps the function
        # stateless at the cost of latency.
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
        model.eval()

        # Setup board
        board = chess.Board(current_fen) if current_fen != "startpos" else chess.Board()

        # Tokenize history
        prompt = move_history if move_history else tokenizer.bos_token
        inputs = tokenizer(prompt, return_tensors="pt")

        # Generate move: sample the next token from the last position's logits.
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits[:, -1, :]
            if temperature and temperature > 0:
                probs = torch.softmax(logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = torch.argmax(logits, dim=-1, keepdim=True)

        move_token = tokenizer.decode(next_token[0])

        # Parse move: characters 2..5 of the token are read as the UCI
        # from/to squares (presumably the first two characters encode side
        # and piece -- confirm against the tokenizer's vocabulary).
        move = None
        uci_move = move_token[2:6] if len(move_token) >= 6 else ""
        if uci_move:
            try:
                move = chess.Move.from_uci(uci_move)
            except ValueError:
                # Not a syntactically valid UCI move; reported as illegal below.
                move = None

        if move is not None and move in board.legal_moves:
            board.push(move)
            new_history = f"{move_history} {move_token}".strip()
            return (
                render_board_svg(board.fen()),
                board.fen(),
                new_history,
                f"Model played: {move_token} ({uci_move})",
            )

        # render_board_svg understands "startpos" itself, so the current
        # position is passed through unchanged.
        return (
            render_board_svg(current_fen),
            current_fen,
            move_history,
            f"⚠️ Model generated illegal move: {move_token}",
        )

    except Exception as e:
        return (
            render_board_svg(),
            "startpos",
            "",
            f"❌ Error: {str(e)}",
        )
223
+
224
+
225
def evaluate_legal_moves(
    model_id: str,
    n_positions: int,
    progress: gr.Progress = gr.Progress(),
) -> str:
    """Evaluate how often a model produces legal moves and record the result.

    Loads the model through ``chess_challenge.evaluate``, runs its legal-move
    evaluation, stores ``legal_rate`` / ``legal_rate_first_try`` /
    ``last_updated`` on the model's leaderboard entry (creating the entry if
    needed) and saves the leaderboard.

    Args:
        model_id: Hub id of the model to evaluate.
        n_positions: Number of positions to test (coerced to ``int``).
        progress: Gradio progress tracker used for status updates.

    Returns:
        A Markdown report, or a ``"❌ Evaluation failed: ..."`` message if no
        model is selected or anything goes wrong.
    """
    try:
        if not model_id:
            return "❌ Evaluation failed: no model selected"
        n_positions = int(n_positions)

        import sys

        # Make the project's ``src`` directory importable, but only once:
        # inserting on every call would grow sys.path without bound.
        src_dir = str(Path(__file__).parent.parent / "src")
        if src_dir not in sys.path:
            sys.path.insert(0, src_dir)

        from chess_challenge.evaluate import ChessEvaluator, load_model_from_hub

        progress(0, desc="Loading model...")
        model, tokenizer = load_model_from_hub(model_id)

        progress(0.1, desc="Setting up evaluator...")
        evaluator = ChessEvaluator(
            model=model,
            tokenizer=tokenizer,
            stockfish_level=1,  # Not used for legal move eval
        )

        progress(0.2, desc=f"Testing {n_positions} positions...")
        results = evaluator.evaluate_legal_moves(n_positions=n_positions, verbose=False)

        # Update leaderboard
        leaderboard = load_leaderboard()
        entry = next((e for e in leaderboard if e["model_id"] == model_id), None)
        if entry is None:
            entry = {"model_id": model_id}
            leaderboard.append(entry)

        entry.update({
            "legal_rate": results.get("legal_rate_with_retry", 0),
            "legal_rate_first_try": results.get("legal_rate_first_try", 0),
            "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M"),
        })

        save_leaderboard(leaderboard)
        progress(1.0, desc="Done!")

        legal_total = results['legal_first_try'] + results['legal_with_retry']
        report_lines = [
            "",
            f"## ✅ Legal Move Evaluation for {model_id.split('/')[-1]}",
            "",
            "| Metric | Value |",
            "|--------|-------|",
            f"| **Positions Tested** | {results['total_positions']} |",
            f"| **Legal (1st try)** | {results['legal_first_try']} ({results['legal_rate_first_try']*100:.1f}%) |",
            f"| **Legal (with retries)** | {legal_total} ({results['legal_rate_with_retry']*100:.1f}%) |",
            f"| **Always Illegal** | {results['illegal_all_retries']} ({results['illegal_rate']*100:.1f}%) |",
            "",
            "### Interpretation",
            "- **>90% legal rate**: Great! Model has learned chess rules well.",
            "- **70-90% legal rate**: Decent, but room for improvement.",
            "- **<70% legal rate**: Model struggles with legal move generation.",
            "",
        ]
        return "\n".join(report_lines)

    except Exception as e:
        return f"❌ Evaluation failed: {str(e)}"
284
+
285
+
286
def _run_stockfish_evaluation(
    model_id: str,
    stockfish_level: str,
    n_games: int,
    progress,
    heading: str,
    record_illegal_rate: bool,
) -> str:
    """Play games against Stockfish, update the leaderboard, build a report.

    Shared implementation behind ``evaluate_winrate`` and ``evaluate_model``.

    Args:
        model_id: Hub id of the model to evaluate.
        stockfish_level: Label from ``STOCKFISH_LEVELS``; unknown labels fall
            back to level 1.
        n_games: Number of games to play (coerced to ``int``).
        progress: Gradio progress tracker used for status updates.
        heading: Markdown heading text placed before the model name.
        record_illegal_rate: Whether to also store ``illegal_rate`` on the
            leaderboard entry.

    Returns:
        A Markdown report, or a ``"❌ Evaluation failed: ..."`` message.
    """
    try:
        if not model_id:
            return "❌ Evaluation failed: no model selected"
        n_games = int(n_games)

        import sys

        # Make the project's ``src`` directory importable, but only once:
        # inserting on every call would grow sys.path without bound.
        src_dir = str(Path(__file__).parent.parent / "src")
        if src_dir not in sys.path:
            sys.path.insert(0, src_dir)

        from chess_challenge.evaluate import ChessEvaluator, load_model_from_hub

        progress(0, desc="Loading model...")
        model, tokenizer = load_model_from_hub(model_id)

        progress(0.1, desc="Setting up Stockfish...")
        level = STOCKFISH_LEVELS.get(stockfish_level, 1)
        evaluator = ChessEvaluator(
            model=model,
            tokenizer=tokenizer,
            stockfish_level=level,
        )

        progress(0.2, desc=f"Playing {n_games} games...")
        results = evaluator.evaluate(n_games=n_games, verbose=False)

        # Update leaderboard: find or create the entry for this model.
        leaderboard = load_leaderboard()
        entry = next((e for e in leaderboard if e["model_id"] == model_id), None)
        if entry is None:
            entry = {"model_id": model_id}
            leaderboard.append(entry)

        updates = {
            "elo": results.get("estimated_elo", 1000),
            "win_rate": results.get("win_rate", 0),
            "games_played": entry.get("games_played", 0) + n_games,
        }
        if record_illegal_rate:
            updates["illegal_rate"] = results.get("illegal_move_rate", 0)
        updates["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M")
        entry.update(updates)

        save_leaderboard(leaderboard)
        progress(1.0, desc="Done!")

        # ":.0f" cannot format the "N/A" fallback string, so the missing-ELO
        # case is rendered separately.
        elo = results.get("estimated_elo")
        if isinstance(elo, (int, float)) and not isinstance(elo, bool):
            elo_display = f"{elo:.0f}"
        else:
            elo_display = "N/A"

        report_lines = [
            "",
            f"## {heading} {model_id.split('/')[-1]}",
            "",
            "| Metric | Value |",
            "|--------|-------|",
            f"| **Estimated ELO** | {elo_display} |",
            f"| **Win Rate** | {results.get('win_rate', 0)*100:.1f}% |",
            f"| **Draw Rate** | {results.get('draw_rate', 0)*100:.1f}% |",
            f"| **Loss Rate** | {results.get('loss_rate', 0)*100:.1f}% |",
            f"| **Avg Game Length** | {results.get('avg_game_length', 0):.1f} moves |",
            f"| **Illegal Move Rate** | {results.get('illegal_move_rate', 0)*100:.2f}% |",
            "",
            f"Games played: {n_games} against Stockfish {stockfish_level}",
            "",
        ]
        return "\n".join(report_lines)

    except Exception as e:
        return f"❌ Evaluation failed: {str(e)}"


def evaluate_winrate(
    model_id: str,
    stockfish_level: str,
    n_games: int,
    progress: gr.Progress = gr.Progress(),
) -> str:
    """Evaluate a model's win rate against Stockfish.

    Stores ``elo``, ``win_rate``, the accumulated ``games_played`` and
    ``last_updated`` on the model's leaderboard entry.

    Args:
        model_id: Hub id of the model to evaluate.
        stockfish_level: Label from ``STOCKFISH_LEVELS``.
        n_games: Number of games to play.
        progress: Gradio progress tracker.

    Returns:
        A Markdown report, or a failure message.
    """
    return _run_stockfish_evaluation(
        model_id,
        stockfish_level,
        n_games,
        progress,
        heading="🏆 Win Rate Evaluation for",
        record_illegal_rate=False,
    )


def evaluate_model(
    model_id: str,
    stockfish_level: str,
    n_games: int,
    progress: gr.Progress = gr.Progress(),
) -> str:
    """Evaluate a model against Stockfish.

    Same as ``evaluate_winrate`` but additionally stores ``illegal_rate`` on
    the leaderboard entry and uses a generic report heading.

    Args:
        model_id: Hub id of the model to evaluate.
        stockfish_level: Label from ``STOCKFISH_LEVELS``.
        n_games: Number of games to play.
        progress: Gradio progress tracker.

    Returns:
        A Markdown report, or a failure message.
    """
    return _run_stockfish_evaluation(
        model_id,
        stockfish_level,
        n_games,
        progress,
        heading="Evaluation Results for",
        record_illegal_rate=True,
    )
415
+
416
+
417
def refresh_leaderboard() -> str:
    """Reload the leaderboard from disk and return it rendered as HTML."""
    entries = load_leaderboard()
    return format_leaderboard_html(entries)
420
+
421
+
422
# Build Gradio Interface

# Query the Hub once and share the result between the three model dropdowns;
# every call to get_available_models() is a separate lookup.
_AVAILABLE_MODELS = get_available_models()

with gr.Blocks(
    title="Chess Challenge Arena",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown("""
    # ♟️ Chess Challenge Arena

    Welcome to the LLM Chess Challenge evaluation arena!
    Test your models, see the leaderboard, and compete with your classmates.
    """)

    with gr.Tabs():
        # Leaderboard Tab
        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("### Current Rankings")
            leaderboard_html = gr.HTML(value=format_leaderboard_html(load_leaderboard()))
            refresh_btn = gr.Button("🔄 Refresh Leaderboard")
            refresh_btn.click(refresh_leaderboard, outputs=leaderboard_html)

        # Interactive Demo Tab
        with gr.TabItem("🎮 Interactive Demo"):
            gr.Markdown("### Test a Model")

            with gr.Row():
                with gr.Column(scale=1):
                    model_dropdown = gr.Dropdown(
                        choices=_AVAILABLE_MODELS,
                        label="Select Model",
                        value=None,
                    )
                    temperature_slider = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature",
                    )

                    with gr.Row():
                        play_btn = gr.Button("▶️ Model Move", variant="primary")
                        reset_btn = gr.Button("🔄 Reset")

                    status_text = gr.Textbox(label="Status", interactive=False)

                with gr.Column(scale=1):
                    board_display = gr.HTML(value=render_board_svg())

            # Hidden state: current position and the move tokens played so far.
            current_fen = gr.State("startpos")
            move_history = gr.State("")

            play_btn.click(
                play_move,
                inputs=[model_dropdown, current_fen, move_history, temperature_slider],
                outputs=[board_display, current_fen, move_history, status_text],
            )

            def reset_game():
                """Return the initial board, position, history and status."""
                return render_board_svg(), "startpos", "", "Game reset!"

            reset_btn.click(
                reset_game,
                outputs=[board_display, current_fen, move_history, status_text],
            )

        # Legal Move Evaluation Tab
        with gr.TabItem("✅ Legal Move Eval"):
            gr.Markdown("""
            ### Phase 1: Legal Move Evaluation

            Test if your model can generate **legal chess moves** in random positions.
            This is a quick first check before running full games.

            - Tests the model on random board positions
            - Measures how often it generates legal moves
            - **Recommended before win rate evaluation**
            """)

            with gr.Row():
                legal_model = gr.Dropdown(
                    choices=_AVAILABLE_MODELS,
                    label="Model to Evaluate",
                )
                legal_positions = gr.Slider(
                    minimum=100,
                    maximum=1000,
                    value=500,
                    step=100,
                    label="Number of Positions",
                )

            legal_btn = gr.Button("✅ Run Legal Move Evaluation", variant="primary")
            legal_results = gr.Markdown()

            # The evaluation writes to the leaderboard file, so the table on
            # the first tab is re-rendered once it finishes.
            legal_btn.click(
                evaluate_legal_moves,
                inputs=[legal_model, legal_positions],
                outputs=legal_results,
            ).then(refresh_leaderboard, outputs=leaderboard_html)

        # Win Rate Evaluation Tab
        with gr.TabItem("🏆 Win Rate Eval"):
            gr.Markdown("""
            ### Phase 2: Win Rate Evaluation

            Play full games against Stockfish and measure win rate.
            This evaluation computes your model's **ELO rating**.

            - Plays complete games against Stockfish
            - Measures win/draw/loss rates
            - Estimates ELO rating
            """)

            with gr.Row():
                eval_model = gr.Dropdown(
                    choices=_AVAILABLE_MODELS,
                    label="Model to Evaluate",
                )
                eval_level = gr.Dropdown(
                    choices=list(STOCKFISH_LEVELS.keys()),
                    value="Easy (Level 1)",
                    label="Stockfish Level",
                )
                eval_games = gr.Slider(
                    minimum=10,
                    maximum=100,
                    value=50,
                    step=10,
                    label="Number of Games",
                )

            eval_btn = gr.Button("🏆 Run Win Rate Evaluation", variant="primary")
            eval_results = gr.Markdown()

            # Same as above: refresh the leaderboard table after the run.
            eval_btn.click(
                evaluate_winrate,
                inputs=[eval_model, eval_level, eval_games],
                outputs=eval_results,
            ).then(refresh_leaderboard, outputs=leaderboard_html)

        # Submission Guide Tab
        with gr.TabItem("📤 How to Submit"):
            gr.Markdown(f"""
            ### Submitting Your Model

            1. **Train your model** using the Chess Challenge template

            2. **Push to Hugging Face Hub**:
            ```python
            from chess_challenge import ChessForCausalLM, ChessTokenizer

            # After training
            model.push_to_hub("your-model-name", organization="{ORGANIZATION}")
            tokenizer.push_to_hub("your-model-name", organization="{ORGANIZATION}")
            ```

            3. **Verify your submission** by checking the model page on Hugging Face

            4. **Run evaluations**:
            - First: **Legal Move Eval** (quick sanity check)
            - Then: **Win Rate Eval** (full ELO computation)

            ### Requirements

            - Model must be under **1M parameters**
            - Model must use the `ChessConfig` and `ChessForCausalLM` classes
            - Include the tokenizer with your submission

            ### Tips for Better Performance

            - Experiment with different architectures (layers, heads, dimensions)
            - Try weight tying to save parameters
            - Fine-tune on high-quality games only
            - Use RL fine-tuning with Stockfish rewards
            """)


if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=4.44.0
2
+ transformers>=4.40.0
3
+ torch>=2.0.0
4
+ python-chess>=1.999
5
+ huggingface-hub>=0.20.0
6
+ datasets>=2.14.0