Spaces:
Running
Running
Commit
·
560a461
1
Parent(s):
19d5912
updated instructions
Browse files
- app.py +60 -54
- src/evaluate.py +24 -5
app.py
CHANGED
|
@@ -1,11 +1,15 @@
|
|
| 1 |
"""
|
| 2 |
-
Chess
|
| 3 |
|
| 4 |
This Gradio app provides:
|
| 5 |
1. Interactive demo to test models
|
| 6 |
2. Leaderboard of submitted models
|
| 7 |
3. Live game visualization
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
Leaderboard data is stored in a private HuggingFace dataset for persistence.
|
| 10 |
"""
|
| 11 |
|
|
@@ -45,7 +49,7 @@ LEADERBOARD_COLUMNS = [
|
|
| 45 |
"legal_rate",
|
| 46 |
"legal_rate_first_try",
|
| 47 |
"elo",
|
| 48 |
-
"win_rate",
|
| 49 |
"draw_rate",
|
| 50 |
"games_played",
|
| 51 |
"last_updated",
|
|
@@ -170,7 +174,7 @@ def format_leaderboard_html(data: list) -> str:
|
|
| 170 |
<th>Model</th>
|
| 171 |
<th>Legal Rate</th>
|
| 172 |
<th>ELO</th>
|
| 173 |
-
<th>Win Rate</th>
|
| 174 |
<th>Games</th>
|
| 175 |
<th>Last Updated</th>
|
| 176 |
</tr>
|
|
@@ -199,7 +203,7 @@ def format_leaderboard_html(data: list) -> str:
|
|
| 199 |
<td><a href="{model_url}" target="_blank" class="model-link">{entry['model_id'].split('/')[-1]}</a></td>
|
| 200 |
<td class="{legal_class}">{legal_rate*100:.1f}%</td>
|
| 201 |
<td><strong>{entry.get('elo', 'N/A'):.0f}</strong></td>
|
| 202 |
-
<td>{entry.get('win_rate', 0)*100:.1f}%</td>
|
| 203 |
<td>{entry.get('games_played', 0)}</td>
|
| 204 |
<td>{entry.get('last_updated', 'N/A')}</td>
|
| 205 |
</tr>
|
|
@@ -492,14 +496,16 @@ def refresh_leaderboard() -> str:
|
|
| 492 |
|
| 493 |
# Build Gradio Interface
|
| 494 |
with gr.Blocks(
|
| 495 |
-
title="Chess
|
| 496 |
theme=gr.themes.Soft(),
|
| 497 |
) as demo:
|
| 498 |
gr.Markdown("""
|
| 499 |
-
#
|
|
|
|
|
|
|
|
|
|
| 500 |
|
| 501 |
-
|
| 502 |
-
Test your models, see the leaderboard, and compete with your classmates.
|
| 503 |
""")
|
| 504 |
|
| 505 |
with gr.Tabs():
|
|
@@ -539,13 +545,6 @@ with gr.Blocks(
|
|
| 539 |
- Use RL fine-tuning with Stockfish rewards
|
| 540 |
""")
|
| 541 |
|
| 542 |
-
# Leaderboard Tab
|
| 543 |
-
with gr.TabItem("🏆 Leaderboard"):
|
| 544 |
-
gr.Markdown("### Current Rankings")
|
| 545 |
-
leaderboard_html = gr.HTML(value=format_leaderboard_html(load_leaderboard()))
|
| 546 |
-
refresh_btn = gr.Button("Refresh Leaderboard")
|
| 547 |
-
refresh_btn.click(refresh_leaderboard, outputs=leaderboard_html)
|
| 548 |
-
|
| 549 |
# Interactive Demo Tab
|
| 550 |
with gr.TabItem("🎮 Interactive Demo"):
|
| 551 |
gr.Markdown("### Test a Model")
|
|
@@ -627,45 +626,52 @@ with gr.Blocks(
|
|
| 627 |
outputs=legal_results,
|
| 628 |
)
|
| 629 |
|
| 630 |
-
# Win Rate Evaluation Tab
|
| 631 |
-
with gr.TabItem("🏆 Win Rate Eval"):
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 669 |
|
| 670 |
|
| 671 |
# =============================================================================
|
|
|
|
| 1 |
"""
|
| 2 |
+
Play Chess like a Honey Bee
|
| 3 |
|
| 4 |
This Gradio app provides:
|
| 5 |
1. Interactive demo to test models
|
| 6 |
2. Leaderboard of submitted models
|
| 7 |
3. Live game visualization
|
| 8 |
|
| 9 |
+
Instructions:
|
| 10 |
+
The goal is to train a language model to play chess, under a strict constraint:
|
| 11 |
+
less than 1M parameters! This is approximately the number of neurons of a honey bee.
|
| 12 |
+
|
| 13 |
Leaderboard data is stored in a private HuggingFace dataset for persistence.
|
| 14 |
"""
|
| 15 |
|
|
|
|
| 49 |
"legal_rate",
|
| 50 |
"legal_rate_first_try",
|
| 51 |
"elo",
|
| 52 |
+
# "win_rate",
|
| 53 |
"draw_rate",
|
| 54 |
"games_played",
|
| 55 |
"last_updated",
|
|
|
|
| 174 |
<th>Model</th>
|
| 175 |
<th>Legal Rate</th>
|
| 176 |
<th>ELO</th>
|
| 177 |
+
<!-- <th>Win Rate</th> -->
|
| 178 |
<th>Games</th>
|
| 179 |
<th>Last Updated</th>
|
| 180 |
</tr>
|
|
|
|
| 203 |
<td><a href="{model_url}" target="_blank" class="model-link">{entry['model_id'].split('/')[-1]}</a></td>
|
| 204 |
<td class="{legal_class}">{legal_rate*100:.1f}%</td>
|
| 205 |
<td><strong>{entry.get('elo', 'N/A'):.0f}</strong></td>
|
| 206 |
+
<!-- <td>{entry.get('win_rate', 0)*100:.1f}%</td> -->
|
| 207 |
<td>{entry.get('games_played', 0)}</td>
|
| 208 |
<td>{entry.get('last_updated', 'N/A')}</td>
|
| 209 |
</tr>
|
|
|
|
| 496 |
|
| 497 |
# Build Gradio Interface
|
| 498 |
with gr.Blocks(
|
| 499 |
+
title="Play Chess like a Honey Bee",
|
| 500 |
theme=gr.themes.Soft(),
|
| 501 |
) as demo:
|
| 502 |
gr.Markdown("""
|
| 503 |
+
# 🐝 Play Chess like a Honey Bee
|
| 504 |
+
|
| 505 |
+
Welcome to the Chess Challenge! The goal is to train a language model to play chess,
|
| 506 |
+
under a strict constraint: **less than 1M parameters!**
|
| 507 |
|
| 508 |
+
This is approximately the number of neurons of a honey bee 🐝
|
|
|
|
| 509 |
""")
|
| 510 |
|
| 511 |
with gr.Tabs():
|
|
|
|
| 545 |
- Use RL fine-tuning with Stockfish rewards
|
| 546 |
""")
|
| 547 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
# Interactive Demo Tab
|
| 549 |
with gr.TabItem("🎮 Interactive Demo"):
|
| 550 |
gr.Markdown("### Test a Model")
|
|
|
|
| 626 |
outputs=legal_results,
|
| 627 |
)
|
| 628 |
|
| 629 |
+
# Win Rate Evaluation Tab (commented out for now)
|
| 630 |
+
# with gr.TabItem("🏆 Win Rate Eval"):
|
| 631 |
+
# gr.Markdown("""
|
| 632 |
+
# ### Phase 2: Win Rate Evaluation
|
| 633 |
+
#
|
| 634 |
+
# Play full games against Stockfish and measure win rate.
|
| 635 |
+
# This evaluation computes your model's **ELO rating**.
|
| 636 |
+
#
|
| 637 |
+
# - Plays complete games against Stockfish
|
| 638 |
+
# - Measures win/draw/loss rates
|
| 639 |
+
# - Estimates ELO rating
|
| 640 |
+
# """)
|
| 641 |
+
#
|
| 642 |
+
# with gr.Row():
|
| 643 |
+
# eval_model = gr.Dropdown(
|
| 644 |
+
# choices=get_available_models(),
|
| 645 |
+
# label="Model to Evaluate",
|
| 646 |
+
# )
|
| 647 |
+
# eval_level = gr.Dropdown(
|
| 648 |
+
# choices=list(STOCKFISH_LEVELS.keys()),
|
| 649 |
+
# value="Easy (Level 1)",
|
| 650 |
+
# label="Stockfish Level",
|
| 651 |
+
# )
|
| 652 |
+
# eval_games = gr.Slider(
|
| 653 |
+
# minimum=10,
|
| 654 |
+
# maximum=100,
|
| 655 |
+
# value=50,
|
| 656 |
+
# step=10,
|
| 657 |
+
# label="Number of Games",
|
| 658 |
+
# )
|
| 659 |
+
#
|
| 660 |
+
# eval_btn = gr.Button("Run Win Rate Evaluation", variant="primary")
|
| 661 |
+
# eval_results = gr.Markdown()
|
| 662 |
+
#
|
| 663 |
+
# eval_btn.click(
|
| 664 |
+
# evaluate_winrate,
|
| 665 |
+
# inputs=[eval_model, eval_level, eval_games],
|
| 666 |
+
# outputs=eval_results,
|
| 667 |
+
# )
|
| 668 |
+
|
| 669 |
+
# Leaderboard Tab (moved to the end)
|
| 670 |
+
with gr.TabItem("🏆 Leaderboard"):
|
| 671 |
+
gr.Markdown("### Current Rankings")
|
| 672 |
+
leaderboard_html = gr.HTML(value=format_leaderboard_html(load_leaderboard()))
|
| 673 |
+
refresh_btn = gr.Button("Refresh Leaderboard")
|
| 674 |
+
refresh_btn.click(refresh_leaderboard, outputs=leaderboard_html)
|
| 675 |
|
| 676 |
|
| 677 |
# =============================================================================
|
src/evaluate.py
CHANGED
|
@@ -506,9 +506,13 @@ def load_model_from_hub(model_id: str, device: str = "auto"):
|
|
| 506 |
with open(config_path, "r") as f:
|
| 507 |
config_dict = json.load(f)
|
| 508 |
|
| 509 |
-
# Remove
|
| 510 |
config_dict.pop("model_type", None)
|
| 511 |
config_dict.pop("architectures", None)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 512 |
config = ChessConfig(**config_dict)
|
| 513 |
|
| 514 |
# Load model weights with our config
|
|
@@ -518,12 +522,27 @@ def load_model_from_hub(model_id: str, device: str = "auto"):
|
|
| 518 |
device_map=device,
|
| 519 |
)
|
| 520 |
|
| 521 |
-
# Load tokenizer
|
| 522 |
try:
|
| 523 |
tokenizer = ChessTokenizer.from_pretrained(model_id)
|
| 524 |
except Exception as e:
|
| 525 |
-
print(f"ChessTokenizer failed
|
| 526 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
|
| 528 |
return model, tokenizer
|
| 529 |
|
|
@@ -537,7 +556,7 @@ def main():
|
|
| 537 |
help="Path to the model or Hugging Face model ID"
|
| 538 |
)
|
| 539 |
parser.add_argument(
|
| 540 |
-
"--mode", type=str, default="
|
| 541 |
help="Evaluation mode: 'legal' for legal move rate, 'winrate' for games, 'both' for both"
|
| 542 |
)
|
| 543 |
parser.add_argument(
|
|
|
|
| 506 |
with open(config_path, "r") as f:
|
| 507 |
config_dict = json.load(f)
|
| 508 |
|
| 509 |
+
# Remove fields that are not in ChessConfig to avoid unexpected kwargs
|
| 510 |
config_dict.pop("model_type", None)
|
| 511 |
config_dict.pop("architectures", None)
|
| 512 |
+
config_dict.pop("transformers_version", None)
|
| 513 |
+
config_dict.pop("dtype", None)
|
| 514 |
+
config_dict.pop("torch_dtype", None)
|
| 515 |
+
|
| 516 |
config = ChessConfig(**config_dict)
|
| 517 |
|
| 518 |
# Load model weights with our config
|
|
|
|
| 522 |
device_map=device,
|
| 523 |
)
|
| 524 |
|
| 525 |
+
# Load tokenizer - try to find vocab.json, else build default
|
| 526 |
try:
|
| 527 |
tokenizer = ChessTokenizer.from_pretrained(model_id)
|
| 528 |
except Exception as e:
|
| 529 |
+
print(f"ChessTokenizer.from_pretrained failed: {e}")
|
| 530 |
+
try:
|
| 531 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
|
| 532 |
+
except Exception as e2:
|
| 533 |
+
print(f"AutoTokenizer also failed: {e2}")
|
| 534 |
+
print("Creating default tokenizer with vocab_size from config...")
|
| 535 |
+
# Create a minimal tokenizer with just the vocab size
|
| 536 |
+
tokenizer = ChessTokenizer()
|
| 537 |
+
# Ensure vocab size matches model
|
| 538 |
+
if hasattr(config, 'vocab_size'):
|
| 539 |
+
# Build a placeholder vocab of the right size
|
| 540 |
+
tokenizer._vocab = {f"[MOVE_{i}]": i for i in range(config.vocab_size)}
|
| 541 |
+
tokenizer._vocab["[PAD]"] = 0
|
| 542 |
+
tokenizer._vocab["[BOS]"] = 1
|
| 543 |
+
tokenizer._vocab["[EOS]"] = 2
|
| 544 |
+
tokenizer._vocab["[UNK]"] = 3
|
| 545 |
+
tokenizer._ids_to_tokens = {v: k for k, v in tokenizer._vocab.items()}
|
| 546 |
|
| 547 |
return model, tokenizer
|
| 548 |
|
|
|
|
| 556 |
help="Path to the model or Hugging Face model ID"
|
| 557 |
)
|
| 558 |
parser.add_argument(
|
| 559 |
+
"--mode", type=str, default="legal", choices=["legal", "winrate", "both"],
|
| 560 |
help="Evaluation mode: 'legal' for legal move rate, 'winrate' for games, 'both' for both"
|
| 561 |
)
|
| 562 |
parser.add_argument(
|