nathanael-fijalkow committed on
Commit
560a461
·
1 Parent(s): 19d5912

updated instructions

Browse files
Files changed (2) hide show
  1. app.py +60 -54
  2. src/evaluate.py +24 -5
app.py CHANGED
@@ -1,11 +1,15 @@
1
  """
2
- Chess Challenge Arena - Hugging Face Space
3
 
4
  This Gradio app provides:
5
  1. Interactive demo to test models
6
  2. Leaderboard of submitted models
7
  3. Live game visualization
8
 
 
 
 
 
9
  Leaderboard data is stored in a private HuggingFace dataset for persistence.
10
  """
11
 
@@ -45,7 +49,7 @@ LEADERBOARD_COLUMNS = [
45
  "legal_rate",
46
  "legal_rate_first_try",
47
  "elo",
48
- "win_rate",
49
  "draw_rate",
50
  "games_played",
51
  "last_updated",
@@ -170,7 +174,7 @@ def format_leaderboard_html(data: list) -> str:
170
  <th>Model</th>
171
  <th>Legal Rate</th>
172
  <th>ELO</th>
173
- <th>Win Rate</th>
174
  <th>Games</th>
175
  <th>Last Updated</th>
176
  </tr>
@@ -199,7 +203,7 @@ def format_leaderboard_html(data: list) -> str:
199
  <td><a href="{model_url}" target="_blank" class="model-link">{entry['model_id'].split('/')[-1]}</a></td>
200
  <td class="{legal_class}">{legal_rate*100:.1f}%</td>
201
  <td><strong>{entry.get('elo', 'N/A'):.0f}</strong></td>
202
- <td>{entry.get('win_rate', 0)*100:.1f}%</td>
203
  <td>{entry.get('games_played', 0)}</td>
204
  <td>{entry.get('last_updated', 'N/A')}</td>
205
  </tr>
@@ -492,14 +496,16 @@ def refresh_leaderboard() -> str:
492
 
493
  # Build Gradio Interface
494
  with gr.Blocks(
495
- title="Chess Challenge Arena",
496
  theme=gr.themes.Soft(),
497
  ) as demo:
498
  gr.Markdown("""
499
- # ♟️ Chess Challenge Arena
 
 
 
500
 
501
- Welcome to the LLM Chess Challenge evaluation arena!
502
- Test your models, see the leaderboard, and compete with your classmates.
503
  """)
504
 
505
  with gr.Tabs():
@@ -539,13 +545,6 @@ with gr.Blocks(
539
  - Use RL fine-tuning with Stockfish rewards
540
  """)
541
 
542
- # Leaderboard Tab
543
- with gr.TabItem("🏆 Leaderboard"):
544
- gr.Markdown("### Current Rankings")
545
- leaderboard_html = gr.HTML(value=format_leaderboard_html(load_leaderboard()))
546
- refresh_btn = gr.Button("Refresh Leaderboard")
547
- refresh_btn.click(refresh_leaderboard, outputs=leaderboard_html)
548
-
549
  # Interactive Demo Tab
550
  with gr.TabItem("🎮 Interactive Demo"):
551
  gr.Markdown("### Test a Model")
@@ -627,45 +626,52 @@ with gr.Blocks(
627
  outputs=legal_results,
628
  )
629
 
630
- # Win Rate Evaluation Tab
631
- with gr.TabItem("🏆 Win Rate Eval"):
632
- gr.Markdown("""
633
- ### Phase 2: Win Rate Evaluation
634
-
635
- Play full games against Stockfish and measure win rate.
636
- This evaluation computes your model's **ELO rating**.
637
-
638
- - Plays complete games against Stockfish
639
- - Measures win/draw/loss rates
640
- - Estimates ELO rating
641
- """)
642
-
643
- with gr.Row():
644
- eval_model = gr.Dropdown(
645
- choices=get_available_models(),
646
- label="Model to Evaluate",
647
- )
648
- eval_level = gr.Dropdown(
649
- choices=list(STOCKFISH_LEVELS.keys()),
650
- value="Easy (Level 1)",
651
- label="Stockfish Level",
652
- )
653
- eval_games = gr.Slider(
654
- minimum=10,
655
- maximum=100,
656
- value=50,
657
- step=10,
658
- label="Number of Games",
659
- )
660
-
661
- eval_btn = gr.Button("Run Win Rate Evaluation", variant="primary")
662
- eval_results = gr.Markdown()
663
-
664
- eval_btn.click(
665
- evaluate_winrate,
666
- inputs=[eval_model, eval_level, eval_games],
667
- outputs=eval_results,
668
- )
 
 
 
 
 
 
 
669
 
670
 
671
  # =============================================================================
 
1
  """
2
+ Play Chess like a Honey Bee
3
 
4
  This Gradio app provides:
5
  1. Interactive demo to test models
6
  2. Leaderboard of submitted models
7
  3. Live game visualization
8
 
9
+ Instructions:
10
+ The goal is to train a language model to play chess, under a strict constraint:
11
+ less than 1M parameters! This is approximately the number of neurons of a honey bee.
12
+
13
  Leaderboard data is stored in a private HuggingFace dataset for persistence.
14
  """
15
 
 
49
  "legal_rate",
50
  "legal_rate_first_try",
51
  "elo",
52
+ # "win_rate",
53
  "draw_rate",
54
  "games_played",
55
  "last_updated",
 
174
  <th>Model</th>
175
  <th>Legal Rate</th>
176
  <th>ELO</th>
177
+ <!-- <th>Win Rate</th> -->
178
  <th>Games</th>
179
  <th>Last Updated</th>
180
  </tr>
 
203
  <td><a href="{model_url}" target="_blank" class="model-link">{entry['model_id'].split('/')[-1]}</a></td>
204
  <td class="{legal_class}">{legal_rate*100:.1f}%</td>
205
  <td><strong>{entry.get('elo', 'N/A'):.0f}</strong></td>
206
+ <!-- <td>{entry.get('win_rate', 0)*100:.1f}%</td> -->
207
  <td>{entry.get('games_played', 0)}</td>
208
  <td>{entry.get('last_updated', 'N/A')}</td>
209
  </tr>
 
496
 
497
  # Build Gradio Interface
498
  with gr.Blocks(
499
+ title="Play Chess like a Honey Bee",
500
  theme=gr.themes.Soft(),
501
  ) as demo:
502
  gr.Markdown("""
503
+ # 🐝 Play Chess like a Honey Bee
504
+
505
+ Welcome to the Chess Challenge! The goal is to train a language model to play chess,
506
+ under a strict constraint: **less than 1M parameters!**
507
 
508
+ This is approximately the number of neurons of a honey bee 🐝
 
509
  """)
510
 
511
  with gr.Tabs():
 
545
  - Use RL fine-tuning with Stockfish rewards
546
  """)
547
 
 
 
 
 
 
 
 
548
  # Interactive Demo Tab
549
  with gr.TabItem("🎮 Interactive Demo"):
550
  gr.Markdown("### Test a Model")
 
626
  outputs=legal_results,
627
  )
628
 
629
+ # Win Rate Evaluation Tab (commented out for now)
630
+ # with gr.TabItem("🏆 Win Rate Eval"):
631
+ # gr.Markdown("""
632
+ # ### Phase 2: Win Rate Evaluation
633
+ #
634
+ # Play full games against Stockfish and measure win rate.
635
+ # This evaluation computes your model's **ELO rating**.
636
+ #
637
+ # - Plays complete games against Stockfish
638
+ # - Measures win/draw/loss rates
639
+ # - Estimates ELO rating
640
+ # """)
641
+ #
642
+ # with gr.Row():
643
+ # eval_model = gr.Dropdown(
644
+ # choices=get_available_models(),
645
+ # label="Model to Evaluate",
646
+ # )
647
+ # eval_level = gr.Dropdown(
648
+ # choices=list(STOCKFISH_LEVELS.keys()),
649
+ # value="Easy (Level 1)",
650
+ # label="Stockfish Level",
651
+ # )
652
+ # eval_games = gr.Slider(
653
+ # minimum=10,
654
+ # maximum=100,
655
+ # value=50,
656
+ # step=10,
657
+ # label="Number of Games",
658
+ # )
659
+ #
660
+ # eval_btn = gr.Button("Run Win Rate Evaluation", variant="primary")
661
+ # eval_results = gr.Markdown()
662
+ #
663
+ # eval_btn.click(
664
+ # evaluate_winrate,
665
+ # inputs=[eval_model, eval_level, eval_games],
666
+ # outputs=eval_results,
667
+ # )
668
+
669
+ # Leaderboard Tab (moved to the end)
670
+ with gr.TabItem("🏆 Leaderboard"):
671
+ gr.Markdown("### Current Rankings")
672
+ leaderboard_html = gr.HTML(value=format_leaderboard_html(load_leaderboard()))
673
+ refresh_btn = gr.Button("Refresh Leaderboard")
674
+ refresh_btn.click(refresh_leaderboard, outputs=leaderboard_html)
675
 
676
 
677
  # =============================================================================
src/evaluate.py CHANGED
@@ -506,9 +506,13 @@ def load_model_from_hub(model_id: str, device: str = "auto"):
506
  with open(config_path, "r") as f:
507
  config_dict = json.load(f)
508
 
509
- # Remove model_type to avoid conflicts, instantiate our config directly
510
  config_dict.pop("model_type", None)
511
  config_dict.pop("architectures", None)
 
 
 
 
512
  config = ChessConfig(**config_dict)
513
 
514
  # Load model weights with our config
@@ -518,12 +522,27 @@ def load_model_from_hub(model_id: str, device: str = "auto"):
518
  device_map=device,
519
  )
520
 
521
- # Load tokenizer
522
  try:
523
  tokenizer = ChessTokenizer.from_pretrained(model_id)
524
  except Exception as e:
525
- print(f"ChessTokenizer failed ({e}), trying AutoTokenizer...")
526
- tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
 
528
  return model, tokenizer
529
 
@@ -537,7 +556,7 @@ def main():
537
  help="Path to the model or Hugging Face model ID"
538
  )
539
  parser.add_argument(
540
- "--mode", type=str, default="both", choices=["legal", "winrate", "both"],
541
  help="Evaluation mode: 'legal' for legal move rate, 'winrate' for games, 'both' for both"
542
  )
543
  parser.add_argument(
 
506
  with open(config_path, "r") as f:
507
  config_dict = json.load(f)
508
 
509
+ # Remove fields that are not in ChessConfig to avoid unexpected kwargs
510
  config_dict.pop("model_type", None)
511
  config_dict.pop("architectures", None)
512
+ config_dict.pop("transformers_version", None)
513
+ config_dict.pop("dtype", None)
514
+ config_dict.pop("torch_dtype", None)
515
+
516
  config = ChessConfig(**config_dict)
517
 
518
  # Load model weights with our config
 
522
  device_map=device,
523
  )
524
 
525
+ # Load tokenizer - try to find vocab.json, else build default
526
  try:
527
  tokenizer = ChessTokenizer.from_pretrained(model_id)
528
  except Exception as e:
529
+ print(f"ChessTokenizer.from_pretrained failed: {e}")
530
+ try:
531
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
532
+ except Exception as e2:
533
+ print(f"AutoTokenizer also failed: {e2}")
534
+ print("Creating default tokenizer with vocab_size from config...")
535
+ # Create a minimal tokenizer with just the vocab size
536
+ tokenizer = ChessTokenizer()
537
+ # Ensure vocab size matches model
538
+ if hasattr(config, 'vocab_size'):
539
+ # Build a placeholder vocab of the right size
540
+ tokenizer._vocab = {f"[MOVE_{i}]": i for i in range(config.vocab_size)}
541
+ tokenizer._vocab["[PAD]"] = 0
542
+ tokenizer._vocab["[BOS]"] = 1
543
+ tokenizer._vocab["[EOS]"] = 2
544
+ tokenizer._vocab["[UNK]"] = 3
545
+ tokenizer._ids_to_tokens = {v: k for k, v in tokenizer._vocab.items()}
546
 
547
  return model, tokenizer
548
 
 
556
  help="Path to the model or Hugging Face model ID"
557
  )
558
  parser.add_argument(
559
+ "--mode", type=str, default="legal", choices=["legal", "winrate", "both"],
560
  help="Evaluation mode: 'legal' for legal move rate, 'winrate' for games, 'both' for both"
561
  )
562
  parser.add_argument(