Spaces:

LLM-course
/

Chess1MChallenge

Running

App Files Files Community

nathanael-fijalkow committed 22 days ago

Commit

560a461

1 Parent(s): 19d5912

updated instructions

Browse files

Files changed (2) hide show

app.py +60 -54
src/evaluate.py +24 -5

app.py CHANGED Viewed

@@ -1,11 +1,15 @@
 """
-Chess Challenge Arena - Hugging Face Space
 This Gradio app provides:
 1. Interactive demo to test models
 2. Leaderboard of submitted models
 3. Live game visualization
 Leaderboard data is stored in a private HuggingFace dataset for persistence.
 """
@@ -45,7 +49,7 @@ LEADERBOARD_COLUMNS = [
     "legal_rate",
     "legal_rate_first_try",
     "elo",
-    "win_rate",
     "draw_rate",
     "games_played",
     "last_updated",
@@ -170,7 +174,7 @@ def format_leaderboard_html(data: list) -> str:
                 <th>Model</th>
                 <th>Legal Rate</th>
                 <th>ELO</th>
-                <th>Win Rate</th>
                 <th>Games</th>
                 <th>Last Updated</th>
             </tr>
@@ -199,7 +203,7 @@ def format_leaderboard_html(data: list) -> str:
                 <td><a href="{model_url}" target="_blank" class="model-link">{entry['model_id'].split('/')[-1]}</a></td>
                 <td class="{legal_class}">{legal_rate*100:.1f}%</td>
                 <td><strong>{entry.get('elo', 'N/A'):.0f}</strong></td>
-                <td>{entry.get('win_rate', 0)*100:.1f}%</td>
                 <td>{entry.get('games_played', 0)}</td>
                 <td>{entry.get('last_updated', 'N/A')}</td>
             </tr>
@@ -492,14 +496,16 @@ def refresh_leaderboard() -> str:
 # Build Gradio Interface
 with gr.Blocks(
-    title="Chess Challenge Arena",
     theme=gr.themes.Soft(),
 ) as demo:
     gr.Markdown("""
-    # ♟️ Chess Challenge Arena
-    Welcome to the LLM Chess Challenge evaluation arena!
-    Test your models, see the leaderboard, and compete with your classmates.
     """)
     with gr.Tabs():
@@ -539,13 +545,6 @@ with gr.Blocks(
             - Use RL fine-tuning with Stockfish rewards
             """)
-        # Leaderboard Tab
-        with gr.TabItem("🏆 Leaderboard"):
-            gr.Markdown("### Current Rankings")
-            leaderboard_html = gr.HTML(value=format_leaderboard_html(load_leaderboard()))
-            refresh_btn = gr.Button("Refresh Leaderboard")
-            refresh_btn.click(refresh_leaderboard, outputs=leaderboard_html)
         # Interactive Demo Tab
         with gr.TabItem("🎮 Interactive Demo"):
             gr.Markdown("### Test a Model")
@@ -627,45 +626,52 @@ with gr.Blocks(
                 outputs=legal_results,
             )
-        # Win Rate Evaluation Tab
-        with gr.TabItem("🏆 Win Rate Eval"):
-            gr.Markdown("""
-            ### Phase 2: Win Rate Evaluation
-            Play full games against Stockfish and measure win rate.
-            This evaluation computes your model's **ELO rating**.
-            - Plays complete games against Stockfish
-            - Measures win/draw/loss rates
-            - Estimates ELO rating
-            """)
-            with gr.Row():
-                eval_model = gr.Dropdown(
-                    choices=get_available_models(),
-                    label="Model to Evaluate",
-                )
-                eval_level = gr.Dropdown(
-                    choices=list(STOCKFISH_LEVELS.keys()),
-                    value="Easy (Level 1)",
-                    label="Stockfish Level",
-                )
-                eval_games = gr.Slider(
-                    minimum=10,
-                    maximum=100,
-                    value=50,
-                    step=10,
-                    label="Number of Games",
-                )
-            eval_btn = gr.Button("Run Win Rate Evaluation", variant="primary")
-            eval_results = gr.Markdown()
-            eval_btn.click(
-                evaluate_winrate,
-                inputs=[eval_model, eval_level, eval_games],
-                outputs=eval_results,
-            )
 # =============================================================================

 """
+Play Chess like a Honey Bee
 This Gradio app provides:
 1. Interactive demo to test models
 2. Leaderboard of submitted models
 3. Live game visualization
+Instructions:
+The goal is to train a language model to play chess, under a strict constraint:
+fewer than 1M parameters! This is approximately the number of neurons in a honey bee's brain.
 Leaderboard data is stored in a private HuggingFace dataset for persistence.
 """
     "legal_rate",
     "legal_rate_first_try",
     "elo",
+    # "win_rate",
     "draw_rate",
     "games_played",
     "last_updated",
                 <th>Model</th>
                 <th>Legal Rate</th>
                 <th>ELO</th>
+                <!-- <th>Win Rate</th> -->
                 <th>Games</th>
                 <th>Last Updated</th>
             </tr>
                 <td><a href="{model_url}" target="_blank" class="model-link">{entry['model_id'].split('/')[-1]}</a></td>
                 <td class="{legal_class}">{legal_rate*100:.1f}%</td>
                 <td><strong>{entry.get('elo', 'N/A'):.0f}</strong></td>
+                <!-- <td>{entry.get('win_rate', 0)*100:.1f}%</td> -->
                 <td>{entry.get('games_played', 0)}</td>
                 <td>{entry.get('last_updated', 'N/A')}</td>
             </tr>
 # Build Gradio Interface
 with gr.Blocks(
+    title="Play Chess like a Honey Bee",
     theme=gr.themes.Soft(),
 ) as demo:
     gr.Markdown("""
+    # 🐝 Play Chess like a Honey Bee
+    Welcome to the Chess Challenge! The goal is to train a language model to play chess,
+    under a strict constraint: **less than 1M parameters!**
+    This is approximately the number of neurons of a honey bee 🐝
     """)
     with gr.Tabs():
             - Use RL fine-tuning with Stockfish rewards
             """)
         # Interactive Demo Tab
         with gr.TabItem("🎮 Interactive Demo"):
             gr.Markdown("### Test a Model")
                 outputs=legal_results,
             )
+        # Win Rate Evaluation Tab (commented out for now)
+        # with gr.TabItem("🏆 Win Rate Eval"):
+        #     gr.Markdown("""
+        #     ### Phase 2: Win Rate Evaluation
+        #
+        #     Play full games against Stockfish and measure win rate.
+        #     This evaluation computes your model's **ELO rating**.
+        #
+        #     - Plays complete games against Stockfish
+        #     - Measures win/draw/loss rates
+        #     - Estimates ELO rating
+        #     """)
+        #
+        #     with gr.Row():
+        #         eval_model = gr.Dropdown(
+        #             choices=get_available_models(),
+        #             label="Model to Evaluate",
+        #         )
+        #         eval_level = gr.Dropdown(
+        #             choices=list(STOCKFISH_LEVELS.keys()),
+        #             value="Easy (Level 1)",
+        #             label="Stockfish Level",
+        #         )
+        #         eval_games = gr.Slider(
+        #             minimum=10,
+        #             maximum=100,
+        #             value=50,
+        #             step=10,
+        #             label="Number of Games",
+        #         )
+        #
+        #     eval_btn = gr.Button("Run Win Rate Evaluation", variant="primary")
+        #     eval_results = gr.Markdown()
+        #
+        #     eval_btn.click(
+        #         evaluate_winrate,
+        #         inputs=[eval_model, eval_level, eval_games],
+        #         outputs=eval_results,
+        #     )
+        # Leaderboard Tab (moved to the end)
+        with gr.TabItem("🏆 Leaderboard"):
+            gr.Markdown("### Current Rankings")
+            leaderboard_html = gr.HTML(value=format_leaderboard_html(load_leaderboard()))
+            refresh_btn = gr.Button("Refresh Leaderboard")
+            refresh_btn.click(refresh_leaderboard, outputs=leaderboard_html)
 # =============================================================================

src/evaluate.py CHANGED Viewed

@@ -506,9 +506,13 @@ def load_model_from_hub(model_id: str, device: str = "auto"):
     with open(config_path, "r") as f:
         config_dict = json.load(f)
-    # Remove model_type to avoid conflicts, instantiate our config directly
     config_dict.pop("model_type", None)
     config_dict.pop("architectures", None)
     config = ChessConfig(**config_dict)
     # Load model weights with our config
@@ -518,12 +522,27 @@ def load_model_from_hub(model_id: str, device: str = "auto"):
         device_map=device,
     )
-    # Load tokenizer
     try:
         tokenizer = ChessTokenizer.from_pretrained(model_id)
     except Exception as e:
-        print(f"ChessTokenizer failed ({e}), trying AutoTokenizer...")
-        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     return model, tokenizer
@@ -537,7 +556,7 @@ def main():
         help="Path to the model or Hugging Face model ID"
     )
     parser.add_argument(
-        "--mode", type=str, default="both", choices=["legal", "winrate", "both"],
         help="Evaluation mode: 'legal' for legal move rate, 'winrate' for games, 'both' for both"
     )
     parser.add_argument(

     with open(config_path, "r") as f:
         config_dict = json.load(f)
+    # Remove fields that are not in ChessConfig to avoid unexpected kwargs
     config_dict.pop("model_type", None)
     config_dict.pop("architectures", None)
+    config_dict.pop("transformers_version", None)
+    config_dict.pop("dtype", None)
+    config_dict.pop("torch_dtype", None)
     config = ChessConfig(**config_dict)
     # Load model weights with our config
         device_map=device,
     )
+    # Load tokenizer - try to find vocab.json, else build default
     try:
         tokenizer = ChessTokenizer.from_pretrained(model_id)
     except Exception as e:
+        print(f"ChessTokenizer.from_pretrained failed: {e}")
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        except Exception as e2:
+            print(f"AutoTokenizer also failed: {e2}")
+            print("Creating default tokenizer with vocab_size from config...")
+            # Create a minimal tokenizer with just the vocab size
+            tokenizer = ChessTokenizer()
+            # Ensure vocab size matches model
+            if hasattr(config, 'vocab_size'):
+                # Build a placeholder vocab of the right size
+                tokenizer._vocab = {f"[MOVE_{i}]": i for i in range(config.vocab_size)}
+                tokenizer._vocab["[PAD]"] = 0
+                tokenizer._vocab["[BOS]"] = 1
+                tokenizer._vocab["[EOS]"] = 2
+                tokenizer._vocab["[UNK]"] = 3
+                tokenizer._ids_to_tokens = {v: k for k, v in tokenizer._vocab.items()}
     return model, tokenizer
         help="Path to the model or Hugging Face model ID"
     )
     parser.add_argument(
+        "--mode", type=str, default="legal", choices=["legal", "winrate", "both"],
         help="Evaluation mode: 'legal' for legal move rate, 'winrate' for games, 'both' for both"
     )
     parser.add_argument(