Spaces:

LLM-course
/

Chess1MChallenge

Running

App Files Files Community

nathanael-fijalkow committed on Jan 6

Commit

36fc570

1 Parent(s): f8cdc2f

private dataset for leaderboard

Browse files

Files changed (3) hide show

README.md +25 -0
app.py +82 -19
requirements.txt +1 -0

README.md CHANGED Viewed

@@ -20,6 +20,31 @@ This Space hosts the evaluation arena for the LLM Chess Challenge.
 - **Leaderboard**: See rankings of all submitted models
 - **Statistics**: View detailed performance metrics
 ## How to Submit
 Students should push their trained models to this organization:

 - **Leaderboard**: See rankings of all submitted models
 - **Statistics**: View detailed performance metrics
+## Setup (Admin)
+### 1. Create a Private Leaderboard Dataset
+Create a private dataset under your organization to store the leaderboard CSV (its repo ID must match the `LEADERBOARD_DATASET` value configured below):
+```bash
+# Using the HuggingFace CLI
+huggingface-cli repo create chess-challenge-leaderboard --type dataset --private
+```
+Or create it via the web UI at: https://huggingface.co/new-dataset
+### 2. Configure Space Secrets
+Go to **Settings → Variables and secrets** and add:
+| Secret/Variable | Value | Description |
+|-----------------|-------|-------------|
+| `HF_TOKEN` | `hf_xxx...` | Write-access token for the leaderboard dataset |
+| `HF_ORGANIZATION` | `LLM-course` | Your organization name |
+| `LEADERBOARD_DATASET` | `LLM-course/chess-challenge-leaderboard` | Dataset repo ID |
+> ⚠️ The `HF_TOKEN` needs **write access** to the leaderboard dataset to save results.
 ## How to Submit
 Students should push their trained models to this organization:

app.py CHANGED Viewed

@@ -5,19 +5,25 @@ This Gradio app provides:
 1. Interactive demo to test models
 2. Leaderboard of submitted models
 3. Live game visualization
 """
-import json
 import os
 from datetime import datetime
 from pathlib import Path
 from typing import Optional
 import gradio as gr
 # Configuration
-ORGANIZATION = os.environ.get("HF_ORGANIZATION", "your-org-name")
-LEADERBOARD_FILE = "leaderboard.json"
 STOCKFISH_LEVELS = {
     "Beginner (Level 0)": 0,
     "Easy (Level 1)": 1,
@@ -25,19 +31,76 @@ STOCKFISH_LEVELS = {
     "Hard (Level 5)": 5,
 }
 def load_leaderboard() -> list:
-    """Load leaderboard from file or return empty list."""
-    if Path(LEADERBOARD_FILE).exists():
-        with open(LEADERBOARD_FILE, "r") as f:
-            return json.load(f)
-    return []
 def save_leaderboard(data: list):
-    """Save leaderboard to file."""
-    with open(LEADERBOARD_FILE, "w") as f:
-        json.dump(data, f, indent=2)
 def get_available_models() -> list:
@@ -264,7 +327,7 @@ def evaluate_legal_moves(
         progress(1.0, desc="Done!")
         return f"""
-## ✅ Legal Move Evaluation for {model_id.split('/')[-1]}
 | Metric | Value |
 |--------|-------|
@@ -280,7 +343,7 @@ def evaluate_legal_moves(
 """
     except Exception as e:
-        return f"❌ Evaluation failed: {str(e)}"
 def evaluate_winrate(
@@ -328,7 +391,7 @@ def evaluate_winrate(
         progress(1.0, desc="Done!")
         return f"""
-## 🏆 Win Rate Evaluation for {model_id.split('/')[-1]}
 | Metric | Value |
 |--------|-------|
@@ -343,7 +406,7 @@ Games played: {n_games} against Stockfish {stockfish_level}
 """
     except Exception as e:
-        return f"❌ Evaluation failed: {str(e)}"
 def evaluate_model(
@@ -411,7 +474,7 @@ Games played: {n_games} against Stockfish {stockfish_level}
 """
     except Exception as e:
-        return f"❌ Evaluation failed: {str(e)}"
 def refresh_leaderboard() -> str:
@@ -486,7 +549,7 @@ with gr.Blocks(
             )
         # Legal Move Evaluation Tab
-        with gr.TabItem("✅ Legal Move Eval"):
             gr.Markdown("""
             ### Phase 1: Legal Move Evaluation
@@ -551,7 +614,7 @@ with gr.Blocks(
                     label="Number of Games",
                 )
-            eval_btn = gr.Button("🏆 Run Win Rate Evaluation", variant="primary")
             eval_results = gr.Markdown()
             eval_btn.click(
@@ -561,7 +624,7 @@ with gr.Blocks(
             )
         # Submission Guide Tab
-        with gr.TabItem("📤 How to Submit"):
             gr.Markdown(f"""
             ### Submitting Your Model

 1. Interactive demo to test models
 2. Leaderboard of submitted models
 3. Live game visualization
+Leaderboard data is stored in a private HuggingFace dataset for persistence.
 """
+import io
 import os
 from datetime import datetime
 from pathlib import Path
 from typing import Optional
 import gradio as gr
+import pandas as pd
 # Configuration
+ORGANIZATION = os.environ.get("HF_ORGANIZATION", "LLM-course")
+LEADERBOARD_DATASET = os.environ.get("LEADERBOARD_DATASET", f"{ORGANIZATION}/chess-challenge-leaderboard")
+LEADERBOARD_FILENAME = "leaderboard.csv"
+HF_TOKEN = os.environ.get("HF_TOKEN")  # Required for private dataset access
 STOCKFISH_LEVELS = {
     "Beginner (Level 0)": 0,
     "Easy (Level 1)": 1,
     "Hard (Level 5)": 5,
 }
+# CSV columns for the leaderboard
+LEADERBOARD_COLUMNS = [
+    "model_id",
+    "legal_rate",
+    "legal_rate_first_try",
+    "elo",
+    "win_rate",
+    "draw_rate",
+    "games_played",
+    "last_updated",
+]
 def load_leaderboard() -> list:
+    """Load leaderboard from private HuggingFace dataset."""
+    try:
+        from huggingface_hub import hf_hub_download
+        # Download the CSV file from the dataset
+        csv_path = hf_hub_download(
+            repo_id=LEADERBOARD_DATASET,
+            filename=LEADERBOARD_FILENAME,
+            repo_type="dataset",
+            token=HF_TOKEN,
+        )
+        df = pd.read_csv(csv_path)
+        return df.to_dict(orient="records")
+    except Exception as e:
+        print(f"Could not load leaderboard from dataset: {e}")
+        # Fall back to an empty list on any failure (e.g. the dataset doesn't exist yet)
+        return []
 def save_leaderboard(data: list):
+    """Save leaderboard to private HuggingFace dataset."""
+    try:
+        from huggingface_hub import HfApi
+        # Convert to DataFrame
+        df = pd.DataFrame(data, columns=LEADERBOARD_COLUMNS)
+        # Fill missing columns with defaults
+        for col in LEADERBOARD_COLUMNS:
+            if col not in df.columns:
+                df[col] = None
+        # Reorder columns
+        df = df[LEADERBOARD_COLUMNS]
+        # Convert to CSV bytes
+        csv_buffer = io.BytesIO()
+        df.to_csv(csv_buffer, index=False)
+        csv_buffer.seek(0)
+        # Upload to HuggingFace dataset
+        api = HfApi(token=HF_TOKEN)
+        api.upload_file(
+            path_or_fileobj=csv_buffer,
+            path_in_repo=LEADERBOARD_FILENAME,
+            repo_id=LEADERBOARD_DATASET,
+            repo_type="dataset",
+            commit_message=f"Update leaderboard - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
+        )
+        print(f"Leaderboard saved to {LEADERBOARD_DATASET}")
+    except Exception as e:
+        print(f"Error saving leaderboard to dataset: {e}")
+        raise
 def get_available_models() -> list:
         progress(1.0, desc="Done!")
         return f"""
+## Legal Move Evaluation for {model_id.split('/')[-1]}
 | Metric | Value |
 |--------|-------|
 """
     except Exception as e:
+        return f"Evaluation failed: {str(e)}"
 def evaluate_winrate(
         progress(1.0, desc="Done!")
         return f"""
+## Win Rate Evaluation for {model_id.split('/')[-1]}
 | Metric | Value |
 |--------|-------|
 """
     except Exception as e:
+        return f"Evaluation failed: {str(e)}"
 def evaluate_model(
 """
     except Exception as e:
+        return f"Evaluation failed: {str(e)}"
 def refresh_leaderboard() -> str:
             )
         # Legal Move Evaluation Tab
+        with gr.TabItem("Legal Move Eval"):
             gr.Markdown("""
             ### Phase 1: Legal Move Evaluation
                     label="Number of Games",
                 )
+            eval_btn = gr.Button("Run Win Rate Evaluation", variant="primary")
             eval_results = gr.Markdown()
             eval_btn.click(
             )
         # Submission Guide Tab
+        with gr.TabItem("How to Submit"):
             gr.Markdown(f"""
             ### Submitting Your Model

requirements.txt CHANGED Viewed

@@ -4,3 +4,4 @@ torch>=2.0.0
 python-chess>=1.999
 huggingface-hub>=0.20.0
 datasets>=2.14.0

 python-chess>=1.999
 huggingface-hub>=0.20.0
 datasets>=2.14.0
+pandas>=2.0.0