nathanael-fijalkow committed on
Commit
36fc570
·
1 Parent(s): f8cdc2f

private dataset for leaderboard

Browse files
Files changed (3) hide show
  1. README.md +25 -0
  2. app.py +82 -19
  3. requirements.txt +1 -0
README.md CHANGED
@@ -20,6 +20,31 @@ This Space hosts the evaluation arena for the LLM Chess Challenge.
20
  - **Leaderboard**: See rankings of all submitted models
21
  - **Statistics**: View detailed performance metrics
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  ## How to Submit
24
 
25
  Students should push their trained models to this organization:
 
20
  - **Leaderboard**: See rankings of all submitted models
21
  - **Statistics**: View detailed performance metrics
22
 
23
+ ## Setup (Admin)
24
+
25
+ ### 1. Create a Private Leaderboard Dataset
26
+
27
+ Create a private dataset to store the leaderboard CSV:
28
+
29
+ ```bash
30
+ # Using the HuggingFace CLI
31
+ huggingface-cli repo create chess-challenge-leaderboard --type dataset --private
32
+ ```
33
+
34
+ Or create it via the web UI at: https://huggingface.co/new-dataset
35
+
36
+ ### 2. Configure Space Secrets
37
+
38
+ Go to **Settings → Variables and secrets** and add:
39
+
40
+ | Secret/Variable | Value | Description |
41
+ |-----------------|-------|-------------|
42
+ | `HF_TOKEN` | `hf_xxx...` | Write-access token for the leaderboard dataset |
43
+ | `HF_ORGANIZATION` | `LLM-course` | Your organization name |
44
+ | `LEADERBOARD_DATASET` | `LLM-course/chess-challenge-leaderboard` | Dataset repo ID |
45
+
46
+ > ⚠️ The `HF_TOKEN` needs **write access** to the leaderboard dataset to save results.
47
+
48
  ## How to Submit
49
 
50
  Students should push their trained models to this organization:
app.py CHANGED
@@ -5,19 +5,25 @@ This Gradio app provides:
5
  1. Interactive demo to test models
6
  2. Leaderboard of submitted models
7
  3. Live game visualization
 
 
8
  """
9
 
10
- import json
11
  import os
12
  from datetime import datetime
13
  from pathlib import Path
14
  from typing import Optional
15
 
16
  import gradio as gr
 
17
 
18
  # Configuration
19
- ORGANIZATION = os.environ.get("HF_ORGANIZATION", "your-org-name")
20
- LEADERBOARD_FILE = "leaderboard.json"
 
 
 
21
  STOCKFISH_LEVELS = {
22
  "Beginner (Level 0)": 0,
23
  "Easy (Level 1)": 1,
@@ -25,19 +31,76 @@ STOCKFISH_LEVELS = {
25
  "Hard (Level 5)": 5,
26
  }
27
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def load_leaderboard() -> list:
30
- """Load leaderboard from file or return empty list."""
31
- if Path(LEADERBOARD_FILE).exists():
32
- with open(LEADERBOARD_FILE, "r") as f:
33
- return json.load(f)
34
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
 
37
  def save_leaderboard(data: list):
38
- """Save leaderboard to file."""
39
- with open(LEADERBOARD_FILE, "w") as f:
40
- json.dump(data, f, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
 
43
  def get_available_models() -> list:
@@ -264,7 +327,7 @@ def evaluate_legal_moves(
264
  progress(1.0, desc="Done!")
265
 
266
  return f"""
267
- ## Legal Move Evaluation for {model_id.split('/')[-1]}
268
 
269
  | Metric | Value |
270
  |--------|-------|
@@ -280,7 +343,7 @@ def evaluate_legal_moves(
280
  """
281
 
282
  except Exception as e:
283
- return f"Evaluation failed: {str(e)}"
284
 
285
 
286
  def evaluate_winrate(
@@ -328,7 +391,7 @@ def evaluate_winrate(
328
  progress(1.0, desc="Done!")
329
 
330
  return f"""
331
- ## 🏆 Win Rate Evaluation for {model_id.split('/')[-1]}
332
 
333
  | Metric | Value |
334
  |--------|-------|
@@ -343,7 +406,7 @@ Games played: {n_games} against Stockfish {stockfish_level}
343
  """
344
 
345
  except Exception as e:
346
- return f"Evaluation failed: {str(e)}"
347
 
348
 
349
  def evaluate_model(
@@ -411,7 +474,7 @@ Games played: {n_games} against Stockfish {stockfish_level}
411
  """
412
 
413
  except Exception as e:
414
- return f"Evaluation failed: {str(e)}"
415
 
416
 
417
  def refresh_leaderboard() -> str:
@@ -486,7 +549,7 @@ with gr.Blocks(
486
  )
487
 
488
  # Legal Move Evaluation Tab
489
- with gr.TabItem("Legal Move Eval"):
490
  gr.Markdown("""
491
  ### Phase 1: Legal Move Evaluation
492
 
@@ -551,7 +614,7 @@ with gr.Blocks(
551
  label="Number of Games",
552
  )
553
 
554
- eval_btn = gr.Button("🏆 Run Win Rate Evaluation", variant="primary")
555
  eval_results = gr.Markdown()
556
 
557
  eval_btn.click(
@@ -561,7 +624,7 @@ with gr.Blocks(
561
  )
562
 
563
  # Submission Guide Tab
564
- with gr.TabItem("📤 How to Submit"):
565
  gr.Markdown(f"""
566
  ### Submitting Your Model
567
 
 
5
  1. Interactive demo to test models
6
  2. Leaderboard of submitted models
7
  3. Live game visualization
8
+
9
+ Leaderboard data is stored in a private HuggingFace dataset for persistence.
10
  """
11
 
12
+ import io
13
  import os
14
  from datetime import datetime
15
  from pathlib import Path
16
  from typing import Optional
17
 
18
  import gradio as gr
19
+ import pandas as pd
20
 
21
  # Configuration
22
+ ORGANIZATION = os.environ.get("HF_ORGANIZATION", "LLM-course")
23
+ LEADERBOARD_DATASET = os.environ.get("LEADERBOARD_DATASET", f"{ORGANIZATION}/chess-challenge-leaderboard")
24
+ LEADERBOARD_FILENAME = "leaderboard.csv"
25
+ HF_TOKEN = os.environ.get("HF_TOKEN") # Required for private dataset access
26
+
27
  STOCKFISH_LEVELS = {
28
  "Beginner (Level 0)": 0,
29
  "Easy (Level 1)": 1,
 
31
  "Hard (Level 5)": 5,
32
  }
33
 
34
+ # CSV columns for the leaderboard
35
+ LEADERBOARD_COLUMNS = [
36
+ "model_id",
37
+ "legal_rate",
38
+ "legal_rate_first_try",
39
+ "elo",
40
+ "win_rate",
41
+ "draw_rate",
42
+ "games_played",
43
+ "last_updated",
44
+ ]
45
+
46
 
47
  def load_leaderboard() -> list:
48
+ """Load leaderboard from private HuggingFace dataset."""
49
+ try:
50
+ from huggingface_hub import hf_hub_download
51
+
52
+ # Download the CSV file from the dataset
53
+ csv_path = hf_hub_download(
54
+ repo_id=LEADERBOARD_DATASET,
55
+ filename=LEADERBOARD_FILENAME,
56
+ repo_type="dataset",
57
+ token=HF_TOKEN,
58
+ )
59
+
60
+ df = pd.read_csv(csv_path)
61
+ return df.to_dict(orient="records")
62
+
63
+ except Exception as e:
64
+ print(f"Could not load leaderboard from dataset: {e}")
65
+ # Return empty list if dataset doesn't exist yet
66
+ return []
67
 
68
 
69
  def save_leaderboard(data: list):
70
+ """Save leaderboard to private HuggingFace dataset."""
71
+ try:
72
+ from huggingface_hub import HfApi
73
+
74
+ # Convert to DataFrame
75
+ df = pd.DataFrame(data, columns=LEADERBOARD_COLUMNS)
76
+
77
+ # Fill missing columns with defaults
78
+ for col in LEADERBOARD_COLUMNS:
79
+ if col not in df.columns:
80
+ df[col] = None
81
+
82
+ # Reorder columns
83
+ df = df[LEADERBOARD_COLUMNS]
84
+
85
+ # Convert to CSV bytes
86
+ csv_buffer = io.BytesIO()
87
+ df.to_csv(csv_buffer, index=False)
88
+ csv_buffer.seek(0)
89
+
90
+ # Upload to HuggingFace dataset
91
+ api = HfApi(token=HF_TOKEN)
92
+ api.upload_file(
93
+ path_or_fileobj=csv_buffer,
94
+ path_in_repo=LEADERBOARD_FILENAME,
95
+ repo_id=LEADERBOARD_DATASET,
96
+ repo_type="dataset",
97
+ commit_message=f"Update leaderboard - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
98
+ )
99
+ print(f"Leaderboard saved to {LEADERBOARD_DATASET}")
100
+
101
+ except Exception as e:
102
+ print(f"Error saving leaderboard to dataset: {e}")
103
+ raise
104
 
105
 
106
  def get_available_models() -> list:
 
327
  progress(1.0, desc="Done!")
328
 
329
  return f"""
330
+ ## Legal Move Evaluation for {model_id.split('/')[-1]}
331
 
332
  | Metric | Value |
333
  |--------|-------|
 
343
  """
344
 
345
  except Exception as e:
346
+ return f"Evaluation failed: {str(e)}"
347
 
348
 
349
  def evaluate_winrate(
 
391
  progress(1.0, desc="Done!")
392
 
393
  return f"""
394
+ ## Win Rate Evaluation for {model_id.split('/')[-1]}
395
 
396
  | Metric | Value |
397
  |--------|-------|
 
406
  """
407
 
408
  except Exception as e:
409
+ return f"Evaluation failed: {str(e)}"
410
 
411
 
412
  def evaluate_model(
 
474
  """
475
 
476
  except Exception as e:
477
+ return f"Evaluation failed: {str(e)}"
478
 
479
 
480
  def refresh_leaderboard() -> str:
 
549
  )
550
 
551
  # Legal Move Evaluation Tab
552
+ with gr.TabItem("Legal Move Eval"):
553
  gr.Markdown("""
554
  ### Phase 1: Legal Move Evaluation
555
 
 
614
  label="Number of Games",
615
  )
616
 
617
+ eval_btn = gr.Button("Run Win Rate Evaluation", variant="primary")
618
  eval_results = gr.Markdown()
619
 
620
  eval_btn.click(
 
624
  )
625
 
626
  # Submission Guide Tab
627
+ with gr.TabItem("How to Submit"):
628
  gr.Markdown(f"""
629
  ### Submitting Your Model
630
 
requirements.txt CHANGED
@@ -4,3 +4,4 @@ torch>=2.0.0
4
  python-chess>=1.999
5
  huggingface-hub>=0.20.0
6
  datasets>=2.14.0
 
 
4
  python-chess>=1.999
5
  huggingface-hub>=0.20.0
6
  datasets>=2.14.0
7
+ pandas>=2.0.0