Spaces:
Running
Running
Commit
·
faa67d0
1
Parent(s):
dbdb0ce
Fixed legal_moves bug and skipped already evaluated models
Browse files
app.py
CHANGED
|
@@ -21,7 +21,7 @@ import os
|
|
| 21 |
import queue
|
| 22 |
import sys
|
| 23 |
import threading
|
| 24 |
-
from datetime import datetime
|
| 25 |
from pathlib import Path
|
| 26 |
from typing import Optional
|
| 27 |
|
|
@@ -33,7 +33,6 @@ ORGANIZATION = os.environ.get("HF_ORGANIZATION", "LLM-course")
|
|
| 33 |
LEADERBOARD_DATASET = os.environ.get("LEADERBOARD_DATASET", f"{ORGANIZATION}/chess-challenge-leaderboard")
|
| 34 |
LEADERBOARD_FILENAME = "leaderboard.csv"
|
| 35 |
HF_TOKEN = os.environ.get("HF_TOKEN") # Required for private dataset access
|
| 36 |
-
WEBHOOK_SECRET = os.environ.get("WEBHOOK_SECRET", "459f4c2c6b0b4b6468e21f981103753d14219d4955f07ab457e100fee93cae66")
|
| 37 |
|
| 38 |
# CSV columns for the leaderboard
|
| 39 |
LEADERBOARD_COLUMNS = [
|
|
@@ -42,6 +41,7 @@ LEADERBOARD_COLUMNS = [
|
|
| 42 |
"legal_rate",
|
| 43 |
"legal_rate_first_try",
|
| 44 |
"last_updated",
|
|
|
|
| 45 |
]
|
| 46 |
|
| 47 |
def is_chess_model(model_id: str) -> bool:
|
|
@@ -51,15 +51,6 @@ def is_chess_model(model_id: str) -> bool:
|
|
| 51 |
model_name = model_id.split("/")[-1].lower()
|
| 52 |
return "chess" in model_name
|
| 53 |
|
| 54 |
-
|
| 55 |
-
def verify_webhook_signature(body: bytes, signature: str) -> bool:
|
| 56 |
-
"""Verify the webhook signature using HMAC-SHA256."""
|
| 57 |
-
if not WEBHOOK_SECRET:
|
| 58 |
-
return True # Skip verification if no secret configured
|
| 59 |
-
expected = hmac.new(WEBHOOK_SECRET.encode(), body, hashlib.sha256).hexdigest()
|
| 60 |
-
return hmac.compare_digest(signature or "", expected)
|
| 61 |
-
|
| 62 |
-
|
| 63 |
# =============================================================================
|
| 64 |
# Leaderboard Management
|
| 65 |
# =============================================================================
|
|
@@ -109,7 +100,7 @@ def save_leaderboard(data: list):
|
|
| 109 |
path_or_fileobj=csv_buffer,
|
| 110 |
path_in_repo=LEADERBOARD_FILENAME,
|
| 111 |
repo_id=LEADERBOARD_DATASET,
|
| 112 |
-
repo_type="dataset",
|
| 113 |
commit_message=f"Update leaderboard - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
|
| 114 |
)
|
| 115 |
print(f"Leaderboard saved to {LEADERBOARD_DATASET}")
|
|
@@ -286,13 +277,38 @@ def run_evaluation(
|
|
| 286 |
5. Update leaderboard and post discussion
|
| 287 |
"""
|
| 288 |
try:
|
| 289 |
-
|
| 290 |
-
|
| 291 |
from src.evaluate import (
|
| 292 |
ChessEvaluator,
|
| 293 |
load_model_and_tokenizer,
|
| 294 |
post_discussion_summary,
|
| 295 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
|
| 297 |
progress(0, desc="Loading model...")
|
| 298 |
|
|
@@ -312,6 +328,11 @@ def run_evaluation(
|
|
| 312 |
|
| 313 |
# Run evaluation
|
| 314 |
result = evaluator.evaluate(verbose=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
|
| 316 |
progress(0.9, desc="Updating leaderboard...")
|
| 317 |
|
|
@@ -360,29 +381,31 @@ which adds the required metadata to the README.md file.
|
|
| 360 |
# Update leaderboard
|
| 361 |
leaderboard = load_leaderboard()
|
| 362 |
|
| 363 |
-
# Find existing entry for this
|
| 364 |
-
|
| 365 |
|
| 366 |
new_entry = {
|
| 367 |
"model_id": model_id,
|
| 368 |
"user_id": user_id,
|
| 369 |
-
"legal_rate": result.
|
| 370 |
"legal_rate_first_try": result.legal_rate_first_try,
|
| 371 |
-
"last_updated": datetime.
|
|
|
|
| 372 |
}
|
| 373 |
|
| 374 |
-
if
|
| 375 |
leaderboard.append(new_entry)
|
| 376 |
save_leaderboard(leaderboard)
|
| 377 |
update_message = "New entry added to leaderboard!"
|
| 378 |
else:
|
| 379 |
-
old_rate =
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
update_message = f"Improved! {old_rate*100:.1f}% -> {result.
|
| 384 |
else:
|
| 385 |
-
update_message = f"
|
|
|
|
| 386 |
|
| 387 |
# Post discussion to model page
|
| 388 |
if HF_TOKEN:
|
|
@@ -462,7 +485,7 @@ with gr.Blocks(
|
|
| 462 |
|
| 463 |
1. **Clone this repository**:
|
| 464 |
```bash
|
| 465 |
-
git clone
|
| 466 |
```
|
| 467 |
|
| 468 |
2. **Check an example solution** in the `example_solution/` folder for reference
|
|
|
|
| 21 |
import queue
|
| 22 |
import sys
|
| 23 |
import threading
|
| 24 |
+
from datetime import datetime, timezone
|
| 25 |
from pathlib import Path
|
| 26 |
from typing import Optional
|
| 27 |
|
|
|
|
| 33 |
LEADERBOARD_DATASET = os.environ.get("LEADERBOARD_DATASET", f"{ORGANIZATION}/chess-challenge-leaderboard")
|
| 34 |
LEADERBOARD_FILENAME = "leaderboard.csv"
|
| 35 |
HF_TOKEN = os.environ.get("HF_TOKEN") # Required for private dataset access
|
|
|
|
| 36 |
|
| 37 |
# CSV columns for the leaderboard
|
| 38 |
LEADERBOARD_COLUMNS = [
|
|
|
|
| 41 |
"legal_rate",
|
| 42 |
"legal_rate_first_try",
|
| 43 |
"last_updated",
|
| 44 |
+
"model_last_modified",
|
| 45 |
]
|
| 46 |
|
| 47 |
def is_chess_model(model_id: str) -> bool:
|
|
|
|
| 51 |
model_name = model_id.split("/")[-1].lower()
|
| 52 |
return "chess" in model_name
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
# =============================================================================
|
| 55 |
# Leaderboard Management
|
| 56 |
# =============================================================================
|
|
|
|
| 100 |
path_or_fileobj=csv_buffer,
|
| 101 |
path_in_repo=LEADERBOARD_FILENAME,
|
| 102 |
repo_id=LEADERBOARD_DATASET,
|
| 103 |
+
repo_type="dataset",
|
| 104 |
commit_message=f"Update leaderboard - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
|
| 105 |
)
|
| 106 |
print(f"Leaderboard saved to {LEADERBOARD_DATASET}")
|
|
|
|
| 277 |
5. Update leaderboard and post discussion
|
| 278 |
"""
|
| 279 |
try:
|
| 280 |
+
|
|
|
|
| 281 |
from src.evaluate import (
|
| 282 |
ChessEvaluator,
|
| 283 |
load_model_and_tokenizer,
|
| 284 |
post_discussion_summary,
|
| 285 |
)
|
| 286 |
+
from huggingface_hub import model_info as hf_model_info
|
| 287 |
+
|
| 288 |
+
progress(0, desc="Getting model info...")
|
| 289 |
+
try:
|
| 290 |
+
model_info = hf_model_info(model_id, token=HF_TOKEN)
|
| 291 |
+
model_last_modified = model_info.lastModified
|
| 292 |
+
except Exception as e:
|
| 293 |
+
return (f"## Evaluation Failed\n"
|
| 294 |
+
f"Could not fetch model info for `{model_id}`: {e}")
|
| 295 |
+
|
| 296 |
+
leaderboard = load_leaderboard()
|
| 297 |
+
model_entry = next((e for e in leaderboard if e.get("model_id") == model_id), None)
|
| 298 |
+
|
| 299 |
+
if model_entry and "last_updated" in model_entry and model_entry["last_updated"]:
|
| 300 |
+
last_evaluation_date = datetime.strptime(model_entry["last_updated"], "%Y-%m-%d %H:%M")
|
| 301 |
+
|
| 302 |
+
# model_last_modified is timezone-aware, last_evaluation_date is naive.
|
| 303 |
+
# Compare them by making model_last_modified naive UTC.
|
| 304 |
+
if last_evaluation_date > model_last_modified.astimezone(timezone.utc).replace(tzinfo=None):
|
| 305 |
+
return f"""## Evaluation Skipped
|
| 306 |
+
|
| 307 |
+
Model `{model_id}` was already evaluated on {last_evaluation_date.strftime('%Y-%m-%d %H:%M UTC')}
|
| 308 |
+
which is after the model was last modified on {model_last_modified.strftime('%Y-%m-%d %H:%M UTC')}.
|
| 309 |
+
|
| 310 |
+
No new evaluation is needed.
|
| 311 |
+
"""
|
| 312 |
|
| 313 |
progress(0, desc="Loading model...")
|
| 314 |
|
|
|
|
| 328 |
|
| 329 |
# Run evaluation
|
| 330 |
result = evaluator.evaluate(verbose=True)
|
| 331 |
+
|
| 332 |
+
print("=" * 80)
|
| 333 |
+
print(f"Evaluation summary for {model_id}")
|
| 334 |
+
print(result.summary())
|
| 335 |
+
print("=" * 80)
|
| 336 |
|
| 337 |
progress(0.9, desc="Updating leaderboard...")
|
| 338 |
|
|
|
|
| 381 |
# Update leaderboard
|
| 382 |
leaderboard = load_leaderboard()
|
| 383 |
|
| 384 |
+
# Find existing entry for this model
|
| 385 |
+
model_entry = next((e for e in leaderboard if e.get("model_id") == model_id), None)
|
| 386 |
|
| 387 |
new_entry = {
|
| 388 |
"model_id": model_id,
|
| 389 |
"user_id": user_id,
|
| 390 |
+
"legal_rate": result.legal_rate_with_retry,
|
| 391 |
"legal_rate_first_try": result.legal_rate_first_try,
|
| 392 |
+
"last_updated": datetime.utcnow().strftime("%Y-%m-%d %H:%M"),
|
| 393 |
+
"model_last_modified": model_last_modified.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M"),
|
| 394 |
}
|
| 395 |
|
| 396 |
+
if model_entry is None:
|
| 397 |
leaderboard.append(new_entry)
|
| 398 |
save_leaderboard(leaderboard)
|
| 399 |
update_message = "New entry added to leaderboard!"
|
| 400 |
else:
|
| 401 |
+
old_rate = model_entry.get("legal_rate", 0)
|
| 402 |
+
model_entry.update(new_entry) # Update existing entry for the model
|
| 403 |
+
save_leaderboard(leaderboard)
|
| 404 |
+
if result.legal_rate_with_retry > old_rate:
|
| 405 |
+
update_message = f"Improved! {old_rate*100:.1f}% -> {result.legal_rate_with_retry*100:.1f}%"
|
| 406 |
else:
|
| 407 |
+
update_message = f"Re-evaluated. Previous: {old_rate*100:.1f}%, This run: {result.legal_rate_with_retry*100:.1f}%"
|
| 408 |
+
update_message = f"No improvement. Best: {old_rate*100:.1f}%, This run: {result.legal_rate*100:.1f}%"
|
| 409 |
|
| 410 |
# Post discussion to model page
|
| 411 |
if HF_TOKEN:
|
|
|
|
| 485 |
|
| 486 |
1. **Clone this repository**:
|
| 487 |
```bash
|
| 488 |
+
git clone ssh://huggingface.co/spaces/LLM-course/Chess1MChallenge
|
| 489 |
```
|
| 490 |
|
| 491 |
2. **Check an example solution** in the `example_solution/` folder for reference
|