nathanael-fijalkow committed on
Commit
faa67d0
·
1 Parent(s): dbdb0ce

Fixed legal_moves bug and skipped already evaluated models

Browse files
Files changed (1) hide show
  1. app.py +49 -26
app.py CHANGED
@@ -21,7 +21,7 @@ import os
21
  import queue
22
  import sys
23
  import threading
24
- from datetime import datetime
25
  from pathlib import Path
26
  from typing import Optional
27
 
@@ -33,7 +33,6 @@ ORGANIZATION = os.environ.get("HF_ORGANIZATION", "LLM-course")
33
  LEADERBOARD_DATASET = os.environ.get("LEADERBOARD_DATASET", f"{ORGANIZATION}/chess-challenge-leaderboard")
34
  LEADERBOARD_FILENAME = "leaderboard.csv"
35
  HF_TOKEN = os.environ.get("HF_TOKEN") # Required for private dataset access
36
- WEBHOOK_SECRET = os.environ.get("WEBHOOK_SECRET", "459f4c2c6b0b4b6468e21f981103753d14219d4955f07ab457e100fee93cae66")
37
 
38
  # CSV columns for the leaderboard
39
  LEADERBOARD_COLUMNS = [
@@ -42,6 +41,7 @@ LEADERBOARD_COLUMNS = [
42
  "legal_rate",
43
  "legal_rate_first_try",
44
  "last_updated",
 
45
  ]
46
 
47
  def is_chess_model(model_id: str) -> bool:
@@ -51,15 +51,6 @@ def is_chess_model(model_id: str) -> bool:
51
  model_name = model_id.split("/")[-1].lower()
52
  return "chess" in model_name
53
 
54
-
55
- def verify_webhook_signature(body: bytes, signature: str) -> bool:
56
- """Verify the webhook signature using HMAC-SHA256."""
57
- if not WEBHOOK_SECRET:
58
- return True # Skip verification if no secret configured
59
- expected = hmac.new(WEBHOOK_SECRET.encode(), body, hashlib.sha256).hexdigest()
60
- return hmac.compare_digest(signature or "", expected)
61
-
62
-
63
  # =============================================================================
64
  # Leaderboard Management
65
  # =============================================================================
@@ -109,7 +100,7 @@ def save_leaderboard(data: list):
109
  path_or_fileobj=csv_buffer,
110
  path_in_repo=LEADERBOARD_FILENAME,
111
  repo_id=LEADERBOARD_DATASET,
112
- repo_type="dataset",
113
  commit_message=f"Update leaderboard - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
114
  )
115
  print(f"Leaderboard saved to {LEADERBOARD_DATASET}")
@@ -286,13 +277,38 @@ def run_evaluation(
286
  5. Update leaderboard and post discussion
287
  """
288
  try:
289
- sys.path.insert(0, str(Path(__file__).parent))
290
-
291
  from src.evaluate import (
292
  ChessEvaluator,
293
  load_model_and_tokenizer,
294
  post_discussion_summary,
295
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
  progress(0, desc="Loading model...")
298
 
@@ -312,6 +328,11 @@ def run_evaluation(
312
 
313
  # Run evaluation
314
  result = evaluator.evaluate(verbose=True)
 
 
 
 
 
315
 
316
  progress(0.9, desc="Updating leaderboard...")
317
 
@@ -360,29 +381,31 @@ which adds the required metadata to the README.md file.
360
  # Update leaderboard
361
  leaderboard = load_leaderboard()
362
 
363
- # Find existing entry for this user
364
- user_entry = next((e for e in leaderboard if e.get("user_id") == user_id), None)
365
 
366
  new_entry = {
367
  "model_id": model_id,
368
  "user_id": user_id,
369
- "legal_rate": result.legal_rate,
370
  "legal_rate_first_try": result.legal_rate_first_try,
371
- "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M"),
 
372
  }
373
 
374
- if user_entry is None:
375
  leaderboard.append(new_entry)
376
  save_leaderboard(leaderboard)
377
  update_message = "New entry added to leaderboard!"
378
  else:
379
- old_rate = user_entry.get("legal_rate", 0)
380
- if result.legal_rate > old_rate:
381
- user_entry.update(new_entry)
382
- save_leaderboard(leaderboard)
383
- update_message = f"Improved! {old_rate*100:.1f}% -> {result.legal_rate*100:.1f}%"
384
  else:
385
- update_message = f"No improvement. Best: {old_rate*100:.1f}%, This run: {result.legal_rate*100:.1f}%"
 
386
 
387
  # Post discussion to model page
388
  if HF_TOKEN:
@@ -462,7 +485,7 @@ with gr.Blocks(
462
 
463
  1. **Clone this repository**:
464
  ```bash
465
- git clone https://huggingface.co/spaces/LLM-course/Chess1MChallenge
466
  ```
467
 
468
  2. **Check an example solution** in the `example_solution/` folder for reference
 
21
  import queue
22
  import sys
23
  import threading
24
+ from datetime import datetime, timezone
25
  from pathlib import Path
26
  from typing import Optional
27
 
 
33
  LEADERBOARD_DATASET = os.environ.get("LEADERBOARD_DATASET", f"{ORGANIZATION}/chess-challenge-leaderboard")
34
  LEADERBOARD_FILENAME = "leaderboard.csv"
35
  HF_TOKEN = os.environ.get("HF_TOKEN") # Required for private dataset access
 
36
 
37
  # CSV columns for the leaderboard
38
  LEADERBOARD_COLUMNS = [
 
41
  "legal_rate",
42
  "legal_rate_first_try",
43
  "last_updated",
44
+ "model_last_modified",
45
  ]
46
 
47
  def is_chess_model(model_id: str) -> bool:
 
51
  model_name = model_id.split("/")[-1].lower()
52
  return "chess" in model_name
53
 
 
 
 
 
 
 
 
 
 
54
  # =============================================================================
55
  # Leaderboard Management
56
  # =============================================================================
 
100
  path_or_fileobj=csv_buffer,
101
  path_in_repo=LEADERBOARD_FILENAME,
102
  repo_id=LEADERBOARD_DATASET,
103
+ repo_type="dataset",
104
  commit_message=f"Update leaderboard - {datetime.now().strftime('%Y-%m-%d %H:%M')}",
105
  )
106
  print(f"Leaderboard saved to {LEADERBOARD_DATASET}")
 
277
  5. Update leaderboard and post discussion
278
  """
279
  try:
280
+
 
281
  from src.evaluate import (
282
  ChessEvaluator,
283
  load_model_and_tokenizer,
284
  post_discussion_summary,
285
  )
286
+ from huggingface_hub import model_info as hf_model_info
287
+
288
+ progress(0, desc="Getting model info...")
289
+ try:
290
+ model_info = hf_model_info(model_id, token=HF_TOKEN)
291
+ model_last_modified = model_info.lastModified
292
+ except Exception as e:
293
+ return f"""## Evaluation Failed
294
+ Could not fetch model info for `{model_id}`: {e}"""
295
+
296
+ leaderboard = load_leaderboard()
297
+ model_entry = next((e for e in leaderboard if e.get("model_id") == model_id), None)
298
+
299
+ if model_entry and "last_updated" in model_entry and model_entry["last_updated"]:
300
+ last_evaluation_date = datetime.strptime(model_entry["last_updated"], "%Y-%m-%d %H:%M")
301
+
302
+ # model_last_modified is timezone-aware, last_evaluation_date is naive.
303
+ # Compare them by making model_last_modified naive UTC.
304
+ if last_evaluation_date > model_last_modified.astimezone(timezone.utc).replace(tzinfo=None):
305
+ return f"""## Evaluation Skipped
306
+
307
+ Model `{model_id}` was already evaluated on {last_evaluation_date.strftime('%Y-%m-%d %H:%M UTC')}
308
+ which is after the model was last modified on {model_last_modified.strftime('%Y-%m-%d %H:%M UTC')}.
309
+
310
+ No new evaluation is needed.
311
+ """
312
 
313
  progress(0, desc="Loading model...")
314
 
 
328
 
329
  # Run evaluation
330
  result = evaluator.evaluate(verbose=True)
331
+
332
+ print("=" * 80)
333
+ print(f"Evaluation summary for {model_id}")
334
+ print(result.summary())
335
+ print("=" * 80)
336
 
337
  progress(0.9, desc="Updating leaderboard...")
338
 
 
381
  # Update leaderboard
382
  leaderboard = load_leaderboard()
383
 
384
+ # Find existing entry for this model
385
+ model_entry = next((e for e in leaderboard if e.get("model_id") == model_id), None)
386
 
387
  new_entry = {
388
  "model_id": model_id,
389
  "user_id": user_id,
390
+ "legal_rate": result.legal_rate_with_retry,
391
  "legal_rate_first_try": result.legal_rate_first_try,
392
+ "last_updated": datetime.utcnow().strftime("%Y-%m-%d %H:%M"),
393
+ "model_last_modified": model_last_modified.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M"),
394
  }
395
 
396
+ if model_entry is None:
397
  leaderboard.append(new_entry)
398
  save_leaderboard(leaderboard)
399
  update_message = "New entry added to leaderboard!"
400
  else:
401
+ old_rate = model_entry.get("legal_rate", 0)
402
+ model_entry.update(new_entry) # Update existing entry for the model
403
+ save_leaderboard(leaderboard)
404
+ if result.legal_rate_with_retry > old_rate:
405
+ update_message = f"Improved! {old_rate*100:.1f}% -> {result.legal_rate_with_retry*100:.1f}%"
406
  else:
407
+ update_message = f"Re-evaluated. Previous: {old_rate*100:.1f}%, This run: {result.legal_rate_with_retry*100:.1f}%"
408
+
409
 
410
  # Post discussion to model page
411
  if HF_TOKEN:
 
485
 
486
  1. **Clone this repository**:
487
  ```bash
488
+ git clone https://huggingface.co/spaces/LLM-course/Chess1MChallenge
489
  ```
490
 
491
  2. **Check an example solution** in the `example_solution/` folder for reference