nathanael-fijalkow committed on
Commit
c4e2b27
·
1 Parent(s): 619e61f

focus on legal moves

Browse files
Files changed (1) hide show
  1. app.py +141 -140
app.py CHANGED
@@ -48,10 +48,10 @@ LEADERBOARD_COLUMNS = [
48
  "model_id",
49
  "legal_rate",
50
  "legal_rate_first_try",
51
- "elo",
52
  # "win_rate",
53
- "draw_rate",
54
- "games_played",
55
  "last_updated",
56
  ]
57
 
@@ -132,8 +132,8 @@ def format_leaderboard_html(data: list) -> str:
132
  if not data:
133
  return "<p>No models evaluated yet. Be the first to submit!</p>"
134
 
135
- # Sort by ELO
136
- sorted_data = sorted(data, key=lambda x: x.get("elo", 0), reverse=True)
137
 
138
  html = """
139
  <style>
@@ -173,9 +173,10 @@ def format_leaderboard_html(data: list) -> str:
173
  <th>Rank</th>
174
  <th>Model</th>
175
  <th>Legal Rate</th>
176
- <th>ELO</th>
 
177
  <!-- <th>Win Rate</th> -->
178
- <th>Games</th>
179
  <th>Last Updated</th>
180
  </tr>
181
  </thead>
@@ -197,14 +198,16 @@ def format_leaderboard_html(data: list) -> str:
197
  else:
198
  legal_class = "legal-bad"
199
 
 
200
  html += f"""
201
  <tr>
202
  <td class="{rank_class}">{rank_display}</td>
203
  <td><a href="{model_url}" target="_blank" class="model-link">{entry['model_id'].split('/')[-1]}</a></td>
204
  <td class="{legal_class}">{legal_rate*100:.1f}%</td>
205
- <td><strong>{entry.get('elo', 'N/A'):.0f}</strong></td>
 
206
  <!-- <td>{entry.get('win_rate', 0)*100:.1f}%</td> -->
207
- <td>{entry.get('games_played', 0)}</td>
208
  <td>{entry.get('last_updated', 'N/A')}</td>
209
  </tr>
210
  """
@@ -358,135 +361,135 @@ def evaluate_legal_moves(
358
  return f"Evaluation failed: {str(e)}"
359
 
360
 
361
- def evaluate_winrate(
362
- model_id: str,
363
- stockfish_level: str,
364
- n_games: int,
365
- progress: gr.Progress = gr.Progress(),
366
- ) -> str:
367
- """Evaluate a model's win rate against Stockfish."""
368
- try:
369
- import sys
370
- sys.path.insert(0, str(Path(__file__).parent))
371
-
372
- from src.evaluate import ChessEvaluator, load_model_from_hub
373
-
374
- progress(0, desc="Loading model...")
375
- model, tokenizer = load_model_from_hub(model_id)
376
-
377
- progress(0.1, desc="Setting up Stockfish...")
378
- level = STOCKFISH_LEVELS.get(stockfish_level, 1)
379
- evaluator = ChessEvaluator(
380
- model=model,
381
- tokenizer=tokenizer,
382
- stockfish_level=level,
383
- )
384
-
385
- progress(0.2, desc=f"Playing {n_games} games...")
386
- results = evaluator.evaluate(n_games=n_games, verbose=False)
387
-
388
- # Update leaderboard
389
- leaderboard = load_leaderboard()
390
- entry = next((e for e in leaderboard if e["model_id"] == model_id), None)
391
- if entry is None:
392
- entry = {"model_id": model_id}
393
- leaderboard.append(entry)
394
-
395
- entry.update({
396
- "elo": results.get("estimated_elo", 1000),
397
- "win_rate": results.get("win_rate", 0),
398
- "games_played": entry.get("games_played", 0) + n_games,
399
- "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M"),
400
- })
401
-
402
- save_leaderboard(leaderboard)
403
- progress(1.0, desc="Done!")
404
-
405
- return f"""
406
- ## Win Rate Evaluation for {model_id.split('/')[-1]}
407
-
408
- | Metric | Value |
409
- |--------|-------|
410
- | **Estimated ELO** | {results.get('estimated_elo', 'N/A'):.0f} |
411
- | **Win Rate** | {results.get('win_rate', 0)*100:.1f}% |
412
- | **Draw Rate** | {results.get('draw_rate', 0)*100:.1f}% |
413
- | **Loss Rate** | {results.get('loss_rate', 0)*100:.1f}% |
414
- | **Avg Game Length** | {results.get('avg_game_length', 0):.1f} moves |
415
- | **Illegal Move Rate** | {results.get('illegal_move_rate', 0)*100:.2f}% |
416
-
417
- Games played: {n_games} against Stockfish {stockfish_level}
418
- """
419
-
420
- except Exception as e:
421
- return f"Evaluation failed: {str(e)}"
422
-
423
-
424
- def evaluate_model(
425
- model_id: str,
426
- stockfish_level: str,
427
- n_games: int,
428
- progress: gr.Progress = gr.Progress(),
429
- ) -> str:
430
- """Evaluate a model against Stockfish."""
431
- try:
432
- # Import evaluation code
433
- import sys
434
- sys.path.insert(0, str(Path(__file__).parent))
435
-
436
- from src.evaluate import ChessEvaluator, load_model_from_hub
437
-
438
- progress(0, desc="Loading model...")
439
- model, tokenizer = load_model_from_hub(model_id)
440
-
441
- progress(0.1, desc="Setting up Stockfish...")
442
- level = STOCKFISH_LEVELS.get(stockfish_level, 1)
443
- evaluator = ChessEvaluator(
444
- model=model,
445
- tokenizer=tokenizer,
446
- stockfish_level=level,
447
- )
448
-
449
- progress(0.2, desc=f"Playing {n_games} games...")
450
- results = evaluator.evaluate(n_games=n_games, verbose=False)
451
-
452
- # Update leaderboard
453
- leaderboard = load_leaderboard()
454
-
455
- # Find or create entry
456
- entry = next((e for e in leaderboard if e["model_id"] == model_id), None)
457
- if entry is None:
458
- entry = {"model_id": model_id}
459
- leaderboard.append(entry)
460
-
461
- entry.update({
462
- "elo": results.get("estimated_elo", 1000),
463
- "win_rate": results.get("win_rate", 0),
464
- "games_played": entry.get("games_played", 0) + n_games,
465
- "illegal_rate": results.get("illegal_move_rate", 0),
466
- "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M"),
467
- })
468
-
469
- save_leaderboard(leaderboard)
470
-
471
- progress(1.0, desc="Done!")
472
-
473
- return f"""
474
- ## Evaluation Results for {model_id.split('/')[-1]}
475
-
476
- | Metric | Value |
477
- |--------|-------|
478
- | **Estimated ELO** | {results.get('estimated_elo', 'N/A'):.0f} |
479
- | **Win Rate** | {results.get('win_rate', 0)*100:.1f}% |
480
- | **Draw Rate** | {results.get('draw_rate', 0)*100:.1f}% |
481
- | **Loss Rate** | {results.get('loss_rate', 0)*100:.1f}% |
482
- | **Avg Game Length** | {results.get('avg_game_length', 0):.1f} moves |
483
- | **Illegal Move Rate** | {results.get('illegal_move_rate', 0)*100:.2f}% |
484
-
485
- Games played: {n_games} against Stockfish {stockfish_level}
486
- """
487
-
488
- except Exception as e:
489
- return f"Evaluation failed: {str(e)}"
490
 
491
 
492
  def refresh_leaderboard() -> str:
@@ -595,11 +598,9 @@ with gr.Blocks(
595
  ### Phase 1: Legal Move Evaluation
596
 
597
  Test if your model can generate **legal chess moves** in random positions.
598
- This is a quick first check before running full games.
599
 
600
  - Tests the model on random board positions
601
  - Measures how often it generates legal moves
602
- - **Recommended before win rate evaluation**
603
  """)
604
 
605
  with gr.Row():
 
48
  "model_id",
49
  "legal_rate",
50
  "legal_rate_first_try",
51
+ # "elo",
52
  # "win_rate",
53
+ # "draw_rate",
54
+ # "games_played",
55
  "last_updated",
56
  ]
57
 
 
132
  if not data:
133
  return "<p>No models evaluated yet. Be the first to submit!</p>"
134
 
135
+ # Sort by legal_rate
136
+ sorted_data = sorted(data, key=lambda x: x.get("legal_rate", 0), reverse=True)
137
 
138
  html = """
139
  <style>
 
173
  <th>Rank</th>
174
  <th>Model</th>
175
  <th>Legal Rate</th>
176
+ <th>Legal Rate (1st try)</th>
177
+ <!-- <th>ELO</th> -->
178
  <!-- <th>Win Rate</th> -->
179
+ <!-- <th>Games</th> -->
180
  <th>Last Updated</th>
181
  </tr>
182
  </thead>
 
198
  else:
199
  legal_class = "legal-bad"
200
 
201
+ legal_rate_first_try = entry.get('legal_rate_first_try', 0)
202
  html += f"""
203
  <tr>
204
  <td class="{rank_class}">{rank_display}</td>
205
  <td><a href="{model_url}" target="_blank" class="model-link">{entry['model_id'].split('/')[-1]}</a></td>
206
  <td class="{legal_class}">{legal_rate*100:.1f}%</td>
207
+ <td>{legal_rate_first_try*100:.1f}%</td>
208
+ <!-- <td><strong>{entry.get('elo', 'N/A'):.0f}</strong></td> -->
209
  <!-- <td>{entry.get('win_rate', 0)*100:.1f}%</td> -->
210
+ <!-- <td>{entry.get('games_played', 0)}</td> -->
211
  <td>{entry.get('last_updated', 'N/A')}</td>
212
  </tr>
213
  """
 
361
  return f"Evaluation failed: {str(e)}"
362
 
363
 
364
+ # def evaluate_winrate(
365
+ # model_id: str,
366
+ # stockfish_level: str,
367
+ # n_games: int,
368
+ # progress: gr.Progress = gr.Progress(),
369
+ # ) -> str:
370
+ # """Evaluate a model's win rate against Stockfish."""
371
+ # try:
372
+ # import sys
373
+ # sys.path.insert(0, str(Path(__file__).parent))
374
+ #
375
+ # from src.evaluate import ChessEvaluator, load_model_from_hub
376
+ #
377
+ # progress(0, desc="Loading model...")
378
+ # model, tokenizer = load_model_from_hub(model_id)
379
+ #
380
+ # progress(0.1, desc="Setting up Stockfish...")
381
+ # level = STOCKFISH_LEVELS.get(stockfish_level, 1)
382
+ # evaluator = ChessEvaluator(
383
+ # model=model,
384
+ # tokenizer=tokenizer,
385
+ # stockfish_level=level,
386
+ # )
387
+ #
388
+ # progress(0.2, desc=f"Playing {n_games} games...")
389
+ # results = evaluator.evaluate(n_games=n_games, verbose=False)
390
+ #
391
+ # # Update leaderboard
392
+ # leaderboard = load_leaderboard()
393
+ # entry = next((e for e in leaderboard if e["model_id"] == model_id), None)
394
+ # if entry is None:
395
+ # entry = {"model_id": model_id}
396
+ # leaderboard.append(entry)
397
+ #
398
+ # entry.update({
399
+ # "elo": results.get("estimated_elo", 1000),
400
+ # "win_rate": results.get("win_rate", 0),
401
+ # "games_played": entry.get("games_played", 0) + n_games,
402
+ # "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M"),
403
+ # })
404
+ #
405
+ # save_leaderboard(leaderboard)
406
+ # progress(1.0, desc="Done!")
407
+ #
408
+ # return f"""
409
+ # ## Win Rate Evaluation for {model_id.split('/')[-1]}
410
+ #
411
+ # | Metric | Value |
412
+ # |--------|-------|
413
+ # | **Estimated ELO** | {results.get('estimated_elo', 'N/A'):.0f} |
414
+ # | **Win Rate** | {results.get('win_rate', 0)*100:.1f}% |
415
+ # | **Draw Rate** | {results.get('draw_rate', 0)*100:.1f}% |
416
+ # | **Loss Rate** | {results.get('loss_rate', 0)*100:.1f}% |
417
+ # | **Avg Game Length** | {results.get('avg_game_length', 0):.1f} moves |
418
+ # | **Illegal Move Rate** | {results.get('illegal_move_rate', 0)*100:.2f}% |
419
+ #
420
+ # Games played: {n_games} against Stockfish {stockfish_level}
421
+ # """
422
+ #
423
+ # except Exception as e:
424
+ # return f"Evaluation failed: {str(e)}"
425
+
426
+
427
+ # def evaluate_model(
428
+ # model_id: str,
429
+ # stockfish_level: str,
430
+ # n_games: int,
431
+ # progress: gr.Progress = gr.Progress(),
432
+ # ) -> str:
433
+ # """Evaluate a model against Stockfish."""
434
+ # try:
435
+ # # Import evaluation code
436
+ # import sys
437
+ # sys.path.insert(0, str(Path(__file__).parent))
438
+ #
439
+ # from src.evaluate import ChessEvaluator, load_model_from_hub
440
+ #
441
+ # progress(0, desc="Loading model...")
442
+ # model, tokenizer = load_model_from_hub(model_id)
443
+ #
444
+ # progress(0.1, desc="Setting up Stockfish...")
445
+ # level = STOCKFISH_LEVELS.get(stockfish_level, 1)
446
+ # evaluator = ChessEvaluator(
447
+ # model=model,
448
+ # tokenizer=tokenizer,
449
+ # stockfish_level=level,
450
+ # )
451
+ #
452
+ # progress(0.2, desc=f"Playing {n_games} games...")
453
+ # results = evaluator.evaluate(n_games=n_games, verbose=False)
454
+ #
455
+ # # Update leaderboard
456
+ # leaderboard = load_leaderboard()
457
+ #
458
+ # # Find or create entry
459
+ # entry = next((e for e in leaderboard if e["model_id"] == model_id), None)
460
+ # if entry is None:
461
+ # entry = {"model_id": model_id}
462
+ # leaderboard.append(entry)
463
+ #
464
+ # entry.update({
465
+ # "elo": results.get("estimated_elo", 1000),
466
+ # "win_rate": results.get("win_rate", 0),
467
+ # "games_played": entry.get("games_played", 0) + n_games,
468
+ # "illegal_rate": results.get("illegal_move_rate", 0),
469
+ # "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M"),
470
+ # })
471
+ #
472
+ # save_leaderboard(leaderboard)
473
+ #
474
+ # progress(1.0, desc="Done!")
475
+ #
476
+ # return f"""
477
+ # ## Evaluation Results for {model_id.split('/')[-1]}
478
+ #
479
+ # | Metric | Value |
480
+ # |--------|-------|
481
+ # | **Estimated ELO** | {results.get('estimated_elo', 'N/A'):.0f} |
482
+ # | **Win Rate** | {results.get('win_rate', 0)*100:.1f}% |
483
+ # | **Draw Rate** | {results.get('draw_rate', 0)*100:.1f}% |
484
+ # | **Loss Rate** | {results.get('loss_rate', 0)*100:.1f}% |
485
+ # | **Avg Game Length** | {results.get('avg_game_length', 0):.1f} moves |
486
+ # | **Illegal Move Rate** | {results.get('illegal_move_rate', 0)*100:.2f}% |
487
+ #
488
+ # Games played: {n_games} against Stockfish {stockfish_level}
489
+ # """
490
+ #
491
+ # except Exception as e:
492
+ # return f"Evaluation failed: {str(e)}"
493
 
494
 
495
  def refresh_leaderboard() -> str:
 
598
  ### Phase 1: Legal Move Evaluation
599
 
600
  Test if your model can generate **legal chess moves** in random positions.
 
601
 
602
  - Tests the model on random board positions
603
  - Measures how often it generates legal moves
 
604
  """)
605
 
606
  with gr.Row():