Spaces:
Running
Running
Commit
·
aee63fa
1
Parent(s):
c8419c3
removed interactive demo and improved username finding
Browse files- app.py +103 -60
- requirements.txt +1 -1
- src/evaluate.py +5 -0
app.py
CHANGED
|
@@ -307,6 +307,42 @@ def play_move(
|
|
| 307 |
)
|
| 308 |
|
| 309 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
def evaluate_legal_moves(
|
| 311 |
model_id: str,
|
| 312 |
n_positions: int,
|
|
@@ -332,8 +368,16 @@ def evaluate_legal_moves(
|
|
| 332 |
progress(0.2, desc=f"Testing {n_positions} positions...")
|
| 333 |
results = evaluator.evaluate_legal_moves(n_positions=n_positions, verbose=False)
|
| 334 |
|
| 335 |
-
# Extract user_id from
|
| 336 |
-
user_id = model_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
|
| 338 |
# Update leaderboard - only if improved
|
| 339 |
leaderboard = load_leaderboard()
|
|
@@ -579,62 +623,62 @@ with gr.Blocks(
|
|
| 579 |
- Try weight tying to save parameters
|
| 580 |
""")
|
| 581 |
|
| 582 |
-
# Interactive Demo Tab
|
| 583 |
-
with gr.TabItem("🎮 Interactive Demo"):
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
|
| 639 |
# Legal Move Evaluation Tab
|
| 640 |
with gr.TabItem("Legal Move Eval"):
|
|
@@ -651,9 +695,8 @@ with gr.Blocks(
|
|
| 651 |
legal_model = gr.Dropdown(
|
| 652 |
choices=get_available_models(),
|
| 653 |
label="Model to Evaluate",
|
| 654 |
-
scale=4,
|
| 655 |
)
|
| 656 |
-
refresh_legal_models_btn = gr.Button("🔄", scale=1)
|
| 657 |
legal_positions = gr.Slider(
|
| 658 |
minimum=100,
|
| 659 |
maximum=1000,
|
|
|
|
| 307 |
)
|
| 308 |
|
| 309 |
|
| 310 |
+
def get_model_submitter(model_id: str) -> Optional[str]:
    """Return the submitter's username for a model hosted on HuggingFace.

    The username is taken from the ``**Submitted by**: [username](...)`` line
    of the repo's README.md. When the README has no such line, the ``author``
    attribute of the Hub model info is used as a fallback.

    This is a best-effort lookup: any failure (download, file read, Hub
    request) is printed and reported as "unknown" rather than raised.

    Args:
        model_id: HuggingFace repo id of the model to inspect.

    Returns:
        The submitter's username, or None if it cannot be determined.
    """
    try:
        import re

        from huggingface_hub import hf_hub_download

        # Download the README.md from the model repo
        readme_path = hf_hub_download(
            repo_id=model_id,
            filename="README.md",
            token=HF_TOKEN,
        )

        # Decode explicitly as UTF-8: relying on the platform default
        # encoding can raise UnicodeDecodeError on non-ASCII README content.
        with open(readme_path, "r", encoding="utf-8") as f:
            readme_content = f.read()

        # Look for the pattern: **Submitted by**: [username](https://huggingface.co/username)
        match = re.search(r'\*\*Submitted by\*\*:\s*\[([^\]]+)\]', readme_content)
        if match:
            return match.group(1)

        # Fallback: try to get from model info
        from huggingface_hub import model_info

        info = model_info(model_id, token=HF_TOKEN)
        if info.author:
            return info.author

    except Exception as e:
        # Deliberately broad: the caller treats None as "submitter unknown"
        # and shows a user-facing error instead of crashing the evaluation.
        print(f"Could not extract submitter from model: {e}")

    return None
|
| 344 |
+
|
| 345 |
+
|
| 346 |
def evaluate_legal_moves(
|
| 347 |
model_id: str,
|
| 348 |
n_positions: int,
|
|
|
|
| 368 |
progress(0.2, desc=f"Testing {n_positions} positions...")
|
| 369 |
results = evaluator.evaluate_legal_moves(n_positions=n_positions, verbose=False)
|
| 370 |
|
| 371 |
+
# Extract user_id from model's README (submitted by field)
|
| 372 |
+
user_id = get_model_submitter(model_id)
|
| 373 |
+
if user_id is None:
|
| 374 |
+
return f"""## Evaluation Failed
|
| 375 |
+
|
| 376 |
+
Could not determine the submitter for model `{model_id}`.
|
| 377 |
+
|
| 378 |
+
Please ensure your model was submitted using the official submission script (`submit.py`),
|
| 379 |
+
which adds the required metadata to the README.md file.
|
| 380 |
+
"""
|
| 381 |
|
| 382 |
# Update leaderboard - only if improved
|
| 383 |
leaderboard = load_leaderboard()
|
|
|
|
| 623 |
- Try weight tying to save parameters
|
| 624 |
""")
|
| 625 |
|
| 626 |
+
# Interactive Demo Tab (commented out for now)
|
| 627 |
+
# with gr.TabItem("🎮 Interactive Demo"):
|
| 628 |
+
# gr.Markdown("### Test a Model")
|
| 629 |
+
#
|
| 630 |
+
# with gr.Row():
|
| 631 |
+
# with gr.Column(scale=1):
|
| 632 |
+
# with gr.Row():
|
| 633 |
+
# model_dropdown = gr.Dropdown(
|
| 634 |
+
# choices=get_available_models(),
|
| 635 |
+
# label="Select Model",
|
| 636 |
+
# value=None,
|
| 637 |
+
# scale=4,
|
| 638 |
+
# )
|
| 639 |
+
# refresh_models_btn = gr.Button("🔄", scale=1)
|
| 640 |
+
# temperature_slider = gr.Slider(
|
| 641 |
+
# minimum=0.1,
|
| 642 |
+
# maximum=2.0,
|
| 643 |
+
# value=0.7,
|
| 644 |
+
# step=0.1,
|
| 645 |
+
# label="Temperature",
|
| 646 |
+
# )
|
| 647 |
+
#
|
| 648 |
+
# with gr.Row():
|
| 649 |
+
# play_btn = gr.Button("Model Move", variant="primary")
|
| 650 |
+
# reset_btn = gr.Button("Reset")
|
| 651 |
+
#
|
| 652 |
+
# status_text = gr.Textbox(label="Status", interactive=False)
|
| 653 |
+
#
|
| 654 |
+
# with gr.Column(scale=1):
|
| 655 |
+
# board_display = gr.HTML(value=render_board_svg())
|
| 656 |
+
#
|
| 657 |
+
# # Hidden state
|
| 658 |
+
# current_fen = gr.State("startpos")
|
| 659 |
+
# move_history = gr.State("")
|
| 660 |
+
#
|
| 661 |
+
# def refresh_models():
|
| 662 |
+
# return gr.update(choices=get_available_models())
|
| 663 |
+
#
|
| 664 |
+
# refresh_models_btn.click(
|
| 665 |
+
# refresh_models,
|
| 666 |
+
# outputs=[model_dropdown],
|
| 667 |
+
# )
|
| 668 |
+
#
|
| 669 |
+
# play_btn.click(
|
| 670 |
+
# play_move,
|
| 671 |
+
# inputs=[model_dropdown, current_fen, move_history, temperature_slider],
|
| 672 |
+
# outputs=[board_display, current_fen, move_history, status_text],
|
| 673 |
+
# )
|
| 674 |
+
#
|
| 675 |
+
# def reset_game():
|
| 676 |
+
# return render_board_svg(), "startpos", "", "Game reset!"
|
| 677 |
+
#
|
| 678 |
+
# reset_btn.click(
|
| 679 |
+
# reset_game,
|
| 680 |
+
# outputs=[board_display, current_fen, move_history, status_text],
|
| 681 |
+
# )
|
| 682 |
|
| 683 |
# Legal Move Evaluation Tab
|
| 684 |
with gr.TabItem("Legal Move Eval"):
|
|
|
|
| 695 |
legal_model = gr.Dropdown(
|
| 696 |
choices=get_available_models(),
|
| 697 |
label="Model to Evaluate",
|
|
|
|
| 698 |
)
|
| 699 |
+
refresh_legal_models_btn = gr.Button("🔄", scale=0, min_width=40)
|
| 700 |
legal_positions = gr.Slider(
|
| 701 |
minimum=100,
|
| 702 |
maximum=1000,
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
gradio>=4.44.0
|
| 2 |
transformers>=4.40.0
|
| 3 |
torch>=2.0.0
|
| 4 |
python-chess>=1.999
|
|
|
|
| 1 |
+
gradio>=4.44.1
|
| 2 |
transformers>=4.40.0
|
| 3 |
torch>=2.0.0
|
| 4 |
python-chess>=1.999
|
src/evaluate.py
CHANGED
|
@@ -300,6 +300,7 @@ class ChessEvaluator:
|
|
| 300 |
n_positions: int = 1000,
|
| 301 |
temperature: float = 0.7,
|
| 302 |
verbose: bool = True,
|
|
|
|
| 303 |
) -> dict:
|
| 304 |
"""
|
| 305 |
Evaluate the model's ability to generate legal moves.
|
|
@@ -311,10 +312,14 @@ class ChessEvaluator:
|
|
| 311 |
n_positions: Number of positions to test.
|
| 312 |
temperature: Sampling temperature.
|
| 313 |
verbose: Whether to print progress.
|
|
|
|
| 314 |
|
| 315 |
Returns:
|
| 316 |
Dictionary with legal move statistics.
|
| 317 |
"""
|
|
|
|
|
|
|
|
|
|
| 318 |
results = {
|
| 319 |
"total_positions": 0,
|
| 320 |
"legal_first_try": 0,
|
|
|
|
| 300 |
n_positions: int = 1000,
|
| 301 |
temperature: float = 0.7,
|
| 302 |
verbose: bool = True,
|
| 303 |
+
seed: int = 42,
|
| 304 |
) -> dict:
|
| 305 |
"""
|
| 306 |
Evaluate the model's ability to generate legal moves.
|
|
|
|
| 312 |
n_positions: Number of positions to test.
|
| 313 |
temperature: Sampling temperature.
|
| 314 |
verbose: Whether to print progress.
|
| 315 |
+
seed: Random seed for reproducibility.
|
| 316 |
|
| 317 |
Returns:
|
| 318 |
Dictionary with legal move statistics.
|
| 319 |
"""
|
| 320 |
+
# Set seed for deterministic evaluation
|
| 321 |
+
random.seed(seed)
|
| 322 |
+
|
| 323 |
results = {
|
| 324 |
"total_positions": 0,
|
| 325 |
"legal_first_try": 0,
|