nathanael-fijalkow committed on
Commit
be9adf7
·
1 Parent(s): f97097f

add register_for_auto_class("AutoTokenizer")

Browse files
Files changed (3) hide show
  1. app.py +17 -1
  2. src/evaluate.py +17 -2
  3. submit.py +4 -0
app.py CHANGED
@@ -341,12 +341,20 @@ def evaluate_legal_moves(
341
  """Evaluate a model's legal move generation."""
342
  try:
343
  import sys
 
 
 
344
  sys.path.insert(0, str(Path(__file__).parent))
345
 
346
  from src.evaluate import ChessEvaluator, load_model_from_hub
347
 
348
  progress(0, desc="Loading model...")
349
- model, tokenizer = load_model_from_hub(model_id)
 
 
 
 
 
350
 
351
  progress(0.1, desc="Setting up evaluator...")
352
  evaluator = ChessEvaluator(
@@ -409,6 +417,9 @@ which adds the required metadata to the README.md file.
409
 
410
  progress(1.0, desc="Done!")
411
 
 
 
 
412
  return f"""
413
  ## Legal Move Evaluation for {model_id.split('/')[-1]}
414
 
@@ -419,6 +430,11 @@ which adds the required metadata to the README.md file.
419
  | **Legal (with retries)** | {results['legal_first_try'] + results['legal_with_retry']} ({results['legal_rate_with_retry']*100:.1f}%) |
420
  | **Always Illegal** | {results['illegal_all_retries']} ({results['illegal_rate']*100:.1f}%) |
421
 
 
 
 
 
 
422
  ### Leaderboard Update
423
  {update_message}
424
 
 
341
  """Evaluate a model's legal move generation."""
342
  try:
343
  import sys
344
+ import io
345
+ from contextlib import redirect_stdout
346
+
347
  sys.path.insert(0, str(Path(__file__).parent))
348
 
349
  from src.evaluate import ChessEvaluator, load_model_from_hub
350
 
351
  progress(0, desc="Loading model...")
352
+
353
+ # Capture tokenizer debug info
354
+ debug_output = io.StringIO()
355
+ with redirect_stdout(debug_output):
356
+ model, tokenizer = load_model_from_hub(model_id, verbose=True)
357
+ tokenizer_info = debug_output.getvalue()
358
 
359
  progress(0.1, desc="Setting up evaluator...")
360
  evaluator = ChessEvaluator(
 
417
 
418
  progress(1.0, desc="Done!")
419
 
420
+ # Format tokenizer info for display
421
+ tokenizer_debug = tokenizer_info.strip().replace(" ", "- ")
422
+
423
  return f"""
424
  ## Legal Move Evaluation for {model_id.split('/')[-1]}
425
 
 
430
  | **Legal (with retries)** | {results['legal_first_try'] + results['legal_with_retry']} ({results['legal_rate_with_retry']*100:.1f}%) |
431
  | **Always Illegal** | {results['illegal_all_retries']} ({results['illegal_rate']*100:.1f}%) |
432
 
433
+ ### Tokenizer Info
434
+ ```
435
+ {tokenizer_debug}
436
+ ```
437
+
438
  ### Leaderboard Update
439
  {update_message}
440
 
src/evaluate.py CHANGED
@@ -750,13 +750,14 @@ class ChessEvaluator:
750
  return results
751
 
752
 
753
- def load_model_from_hub(model_id: str, device: str = "auto"):
754
  """
755
  Load a model from the Hugging Face Hub.
756
 
757
  Args:
758
  model_id: Model ID on Hugging Face Hub.
759
  device: Device to load the model on.
 
760
 
761
  Returns:
762
  Tuple of (model, tokenizer).
@@ -769,10 +770,15 @@ def load_model_from_hub(model_id: str, device: str = "auto"):
769
 
770
  # Try AutoTokenizer with trust_remote_code first to load custom tokenizer.py from Hub
771
  # Fall back to local ChessTokenizer if the model doesn't have a custom tokenizer
 
772
  try:
773
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
774
- except Exception:
 
 
 
775
  tokenizer = ChessTokenizer.from_pretrained(model_id)
 
776
 
777
  model = AutoModelForCausalLM.from_pretrained(
778
  model_id,
@@ -780,6 +786,15 @@ def load_model_from_hub(model_id: str, device: str = "auto"):
780
  device_map=device,
781
  )
782
 
 
 
 
 
 
 
 
 
 
783
  return model, tokenizer
784
 
785
 
 
750
  return results
751
 
752
 
753
+ def load_model_from_hub(model_id: str, device: str = "auto", verbose: bool = True):
754
  """
755
  Load a model from the Hugging Face Hub.
756
 
757
  Args:
758
  model_id: Model ID on Hugging Face Hub.
759
  device: Device to load the model on.
760
+ verbose: Whether to print debug info about loaded tokenizer.
761
 
762
  Returns:
763
  Tuple of (model, tokenizer).
 
770
 
771
  # Try AutoTokenizer with trust_remote_code first to load custom tokenizer.py from Hub
772
  # Fall back to local ChessTokenizer if the model doesn't have a custom tokenizer
773
+ tokenizer_source = None
774
  try:
775
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
776
+ tokenizer_source = "AutoTokenizer (from Hub with trust_remote_code=True)"
777
+ except Exception as e:
778
+ if verbose:
779
+ print(f" AutoTokenizer failed: {e}")
780
  tokenizer = ChessTokenizer.from_pretrained(model_id)
781
+ tokenizer_source = "ChessTokenizer (local class, vocab from Hub)"
782
 
783
  model = AutoModelForCausalLM.from_pretrained(
784
  model_id,
 
786
  device_map=device,
787
  )
788
 
789
+ # Print debug info
790
+ if verbose:
791
+ print(f" Tokenizer loaded via: {tokenizer_source}")
792
+ print(f" Tokenizer class: {type(tokenizer).__name__}")
793
+ print(f" Tokenizer vocab size: {tokenizer.vocab_size}")
794
+ # Check if tokenizer has custom attributes that might differ
795
+ if hasattr(tokenizer, '_vocab'):
796
+ print(f" Tokenizer has _vocab attribute: yes ({len(tokenizer._vocab)} entries)")
797
+
798
  return model, tokenizer
799
 
800
 
submit.py CHANGED
@@ -78,6 +78,10 @@ def main():
78
  with tempfile.TemporaryDirectory() as tmp_dir:
79
  tmp_path = Path(tmp_dir)
80
 
 
 
 
 
81
  # Save model and tokenizer
82
  model.save_pretrained(tmp_path)
83
  tokenizer.save_pretrained(tmp_path)
 
78
  with tempfile.TemporaryDirectory() as tmp_dir:
79
  tmp_path = Path(tmp_dir)
80
 
81
+ # Register tokenizer for AutoTokenizer so it can be loaded with trust_remote_code=True
82
+ # This adds the 'auto_map' field to tokenizer_config.json
83
+ tokenizer.register_for_auto_class("AutoTokenizer")
84
+
85
  # Save model and tokenizer
86
  model.save_pretrained(tmp_path)
87
  tokenizer.save_pretrained(tmp_path)