# spm_uk_64k / tokenizer.py — uploaded via huggingface_hub by dovcharenko (commit 5996cd9, verified)
#!/usr/bin/env python3
"""
Export SentencePiece model to HuggingFace tokenizer format.
Uses a custom tokenizer class that properly wraps SentencePiece.
"""
import argparse
import shutil
import json
from pathlib import Path
import sentencepiece as spm
from transformers import PreTrainedTokenizer
def read_specials(path: str):
    """Read special tokens from *path*, one non-empty token per line.

    Whitespace is stripped from each line; blank lines are dropped.
    Returns an empty list when *path* is falsy (no specials file given).
    """
    if not path:
        return []
    raw_lines = Path(path).read_text(encoding="utf-8").splitlines()
    return [token for token in (line.strip() for line in raw_lines) if token]
def verify_spm_model(model_path: Path):
    """Load a SentencePiece model and smoke-test an encode/decode round trip.

    Prints diagnostics, and returns ``(vocab_size, processor)`` on success.
    Any failure (load or round trip) is logged and re-raised so the caller
    aborts the export.
    """
    try:
        sp = spm.SentencePieceProcessor()
        sp.Load(str(model_path))
        vocab_size = sp.GetPieceSize()
        print(f"[VERIFY] SentencePiece model loaded: {vocab_size} pieces")
        # Round-trip a short Ukrainian word to prove the model is usable.
        test_text = "Україна"
        ids = sp.EncodeAsIds(test_text)
        decoded = sp.DecodeIds(ids)
        print(f"[VERIFY] Test encode/decode: '{test_text}' -> {len(ids)} tokens -> '{decoded}'")
    except Exception as e:
        print(f"[ERROR] Failed to load SentencePiece model: {e}")
        raise
    return vocab_size, sp
class SentencePieceTokenizer(PreTrainedTokenizer):
    """Slow HF tokenizer that delegates all tokenization to SentencePiece.

    The vocabulary lives entirely in the ``spiece.model`` file; this class
    only bridges the SentencePiece API to the ``PreTrainedTokenizer``
    interface (``_tokenize`` / ``_convert_token_to_id`` / ...).
    """

    # Filename save_pretrained/from_pretrained use for the vocab artifact.
    vocab_files_names = {"vocab_file": "spiece.model"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        **kwargs
    ):
        # Initialize the SentencePiece model FIRST, before super().__init__(),
        # because PreTrainedTokenizer.__init__ may access vocab_size.
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)
        # Cache the size in a private attribute; object.__setattr__ bypasses
        # any property machinery PreTrainedTokenizer may install.
        object.__setattr__(self, '_vocab_size', self.sp_model.GetPieceSize())
        # Don't pass vocab_size as a kwarg — the property below serves it.
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            **kwargs
        )

    @property
    def vocab_size(self):
        """Base vocabulary size from the SentencePiece model (excludes added tokens)."""
        if hasattr(self, '_vocab_size'):
            return self._vocab_size
        elif hasattr(self, 'sp_model'):
            return self.sp_model.GetPieceSize()
        else:
            # Defensive: called before __init__ finished wiring the model.
            return 0

    def get_vocab(self):
        """Return the full token->id mapping (base pieces plus added tokens)."""
        vocab_size = self._vocab_size if hasattr(self, '_vocab_size') else self.sp_model.GetPieceSize()
        vocab = {self.sp_model.IdToPiece(i): i for i in range(vocab_size)}
        # HF slow tokenizers include tokens registered via add_special_tokens /
        # add_tokens in get_vocab(); the original omitted them.
        vocab.update(getattr(self, "added_tokens_encoder", {}))
        return vocab

    def _tokenize(self, text):
        """Split *text* into SentencePiece subword pieces."""
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """Map a piece string to its id (unknown pieces map to the unk id)."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Map an id back to its piece string."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """Detokenize a list of pieces back into plain text."""
        return self.sp_model.DecodePieces(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Copy the SentencePiece model file into *save_directory*.

        Honors *filename_prefix* as required by the save_pretrained contract
        (the previous implementation silently ignored it). Returns a 1-tuple
        with the written path.
        """
        save_dir = Path(save_directory)
        save_dir.mkdir(parents=True, exist_ok=True)
        prefix = f"{filename_prefix}-" if filename_prefix else ""
        out_vocab_file = save_dir / (prefix + self.vocab_files_names["vocab_file"])
        # Skip the copy when source and destination are the same file —
        # shutil.copy2 would raise SameFileError.
        if Path(self.vocab_file).resolve() == out_vocab_file.resolve():
            return (str(out_vocab_file),)
        shutil.copy2(self.vocab_file, out_vocab_file)
        return (str(out_vocab_file),)
def main():
    """Export a SentencePiece model directory as a HuggingFace tokenizer.

    Steps: verify the .model file, copy it into the output dir, wrap it in
    SentencePieceTokenizer, register extra special tokens, save, ship this
    script for trust_remote_code loading, patch tokenizer_config.json, and
    run a round-trip sanity test.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--spm_model", required=True, help="Path to .model file")
    ap.add_argument("--specials", required=True, help="Special tokens file (one per line)")
    ap.add_argument("--out_dir", required=True, help="HF tokenizer output dir")
    args = ap.parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    spm_model_path = Path(args.spm_model)
    if not spm_model_path.exists():
        raise FileNotFoundError(f"SentencePiece model not found: {spm_model_path}")

    # Fail fast: prove the SentencePiece model loads and round-trips.
    print("🔍 Verifying SentencePiece model...")
    spm_vocab_size, sp_model = verify_spm_model(spm_model_path)

    specials = read_specials(args.specials)
    print(f"[INFO] Found {len(specials)} special tokens")

    # Copy the .model file under the name the tokenizer class expects
    # (see SentencePieceTokenizer.vocab_files_names).
    output_model = out_dir / "spiece.model"
    print(f"[COPY] Copying {spm_model_path} -> {output_model}")
    shutil.copy2(spm_model_path, output_model)
    if not output_model.exists():
        raise FileNotFoundError(f"Failed to copy model file to {output_model}")
    file_size = output_model.stat().st_size
    print(f"[COPY] Model file size: {file_size:,} bytes")

    print("[INIT] Initializing SentencePieceTokenizer...")
    tok = SentencePieceTokenizer(
        vocab_file=str(output_model),
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
    )
    print(f"[INIT] Vocab size: {tok.vocab_size} (expected: {spm_vocab_size})")
    if tok.vocab_size != spm_vocab_size:
        raise RuntimeError(f"Vocab size mismatch: {tok.vocab_size} != {spm_vocab_size}")

    if specials:
        print(f"[ADD] Adding {len(specials)} special tokens...")
        # The special pieces already exist inside the SentencePiece model;
        # this only registers them with the HF tokenizer config.
        tok.add_special_tokens({"additional_special_tokens": specials})
        print("[ADD] Special tokens registered")

    print(f"[SAVE] Saving tokenizer to {out_dir}...")
    tok.save_pretrained(str(out_dir))

    # Ship this script next to the tokenizer so AutoTokenizer can import the
    # custom class via auto_map (requires trust_remote_code=True).
    script_path = Path(__file__).absolute()
    tokenizer_py = out_dir / "tokenizer.py"
    print(f"[COPY] Copying tokenizer class to {tokenizer_py}...")
    shutil.copy2(script_path, tokenizer_py)

    # Merge our keys into the config that save_pretrained just wrote instead
    # of clobbering it: the original rewrite dropped keys (e.g.
    # added_tokens_decoder) that transformers needs to reload the tokenizer.
    tokenizer_config_path = out_dir / "tokenizer_config.json"
    config = {}
    if tokenizer_config_path.exists():
        config = json.loads(tokenizer_config_path.read_text(encoding="utf-8"))
    config.update({
        "tokenizer_class": "SentencePieceTokenizer",
        "auto_map": {
            "AutoTokenizer": ["tokenizer.SentencePieceTokenizer", None]
        },
        "model_type": "llama",
        "vocab_size": tok.vocab_size,
        "bos_token": "<s>",
        "eos_token": "</s>",
        "unk_token": "<unk>",
        "pad_token": "<pad>",
    })
    if specials:
        config["additional_special_tokens"] = specials
    # ensure_ascii=False keeps Cyrillic special tokens human-readable on disk.
    tokenizer_config_path.write_text(
        json.dumps(config, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    print("[CONFIG] Updated tokenizer_config.json with auto_map")

    print("\n[OK] saved HF tokenizer to:", out_dir)
    print(f"[INFO] vocab_size: {tok.vocab_size}")
    print(f"[INFO] added_special_tokens: {len(specials)}")

    # Sanity-check the exported tokenizer with an encode/decode round trip.
    print("\n[TEST] Testing tokenizer...")
    test_text = "Україна — це країна"
    tokens = tok.encode(test_text, add_special_tokens=False)
    decoded = tok.decode(tokens)
    print(f"[TEST] Encoded '{test_text}' -> {len(tokens)} tokens")
    print(f"[TEST] Decoded back: '{decoded}'")
    if specials:
        test_with_special = "<user>Привіт</user><assistant>Вітаю!</assistant>"
        tokens_special = tok.encode(test_with_special, add_special_tokens=False)
        print(f"[TEST] Special tokens test: {len(tokens_special)} tokens")
    if len(tokens) == 0:
        print("\n⚠️ WARNING: Tokenizer encoded 0 tokens!")
        print(" The tokenizer may not be working correctly.")
    else:
        print("\n✅ Tokenizer export successful!")
    print("\n💡 To use this tokenizer:")
    print(" # Method 1: Using AutoTokenizer (recommended)")
    print(" from transformers import AutoTokenizer")
    print(f" tok = AutoTokenizer.from_pretrained('{out_dir}', trust_remote_code=True)")
    print("\n # Method 2: Direct import")
    print(f" import sys; sys.path.insert(0, '{out_dir}')")
    print(" from tokenizer import SentencePieceTokenizer")
    print(f" tok = SentencePieceTokenizer.from_pretrained('{out_dir}')")
# Script entry point: parse CLI arguments and run the export.
if __name__ == "__main__":
    main()