#!/usr/bin/env python3
"""
EthioBBPE: Production-Ready Byte-Level BPE Tokenizer Trainer
Features: Checkpointing, Compression, Parallel Processing, Robust Logging
"""

import os
import json
import gzip
import shutil
import logging
from pathlib import Path
from typing import List, Optional, Union, Dict, Any
from dataclasses import dataclass, field, asdict
from datetime import datetime

from tokenizers import ByteLevelBPETokenizer, Tokenizer, pre_tokenizers, trainers
from tokenizers.implementations import BaseTokenizer

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger("EthioBBPE")


@dataclass
class BBPEConfig:
    """Configuration for EthioBBPE training."""

    vocab_size: int = 30000
    min_frequency: int = 2
    show_progress: bool = True
    # NOTE(review): four empty strings look like placeholders whose markup was
    # stripped somewhere upstream -- confirm the intended special tokens (and
    # their order) before training a production model.
    special_tokens: List[str] = field(default_factory=lambda: ["", "", "", ""])
    lowercase: bool = False
    dropout: Optional[float] = None

    # File paths
    data_dir: str = "./data"
    model_save_dir: str = "./models"
    model_name: str = "EthioBBPE"

    # Advanced features
    use_checkpoint: bool = True
    checkpoint_dir: str = "./models/checkpoints"
    save_compressed: bool = True
    checkpoint_steps: Optional[int] = None  # Save checkpoint every N steps if custom trainer used
    num_threads: int = -1  # -1 for auto

    def save(self, path: str) -> None:
        """Save configuration to JSON.

        Args:
            path: Destination file; it is overwritten if it already exists.
        """
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(asdict(self), f, indent=2)
        logger.info(f"Configuration saved to {path}")

    @classmethod
    def load(cls, path: str) -> "BBPEConfig":
        """Load configuration from JSON.

        Args:
            path: JSON file previously written by :meth:`save`.

        Returns:
            A new ``BBPEConfig`` built from the stored key/value pairs.

        Raises:
            TypeError: If the file contains a key that is not a config field.
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls(**data)


class EthioBBPETrainer:
    """
    Production-ready trainer for Byte-Level BPE with checkpointing and compression.
    """

    # Glob patterns (in scan order) used to collect training files from a directory.
    _FILE_PATTERNS = ("**/*.txt", "**/*.jsonl", "**/*.json")

    def __init__(self, config: Optional[BBPEConfig] = None):
        """Create the trainer and make sure its output directories exist.

        Args:
            config: Training configuration. A default ``BBPEConfig`` is used
                when omitted.
        """
        self.config = config or BBPEConfig()
        self.output_dir = Path(self.config.model_save_dir)
        self.checkpoint_dir = Path(self.config.checkpoint_dir)
        self.tokenizer = None
        self.is_trained = False

        # Create directories
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)

        logger.info(f"Initialized EthioBBPETrainer with output dir: {self.output_dir}")

    def _initialize_tokenizer(self):
        """Initialize the ByteLevelBPETokenizer from the current config."""
        self.tokenizer = ByteLevelBPETokenizer(
            add_prefix_space=False,
            trim_offsets=True,
            lowercase=self.config.lowercase,
            # Previously the configured dropout was silently ignored.
            dropout=self.config.dropout,
        )
        logger.info("Tokenizer initialized")

    def _scan_directory(self, directory: Path) -> List[str]:
        """Return every file under *directory* matching ``_FILE_PATTERNS``."""
        return [
            str(found)
            for pattern in self._FILE_PATTERNS
            for found in directory.glob(pattern)
        ]

    def _resolve_files(self, files) -> List[str]:
        """Turn the ``files`` argument of :meth:`train` into a list of paths.

        Raises:
            FileNotFoundError: If ``files`` is None and ``config.data_dir`` is
                not a directory.
            ValueError: If no training file could be found.
        """
        if files is None:
            # Use data_dir from config
            data_path = Path(self.config.data_dir)
            if not data_path.is_dir():
                raise FileNotFoundError(f"Data directory not found: {self.config.data_dir}")
            file_paths = self._scan_directory(data_path)
        elif isinstance(files, (str, os.PathLike)):
            # A single path: either a directory to scan or one training file.
            path = Path(files)
            file_paths = self._scan_directory(path) if path.is_dir() else [str(path)]
        else:
            # Any iterable of paths; normalise entries (e.g. Path objects) to str.
            file_paths = [str(entry) for entry in files]

        if not file_paths:
            raise ValueError("No valid training files found.")
        return file_paths

    def _run_training(self, file_paths: List[str]) -> None:
        """Train ``self.tokenizer`` using the hyper-parameters from the config.

        ``ByteLevelBPETokenizer.train`` takes the hyper-parameters as keyword
        arguments, whereas a raw ``Tokenizer`` (which is what
        ``Tokenizer.from_file`` returns for a checkpoint) expects a trainer
        object. Both cases are handled here so that resuming from a
        checkpoint no longer fails with a ``TypeError``.
        """
        if isinstance(self.tokenizer, BaseTokenizer):
            self.tokenizer.train(
                files=file_paths,
                vocab_size=self.config.vocab_size,
                min_frequency=self.config.min_frequency,
                special_tokens=self.config.special_tokens,
                show_progress=self.config.show_progress,
            )
            return

        trainer = trainers.BpeTrainer(
            vocab_size=self.config.vocab_size,
            min_frequency=self.config.min_frequency,
            show_progress=self.config.show_progress,
            special_tokens=self.config.special_tokens,
            # Same initial alphabet ByteLevelBPETokenizer.train() uses.
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        self.tokenizer.train(file_paths, trainer=trainer)

    def train(self, files: Union[str, List[str], None] = None,
              use_checkpoint: Optional[bool] = None):
        """
        Train the tokenizer on a list of files or a directory.

        Args:
            files: Path to a file, list of files, or directory containing text files.
                If None, uses files from config.data_dir
            use_checkpoint: If True, attempts to resume from the latest checkpoint.
                Defaults to config.use_checkpoint

        Returns:
            The trained tokenizer object (also stored on ``self.tokenizer``).

        Raises:
            FileNotFoundError: If ``files`` is None and ``config.data_dir`` is
                not a directory.
            ValueError: If no training files are found.
        """
        if self.tokenizer is None:
            self._initialize_tokenizer()

        # Use config default if not specified
        if use_checkpoint is None:
            use_checkpoint = self.config.use_checkpoint

        file_paths = self._resolve_files(files)
        logger.info(f"Found {len(file_paths)} files for training.")

        # Checkpoint logic
        if use_checkpoint:
            latest_ckpt = self._get_latest_checkpoint()
            if latest_ckpt:
                logger.info(f"Resuming from checkpoint: {latest_ckpt}")
                # Checkpoints are stored in the tokenizer.json format.
                self.tokenizer = Tokenizer.from_file(str(latest_ckpt))
            else:
                logger.info("No checkpoint found. Starting from scratch.")

        # Train
        logger.info("Starting training...")
        self._run_training(file_paths)

        self.is_trained = True
        logger.info("Training completed successfully.")

        # Auto-save checkpoint after training
        self._save_checkpoint("final_pre_compress")

        return self.tokenizer

    def _get_latest_checkpoint(self) -> Optional[Path]:
        """Return the most recently modified checkpoint file, or None."""
        return max(
            self.checkpoint_dir.glob("checkpoint_*.json"),
            key=lambda p: p.stat().st_mtime,
            default=None,
        )

    def _save_checkpoint(self, name: str = "latest"):
        """Save current tokenizer state to ``checkpoint_<name>.json``.

        Does nothing when no tokenizer has been created yet.
        """
        if self.tokenizer is None:
            return

        ckpt_path = self.checkpoint_dir / f"checkpoint_{name}.json"
        self.tokenizer.save(str(ckpt_path))
        logger.info(f"Checkpoint saved to {ckpt_path}")

    def save(self, model_name: Optional[str] = None,
             compress: Optional[bool] = None) -> Path:
        """
        Save the trained tokenizer.

        ``tokenizer.json``, ``config.json`` and ``README.md`` are always
        written into the model folder.

        Args:
            model_name: Name of the model folder. Defaults to config.model_name
            compress: If True, additionally writes the vocabulary as
                ``vocab.json.gz``; if False, writes the model's own artifact
                files via ``model.save`` instead.
                Defaults to config.save_compressed.

        Returns:
            Path of the folder the model was written to.

        Raises:
            RuntimeError: If there is no tokenizer to save.
        """
        if self.tokenizer is None:
            raise RuntimeError("Tokenizer not trained yet.")

        model_name = model_name or self.config.model_name
        if compress is None:
            compress = self.config.save_compressed

        model_path = self.output_dir / model_name
        model_path.mkdir(parents=True, exist_ok=True)

        logger.info(f"Saving model to {model_path} (compressed={compress})...")

        # Standard tokenizer.json (required for HF loading) is saved in both modes.
        self.tokenizer.save(str(model_path / "tokenizer.json"))

        if compress:
            # Serialise once so the uncompressed size can be reported accurately.
            vocab_json = json.dumps(self.tokenizer.get_vocab())

            vocab_path = model_path / "vocab.json.gz"
            with gzip.open(vocab_path, 'wt', encoding='utf-8') as f:
                f.write(vocab_json)
            logger.info(f"Compressed vocab saved: {vocab_path}")

            # Compare like with like: the vocab before and after gzip.
            raw_size = len(vocab_json.encode('utf-8'))
            compressed_size = vocab_path.stat().st_size
            logger.info(
                f"Vocab compression saved: {(raw_size - compressed_size) / 1024:.2f} KB"
            )
        else:
            # Standard save
            self.tokenizer.model.save(str(model_path))
            logger.info("Standard model artifacts saved.")

        # Save config
        self.config.save(str(model_path / "config.json"))

        # Save metadata card for Hugging Face
        self._save_model_card(model_path)

        logger.info(f"Model successfully saved to {model_path}")
        return model_path

    def _save_model_card(self, path: Path):
        """Generate and save a README.md for Hugging Face Hub.

        Args:
            path: Model folder; its name is used in the usage example and the
                card is written to ``path / "README.md"``.
        """
        card_content = f"""---
language:
- multilingual
tags:
- ethiobbpe
- bpe
- tokenizer
- byte-level
license: apache-2.0
datasets:
- user-provided
---

# EthioBBPE Tokenizer

This is a production-ready Byte-Level BPE tokenizer trained for robust text processing.

## Features
- **Byte-Level**: Handles any Unicode character without out-of-vocabulary tokens.
- **Compressed Storage**: Supports gzip compression for efficient deployment.
- **Checkpointing**: Built-in safety checkpoints during training.

## Usage

### Transformers
```python
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("{path.name}")
```

### Tokenizers Library
```python
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("tokenizer.json")
```

## Training Configuration
```json
{json.dumps(asdict(self.config), indent=2)}
```
"""
        with open(path / "README.md", 'w', encoding='utf-8') as f:
            f.write(card_content)

    def _require_tokenizer(self):
        """Return the active tokenizer or raise if none has been created."""
        if self.tokenizer is None:
            raise RuntimeError("Tokenizer not initialized")
        return self.tokenizer

    def tokenize(self, text: str) -> List[str]:
        """Return the token strings produced for *text*.

        Raises:
            RuntimeError: If no tokenizer has been initialized.
        """
        return self._require_tokenizer().encode(text).tokens

    def encode(self, text: str) -> List[int]:
        """Return the token ids produced for *text*.

        Raises:
            RuntimeError: If no tokenizer has been initialized.
        """
        return self._require_tokenizer().encode(text).ids

    def decode(self, ids: List[int]) -> str:
        """Convert a list of token ids back into text.

        Raises:
            RuntimeError: If no tokenizer has been initialized.
        """
        return self._require_tokenizer().decode(ids)