import os import re import glob from pathlib import Path from transformers import ( AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, ) from datasets import Dataset import torch def load_and_process_data(data_dir: str) -> str: """ Load all .en.txt files, remove timestamps, and concatenate with [BRK]. Args: data_dir: Directory containing the .en.txt files Returns: Concatenated text with [BRK] separators """ pattern = os.path.join(data_dir, "*.en.txt") files = glob.glob(pattern) if not files: raise ValueError(f"No .en.txt files found in {data_dir}") print(f"Found {len(files)} .en.txt files") all_segments = [] for file_path in sorted(files): print(f"Processing {os.path.basename(file_path)}...") with open(file_path, 'r', encoding='utf-8') as f: for line in f: line = line.strip() if not line: # Skip empty lines continue # Remove timestamps in brackets like [0.00] or [2.30] # Pattern matches [number.number] or [number:number:number] line = re.sub(r'\[\d+\.?\d*\]', '', line) line = line.strip() if line: # Only add non-empty lines after timestamp removal all_segments.append(line) # Concatenate all segments with [BRK] concatenated_text = " [BRK] ".join(all_segments) print(f"Total segments: {len(all_segments)}") print(f"Total text length: {len(concatenated_text)} characters") return concatenated_text def prepare_dataset(text: str, tokenizer, max_length: int = 512): """ Tokenize the text and create a dataset for training. Preserves [BRK] tokens in the training data so the model can learn to generate them. Splits by token count only, not by [BRK] boundaries. Args: text: The concatenated text with [BRK] tokens tokenizer: The tokenizer to use max_length: Maximum sequence length Returns: Dataset ready for training """ # Tokenize the entire text first to split by token count # This preserves [BRK] tokens within chunks print("Tokenizing full text...") full_tokens = tokenizer(text, add_special_tokens=False, return_offsets_mapping=False) input_ids = full_tokens['input_ids'] # Split into chunks of max_length tokens # The tokenizer will add CLS and SEP tokens, so we use max_length directly # and let truncation handle it, or we can be more precise chunk_size = max_length - 2 # Reserve space for CLS and SEP tokens examples = [] for i in range(0, len(input_ids), chunk_size): chunk_ids = input_ids[i:i + chunk_size] # Decode back to text to preserve [BRK] tokens, then re-tokenize with special tokens chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=False) examples.append(chunk_text) print(f"Created {len(examples)} training examples") # Tokenize all examples with proper special tokens def tokenize_function(examples): return tokenizer( examples["text"], truncation=True, max_length=max_length, padding="max_length", ) dataset = Dataset.from_dict({"text": examples}) tokenized_dataset = dataset.map( tokenize_function, batched=True, remove_columns=["text"], ) return tokenized_dataset def main(): # Configuration model_name = "answerdotai/ModernBERT-large" data_dir = "/home/allen/Codes/metricsubs-chunktranslate/data" output_dir = "." print("=" * 60) print("ModernBERT-large Fine-tuning Script") print("=" * 60) # Step 1: Load model and tokenizer print("\n[1/4] Loading model and tokenizer from HuggingFace...") tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForMaskedLM.from_pretrained(model_name) # Add [BRK] as a special token print("Adding [BRK] as a special token...") special_tokens_dict = {"additional_special_tokens": ["[BRK]"]} tokenizer.add_special_tokens(special_tokens_dict) model.resize_token_embeddings(len(tokenizer)) print(f"Model loaded: {model_name}") print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}") print(f"Vocabulary size: {len(tokenizer)}") # Step 2: Load and process data print("\n[2/4] Loading and processing training data...") concatenated_text = load_and_process_data(data_dir) # Step 3: Prepare dataset print("\n[3/4] Preparing dataset...") train_dataset = prepare_dataset(concatenated_text, tokenizer, max_length=512) # Step 4: Set up training print("\n[4/4] Setting up training...") # Data collator for MLM data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=True, mlm_probability=0.15, ) # Training arguments training_args = TrainingArguments( output_dir=output_dir, overwrite_output_dir=True, num_train_epochs=3, per_device_train_batch_size=4, gradient_accumulation_steps=4, learning_rate=2e-5, weight_decay=0.01, warmup_steps=500, logging_steps=100, save_steps=1000, save_total_limit=3, prediction_loss_only=True, fp16=torch.cuda.is_available(), # Use mixed precision if GPU available dataloader_pin_memory=True, ) # Initialize trainer trainer = Trainer( model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, ) # Train print("\nStarting training...") print(f"Training on {'GPU' if torch.cuda.is_available() else 'CPU'}") trainer.train() # Save the final model print(f"\nSaving model to {output_dir}...") model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) print("\n" + "=" * 60) print("Fine-tuning complete!") print(f"Model saved to: {os.path.abspath(output_dir)}") print("=" * 60) if __name__ == "__main__": main()