"""Fine-tune ModernBERT-large with masked language modeling (MLM) on
concatenated .en.txt subtitle text, using [BRK] as a segment-break token."""

import os
import re
import glob
from pathlib import Path

from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import Dataset
import torch


def load_and_process_data(data_dir: str) -> str:
    """
    Load all .en.txt files, remove timestamps, and concatenate with [BRK].

    Args:
        data_dir: Directory containing the .en.txt files

    Returns:
        Concatenated text with [BRK] separators
    """
    pattern = os.path.join(data_dir, "*.en.txt")
    files = glob.glob(pattern)

    if not files:
        raise ValueError(f"No .en.txt files found in {data_dir}")

    print(f"Found {len(files)} .en.txt files")

    all_segments = []

    for file_path in sorted(files):
        print(f"Processing {os.path.basename(file_path)}...")
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

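                # Strip inline timestamp markers like [12.3] or [45]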
                line = re.sub(r'\[\d+\.?\d*\]', '', line)
                line = line.strip()

                if line:
                    all_segments.append(line)

    concatenated_text = " [BRK] ".join(all_segments)

    print(f"Total segments: {len(all_segments)}")
    print(f"Total text length: {len(concatenated_text)} characters")

    return concatenated_text


def prepare_dataset(text: str, tokenizer, max_length: int = 512):
    """
    Tokenize the text and create a dataset for training.
    Preserves [BRK] tokens in the training data so the model can learn to generate them.
    Splits by token count only, not by [BRK] boundaries.

    Args:
        text: The concatenated text with [BRK] tokens
        tokenizer: The tokenizer to use
        max_length: Maximum sequence length

    Returns:
        Dataset ready for training
    """
    print("Tokenizing full text...")
    full_tokens = tokenizer(text, add_special_tokens=False, return_offsets_mapping=False)
    input_ids = full_tokens['input_ids']

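    # Reserve two slots per chunk for the special tokens (e.g. [CLS]/[SEP])
    # that tokenize_function adds when the decoded chunks are re-tokenized.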
    chunk_size = max_length - 2
    examples = []

    for i in range(0, len(input_ids), chunk_size):
        chunk_ids = input_ids[i:i + chunk_size]
        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=False)
        examples.append(chunk_text)

    print(f"Created {len(examples)} training examples")

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )

    dataset = Dataset.from_dict({"text": examples})
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
    )

    return tokenized_dataset


def main():
    model_name = "answerdotai/ModernBERT-large"
    data_dir = "/home/allen/Codes/metricsubs-chunktranslate/data"
    output_dir = "."

    print("=" * 60)
    print("ModernBERT-large Fine-tuning Script")
    print("=" * 60)

    print("\n[1/4] Loading model and tokenizer from HuggingFace...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    print("Adding [BRK] as a special token...")
    special_tokens_dict = {"additional_special_tokens": ["[BRK]"]}
    tokenizer.add_special_tokens(special_tokens_dict)
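    # Grow the embedding matrix so the newly added [BRK] token gets its own
    # embedding row.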
    model.resize_token_embeddings(len(tokenizer))

    print(f"Model loaded: {model_name}")
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"Vocabulary size: {len(tokenizer)}")

    print("\n[2/4] Loading and processing training data...")
    concatenated_text = load_and_process_data(data_dir)

    print("\n[3/4] Preparing dataset...")
    train_dataset = prepare_dataset(concatenated_text, tokenizer, max_length=512)

    print("\n[4/4] Setting up training...")

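    # The collator dynamically masks 15% of tokens in each batch for the
    # masked-language-modeling objective.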
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15,
    )

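    # Effective batch size: 4 per device x 4 accumulation steps = 16 sequences
    # per optimizer step.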
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_steps=500,
        logging_steps=100,
        save_steps=1000,
        save_total_limit=3,
        prediction_loss_only=True,
        fp16=torch.cuda.is_available(),
        dataloader_pin_memory=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    print("\nStarting training...")
    print(f"Training on {'GPU' if torch.cuda.is_available() else 'CPU'}")
    trainer.train()

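    # Save both the fine-tuned weights and the tokenizer (which now includes
    # the [BRK] special token) so they can be reloaded together.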
    print(f"\nSaving model to {output_dir}...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("\n" + "=" * 60)
    print("Fine-tuning complete!")
    print(f"Model saved to: {os.path.abspath(output_dir)}")
    print("=" * 60)


if __name__ == "__main__":
    main()