"""
Train GPT-2 Medium (355M) on an expression dataset to compare with base GPT-2 (124M).
"""

import argparse
import sys
from pathlib import Path
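
# Make repo-local modules importable when this script is run directly.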
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_size", type=str, default="gpt2-medium",
                        choices=["gpt2", "gpt2-medium", "gpt2-large"],
                        help="Model size to train")
    parser.add_argument("--dataset_repo", type=str, default="augustocsc/sintetico_natural")
    parser.add_argument("--data_dir", type=str, default="700K")
    parser.add_argument("--data_column", type=str, default="i_prompt_n")
    parser.add_argument("--output_dir", type=str, default=None)
    parser.add_argument("--num_train_epochs", type=int, default=3)
    parser.add_argument("--per_device_train_batch_size", type=int, default=4)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--lora_r", type=int, default=8)
    parser.add_argument("--lora_alpha", type=int, default=32)
    args = parser.parse_args()
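
    # Derive a default output directory from the model size if none was given.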
    if args.output_dir is None:
        model_name = args.model_size.replace("-", "_")
        args.output_dir = f"./output/{model_name}_700K_json"

    print("=" * 80)
    print(f"Training {args.model_size} on expression dataset")
    print("=" * 80)
    print(f"Output dir: {args.output_dir}")
    print()
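
    # GPT-2 has no pad token, so reuse the EOS token for padding.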
    tokenizer = AutoTokenizer.from_pretrained(args.model_size)
    tokenizer.pad_token = tokenizer.eos_token

    print(f"Loading {args.model_size}...")
    model = AutoModelForCausalLM.from_pretrained(args.model_size)
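
    # Attach LoRA adapters; "c_attn" is GPT-2's fused QKV projection layer.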
    lora_config = LoraConfig(
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        target_modules=["c_attn"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
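
    # Report how small the trainable (adapter) fraction is relative to the full model.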
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params:,} / {total_params:,} "
          f"({100 * trainable_params / total_params:.2f}%)")
    print()
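
    # Pull the training split from the Hugging Face Hub.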
    print(f"Loading dataset: {args.dataset_repo}/{args.data_dir}")
    dataset = load_dataset(args.dataset_repo, data_dir=args.data_dir)
    train_dataset = dataset["train"]

    print(f"Dataset size: {len(train_dataset)} examples")
    print(f"Sample: {train_dataset[0][args.data_column][:100]}...")
    print()
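
    # Tokenize without padding; the collator pads each batch dynamically.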
    def tokenize_function(examples):
        return tokenizer(
            examples[args.data_column],
            truncation=True,
            max_length=512,
            padding=False,
        )

    print("Tokenizing dataset...")
    tokenized_dataset = train_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=train_dataset.column_names,
        desc="Tokenizing",
    )
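
    # mlm=False makes the collator produce causal-LM labels (input_ids copied to labels).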
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
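
    # Effective batch size is per_device_train_batch_size * gradient_accumulation_steps.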
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=4,
        learning_rate=args.learning_rate,
        warmup_steps=500,
        weight_decay=0.01,
        logging_steps=100,
        save_steps=1000,
        save_total_limit=2,
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()
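
    # save_model on a PEFT-wrapped model writes the LoRA adapter weights,
    # not a full copy of the base model.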
    print(f"\nSaving final model to {args.output_dir}")
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    print("\nTraining completed!")
    print(f"Model saved to: {args.output_dir}")


if __name__ == "__main__":
    main()