""" |
|
|
Train GPT-2 variants with JSON format and early stopping. |
|
|
FIXED VERSION with proper data format conversion. |
|
|
""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import json |
|
|
import argparse |
|
|
from pathlib import Path |
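
# Put the repository root on sys.path so project modules can be imported.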
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model


def convert_to_json_format(example):
    """Convert an example from 'key: value' lines into a single JSON string."""
    text = example['p_prompt_n_converted']

    lines = text.strip().split('\n')
    data = {}

    for line in lines:
        if ':' in line:
            key, value = line.split(':', 1)
            key = key.strip()
            value = value.strip()
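
            # Comma-separated fields become JSON lists (note 'oper' is emitted
            # as 'ops'); scalar fields are kept as strings.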
            if key == 'vars':
                data['vars'] = [v.strip() for v in value.split(',')]
            elif key == 'oper':
                data['ops'] = [o.strip() for o in value.split(',')]
            elif key == 'cons':
                data['cons'] = value
            elif key == 'expr':
                data['expr'] = value

    json_str = json.dumps(data, ensure_ascii=False)
    return {'text': json_str}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_size", type=str, default="gpt2-medium",
                        choices=["gpt2", "gpt2-medium", "gpt2-large"],
                        help="Model size to train")
    parser.add_argument("--dataset_repo", type=str, default="augustocsc/sintetico_natural")
    parser.add_argument("--data_dir", type=str, default="700K")
    parser.add_argument("--output_dir", type=str, default=None)
    parser.add_argument("--num_train_epochs", type=int, default=3)
    parser.add_argument("--per_device_train_batch_size", type=int, default=4)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--lora_r", type=int, default=8)
    parser.add_argument("--lora_alpha", type=int, default=32)
    parser.add_argument("--early_stopping_patience", type=int, default=3)
    args = parser.parse_args()
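
    # Derive a default output directory from the model size and data split.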
    if args.output_dir is None:
        model_name = args.model_size.replace("-", "_")
        args.output_dir = f"./output/{model_name}_{args.data_dir}_json"

    print("=" * 80)
    print(f"Training {args.model_size} with JSON format + Early Stopping")
    print("=" * 80)
    print(f"Output dir: {args.output_dir}")
    print(f"Early stopping patience: {args.early_stopping_patience}")
    print()
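
    # GPT-2 checkpoints ship without a pad token, so reuse EOS for padding.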
    tokenizer = AutoTokenizer.from_pretrained(args.model_size)
    tokenizer.pad_token = tokenizer.eos_token

    print(f"Loading {args.model_size}...")
    model = AutoModelForCausalLM.from_pretrained(args.model_size)
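
    # Wrap the model with LoRA adapters on GPT-2's fused attention projection
    # ("c_attn"), so only a small fraction of the weights is trained.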
    lora_config = LoraConfig(
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        target_modules=["c_attn"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable: {trainable_params:,} / {total_params:,} ({100 * trainable_params / total_params:.2f}%)")
    print()

    print(f"Loading dataset: {args.dataset_repo}/{args.data_dir}")
    dataset = load_dataset(args.dataset_repo, data_dir=args.data_dir)

    print("Original format sample:")
    print(dataset["train"][0]['p_prompt_n_converted'][:150])
    print()
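
    # Rewrite every example as a JSON string and drop the original text column.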
    print("Converting to JSON format...")
    train_dataset = dataset["train"].map(convert_to_json_format, remove_columns=['p_prompt_n_converted'])
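
    # Hold out 10% for evaluation; the fixed seed keeps the split reproducible.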
    split_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset['train']
    eval_dataset = split_dataset['test']

    print(f"Train size: {len(train_dataset):,}")
    print(f"Eval size: {len(eval_dataset):,}")
    print()
    print("JSON format sample:")
    print(train_dataset[0]['text'][:150])
    print()
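
    # Tokenize without padding here; the collator pads each batch dynamically.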
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            truncation=True,
            max_length=512,
            padding=False,
        )

    print("Tokenizing datasets...")
    train_tokenized = train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
    eval_tokenized = eval_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
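
    # mlm=False selects causal language modeling: labels are a copy of the
    # input ids with padding positions masked out.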
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
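
    # Evaluate and checkpoint every 500 steps; save_steps must stay a round
    # multiple of eval_steps when load_best_model_at_end=True.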
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=4,
        learning_rate=args.learning_rate,
        warmup_steps=500,
        weight_decay=0.01,
        logging_steps=100,
        eval_steps=500,
        save_steps=500,
        save_total_limit=3,
        eval_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        fp16=True,
        report_to="wandb",
    )
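
    # Stop training once eval loss fails to improve by at least the threshold
    # for early_stopping_patience consecutive evaluations.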
    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=args.early_stopping_patience,
        early_stopping_threshold=0.001,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=eval_tokenized,
        data_collator=data_collator,
        callbacks=[early_stopping],
    )

    print("Starting training with early stopping...")
    print()
    trainer.train()
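
    # With load_best_model_at_end=True, this saves the best checkpoint
    # (lowest eval loss), not the last one.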
    print(f"\nSaving best model to {args.output_dir}")
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    print("\n" + "=" * 80)
    print("Training completed!")
    print("=" * 80)
    print(f"Model saved to: {args.output_dir}")
    print("Format: JSON (80% valid expressions expected)")


if __name__ == "__main__":
    main()