# train.py -- uploaded by Pentium95 (commit 53aeb2a, verified)
import os

import torch
import trackio
from datasets import load_dataset
from peft import LoraConfig, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.trainer_utils import get_last_checkpoint
from trl import SFTTrainer, SFTConfig
# --- Run configuration ---------------------------------------------------
# Token budget per formatted example; used both by the dataset filter and as
# the trainer's max_length.
MAX_SEQ_LENGTH = 4096

# Local filesystem locations: base checkpoint, training data, adapter output.
model_name = "./SmolLM3-3B-Base/"
dataset_path = "./MathInstruct/MathInstruct.json"
output_dir = "./SmolLMathematician-3B"

# Experiment-tracking project; the run is opened here, before any heavy
# loading starts.
project_name = "SmolLMathematician-3B"
trackio.init(project=project_name)
# Load the base causal LM from the local checkpoint directory. The loading
# options are collected in one mapping so they are easy to scan and tweak.
model_load_options = {
    "device_map": "auto",
    "dtype": torch.bfloat16,
    "low_cpu_mem_usage": True,
    "trust_remote_code": True,
    "attn_implementation": "flash_attention_2",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)
# Load the tokenizer stored alongside the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# When the checkpoint defines no pad token, reuse EOS for padding. The model
# config is then pointed at the tokenizer's resulting pad id (always a single
# int) instead of copying model.config.eos_token_id, which may be a list or
# differ from the tokenizer's EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id
# Install the project's own chat template on the tokenizer. The file is read
# with an explicit UTF-8 encoding so the template is decoded the same way on
# every platform instead of depending on the locale default.
with open("chat_template.jinja", "r", encoding="utf-8") as f:
    chat_template = f.read()
tokenizer.chat_template = chat_template
# Enable activation checkpointing with the same non-reentrant mode that the
# SFTConfig below requests through gradient_checkpointing_kwargs, so the
# manually enabled setting and the trainer configuration agree.
model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs={"use_reentrant": False}
)
# Read the local JSON file through the generic "json" builder and take its
# "train" split as the raw dataset.
dataset = load_dataset(
    "json",
    data_files=dataset_path,
    split="train",
)
def formatInstructionWithTemplate(example: dict) -> str:
    """Render one dataset record as a single chat-formatted string.

    Args:
        example: Record providing an "instruction" entry (used as the user
            turn) and an "output" entry (used as the assistant turn).

    Returns:
        The untokenized text produced by the module-level tokenizer's chat
        template, without a trailing generation prompt.
    """
    user_turn = {"role": "user", "content": example["instruction"]}
    assistant_turn = {"role": "assistant", "content": example["output"]}
    return tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )
def checkSequenceLength(example: dict) -> bool:
    """Tell whether a record fits within MAX_SEQ_LENGTH tokens.

    The record is first rendered with formatInstructionWithTemplate and then
    tokenized with the module-level tokenizer; the length of the resulting
    "input_ids" is compared against the limit.

    Args:
        example: Dataset record accepted by formatInstructionWithTemplate.

    Returns:
        True when the token count is at most MAX_SEQ_LENGTH, else False.
    """
    rendered = formatInstructionWithTemplate(example)
    token_count = len(tokenizer(rendered)["input_ids"])
    return token_count <= MAX_SEQ_LENGTH
# Drop every record whose chat-formatted text exceeds the token budget, then
# report how many records were kept and removed.
original_size = len(dataset)
train_dataset = dataset.filter(checkSequenceLength)
new_size = len(train_dataset)
# The two counts are separated by " -> " so they no longer run together into
# one unreadable number.
print(f"Dataset: {original_size} -> {new_size} samples (removed: {original_size - new_size})")
torch.cuda.empty_cache()
# LoRA adapter definition for causal-LM fine-tuning: rank-16 adapters
# (alpha 32, dropout 0.1) on the q_proj and v_proj modules, with bias terms
# left out of the adapter.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)
# Supervised fine-tuning configuration, grouped by concern.
training_args = SFTConfig(
    # -- Output, checkpointing, resumption --
    output_dir=output_dir,
    save_strategy="steps",
    save_steps=32,
    save_total_limit=4,
    # NOTE(review): the TrainingArguments documentation describes this field
    # as not used by Trainer directly; resuming presumably also requires
    # passing a checkpoint to trainer.train() -- confirm.
    resume_from_checkpoint=True,
    # -- Schedule and batch shape (2 rows x 8 accumulation steps per device) --
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    # -- Optimizer --
    optim="paged_adamw_8bit",
    learning_rate=2e-5,
    weight_decay=0.01,
    adam_epsilon=1e-6,
    max_grad_norm=1.0,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # -- Logging and evaluation --
    logging_steps=8,
    eval_strategy="no",
    report_to="trackio",
    # -- Precision, sequence handling, memory --
    bf16=True,
    packing=True,
    max_length=MAX_SEQ_LENGTH,
    dataloader_pin_memory=False,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)
# Build the SFT trainer. The tokenizer configured above (custom chat template
# and pad token) is passed explicitly as the processing class, so the trainer
# tokenizes and saves with that exact tokenizer rather than loading a fresh
# one from the base checkpoint.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
    formatting_func=formatInstructionWithTemplate,
)
torch.cuda.empty_cache()

# Honor the resume request from the training configuration: the flag on the
# config alone does not make the trainer resume, so the newest checkpoint in
# output_dir (if any) is resolved here and handed to train(). A missing
# directory or an empty one yields None, which starts a fresh run.
resume_checkpoint = None
if training_args.resume_from_checkpoint and os.path.isdir(output_dir):
    resume_checkpoint = get_last_checkpoint(output_dir)
trainer.train(resume_from_checkpoint=resume_checkpoint)
torch.cuda.empty_cache()

# Persist the trained adapter and close the tracking run.
trainer.save_model(output_dir)
print(f"LoRA adapter saved to {output_dir}")
trackio.finish()