# starcoder-training/train.py
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from datasets import load_dataset
import torch

def tokenize_function(examples):
    # Tokenize each code sample; the collator below builds the labels for
    # causal LM training, so only input_ids and attention_mask are needed.
    # return_tensors is intentionally omitted: datasets.map expects lists,
    # not tensors, when batched=True.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )

# Initialize model and tokenizer
model_name = "bigcode/starcoder2-15b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The tokenizer may not define a pad token; padding="max_length" above
# requires one, so fall back to the EOS token if needed.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # bfloat16 halves memory vs. fp32
    device_map="auto",           # shard across available devices automatically
)

# Load and preprocess dataset
dataset = load_dataset("officialweaver/code")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,  # keep only tokenizer outputs
)
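
# The Trainer below expects a "validation" split. If the dataset on the Hub
# only ships "train" (an assumption -- the splits of officialweaver/code are
# not documented here), derive one by holding out a small fraction:
if "validation" not in tokenized_dataset:
    split = tokenized_dataset["train"].train_test_split(test_size=0.05, seed=42)
    tokenized_dataset["train"] = split["train"]
    tokenized_dataset["validation"] = split["test"]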

# Training arguments
training_args = TrainingArguments(
    output_dir="./starcoder-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    learning_rate=5e-5,
    bf16=True,  # match the bfloat16 weights loaded above; fp16 would conflict
    gradient_accumulation_steps=4,  # effective batch size per device = 4 * 4
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # causal LM: the collator copies input_ids into labels
    ),
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./starcoder-finetuned-final")
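# Save the tokenizer alongside the weights so the output directory can be
# reloaded on its own with from_pretrained (standard practice; not part of
# the original script).
tokenizer.save_pretrained("./starcoder-finetuned-final")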