#!/usr/bin/env python3
"""
Fox1.3 Training Script
LoRA fine-tuning on Qwen2.5-1B-Instruct with CodeAlpaca dataset
"""
import os
import torch
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Config
MODEL_NAME = "Qwen/Qwen2.5-1B-Instruct"
DATASET_NAME = "HuggingFaceH4/CodeAlpaca_20K"
OUTPUT_DIR = "./fox1.3-checkpoints"
REPO_NAME = "teolm30/fox1.3"
NUM_EPOCHS = 3
BATCH_SIZE = 2
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 2048
def load_tokenizer():
logger.info(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
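    # Use the EOS token for padding; the data collator later masks pad positions in the labels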
tokenizer.pad_token = tokenizer.eos_token
return tokenizer
def load_model(tokenizer):
logger.info(f"Loading model: {MODEL_NAME}")
# Quantization config for memory efficiency
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True
)
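    # Freeze the base model, upcast remaining fp16 params to fp32, and enable gradient checkpointing for stable k-bit training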
model = prepare_model_for_kbit_training(model)
# LoRA config
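    # Rank-8 adapters on the attention projections only; the MLP layers are left frozen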
lora_config = LoraConfig(
r=8,
lora_alpha=16,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
return model
def format_instruction(example):
"""Format dataset example for instruction tuning."""
instruction = example.get("instruction", "")
input_text = example.get("input", "")
output = example.get("output", "")
if input_text:
text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
else:
text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
return {"text": text}
def tokenize(example, tokenizer, max_length):
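    """Tokenize the formatted text; labels are a copy of input_ids for causal-LM loss."""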
result = tokenizer(
example["text"],
truncation=True,
max_length=max_length,
padding="max_length"
)
result["labels"] = result["input_ids"].copy()
return result
def main():
logger.info("Starting Fox1.3 training pipeline...")
# Load tokenizer and model
tokenizer = load_tokenizer()
model = load_model(tokenizer)
# Load and format dataset
logger.info(f"Loading dataset: {DATASET_NAME}")
dataset = load_dataset(DATASET_NAME, split="train")
# Format instructions
dataset = dataset.map(format_instruction, remove_columns=dataset.column_names)
# Tokenize
dataset = dataset.map(
lambda x: tokenize(x, tokenizer, MAX_SEQ_LENGTH),
batched=True,
remove_columns=["text"]
)
# Split for eval
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
logger.info(f"Train size: {len(train_dataset)}, Eval size: {len(eval_dataset)}")
# Training args
training_args = TrainingArguments(
output_dir=OUTPUT_DIR,
num_train_epochs=NUM_EPOCHS,
per_device_train_batch_size=BATCH_SIZE,
per_device_eval_batch_size=BATCH_SIZE,
learning_rate=LEARNING_RATE,
warmup_steps=100,
logging_steps=50,
eval_strategy="epoch",
save_strategy="epoch",
save_total_limit=2,
bf16=True,
tf32=True,
optim="paged_adamw_8bit",
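        # paged_adamw_8bit (bitsandbytes) keeps optimizer state in 8-bit and pages it to CPU under memory pressure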
group_by_length=True,
report_to="none",
push_to_hub=True,
hub_model_id=REPO_NAME,
)
# Data collator
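    # With mlm=False the collator clones input_ids as labels and sets pad-token positions to -100,
    # overriding the labels created in tokenize()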
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False
)
# Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
data_collator=data_collator,
)
logger.info("Starting training...")
trainer.train()
logger.info("Training complete! Saving and pushing to hub...")
trainer.push_to_hub()
logger.info(f"Done! Model pushed to https://huggingface.co/{REPO_NAME}")
if __name__ == "__main__":
main()
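# Typical invocation (assumes a CUDA GPU, the transformers/peft/bitsandbytes stack installed,
# and `huggingface-cli login` so push_to_hub can write to the Hub):
#   python train.py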