#!/usr/bin/env python3
"""
Fox1.3 Training Script
LoRA fine-tuning on Qwen2.5-1B-Instruct with CodeAlpaca dataset
"""

import logging

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Config
MODEL_NAME = "Qwen/Qwen2.5-1B-Instruct"
DATASET_NAME = "HuggingFaceH4/CodeAlpaca_20K"
OUTPUT_DIR = "./fox1.3-checkpoints"
REPO_NAME = "teolm30/fox1.3"
NUM_EPOCHS = 3
BATCH_SIZE = 2
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 2048


def load_tokenizer():
    logger.info(f"Loading tokenizer: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    # Qwen tokenizers ship without a pad token; reuse EOS for padding
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def load_model():
    logger.info(f"Loading model: {MODEL_NAME}")

    # 4-bit NF4 quantization config for memory efficiency
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
    model = prepare_model_for_kbit_training(model)

    # LoRA config: adapt only the attention projections
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    return model


def format_instruction(example):
    """Format a dataset example as an Alpaca-style instruction prompt."""
    instruction = example.get("instruction", "")
    input_text = example.get("input", "")
    output = example.get("output", "")

    if input_text:
        text = (
            f"### Instruction:\n{instruction}\n\n"
            f"### Input:\n{input_text}\n\n"
            f"### Response:\n{output}"
        )
    else:
        text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
    return {"text": text}


def tokenize(example, tokenizer, max_length):
    result = tokenizer(
        example["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    # Causal LM: labels start as a copy of the input ids
    result["labels"] = result["input_ids"].copy()
    return result


def main():
    logger.info("Starting Fox1.3 training pipeline...")

    # Load tokenizer and model
    tokenizer = load_tokenizer()
    model = load_model()

    # Load and format dataset
    logger.info(f"Loading dataset: {DATASET_NAME}")
    dataset = load_dataset(DATASET_NAME, split="train")

    # Format instructions
    dataset = dataset.map(format_instruction, remove_columns=dataset.column_names)

    # Tokenize
    dataset = dataset.map(
        lambda x: tokenize(x, tokenizer, MAX_SEQ_LENGTH),
        batched=True,
        remove_columns=["text"],
    )

    # Split for eval
    dataset = dataset.train_test_split(test_size=0.1)
    train_dataset = dataset["train"]
    eval_dataset = dataset["test"]
    logger.info(f"Train size: {len(train_dataset)}, Eval size: {len(eval_dataset)}")

    # Training args
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        warmup_steps=100,
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        bf16=True,
        tf32=True,
        optim="paged_adamw_8bit",
        group_by_length=True,
        report_to="none",
        push_to_hub=True,
        hub_model_id=REPO_NAME,
    )

    # Data collator (causal LM, no masked-LM objective)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    logger.info("Starting training...")
    trainer.train()

    logger.info("Training complete! Saving and pushing to hub...")
    trainer.push_to_hub()
    logger.info(f"Done! Model pushed to https://huggingface.co/{REPO_NAME}")


if __name__ == "__main__":
    main()
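

# --- Optional inference sketch (illustrative only; never called by this script) ---
# A minimal example of loading the pushed LoRA adapter back onto the base model and
# generating a completion in the same prompt format used for training. The function
# name and default prompt are hypothetical additions; it assumes the adapter was
# successfully pushed to REPO_NAME and that the base model is accessible locally
# or from the Hub.
def run_inference_example(prompt="Write a Python function that reverses a string."):
    from peft import PeftModel

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, device_map="auto", trust_remote_code=True
    )
    # Attach the trained LoRA adapter from the Hub repo
    model = PeftModel.from_pretrained(base_model, REPO_NAME)
    model.eval()

    # Reuse the training prompt template so the adapter sees familiar formatting
    text = f"### Instruction:\n{prompt}\n\n### Response:\n"
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)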