"""
Fox1.3 Training Script

LoRA fine-tuning of Qwen/Qwen2.5-1.5B-Instruct (4-bit QLoRA) on the
CodeAlpaca dataset, pushing the resulting adapter to the Hugging Face Hub.
"""

import logging

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"  # Qwen2.5 has no 1B variant; 1.5B is the closest size
DATASET_NAME = "HuggingFaceH4/CodeAlpaca_20K"
OUTPUT_DIR = "./fox1.3-checkpoints"
REPO_NAME = "teolm30/fox1.3"
NUM_EPOCHS = 3
BATCH_SIZE = 2
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 2048
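# Note: batch size 2 with a 4-bit base model is sized to fit a single small
# GPU; on larger cards, raise BATCH_SIZE or add gradient_accumulation_steps.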


def load_tokenizer():
    logger.info(f"Loading tokenizer: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    # Make sure a pad token is set; reuse EOS so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def load_model(tokenizer):
    logger.info(f"Loading model: {MODEL_NAME}")

    # 4-bit NF4 quantization with nested (double) quantization, QLoRA-style.
    # The compute dtype matches the bf16 training setting below.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )

    # Casts norms/embeddings to fp32 and enables gradient checkpointing,
    # which stabilizes training on a quantized base model.
    model = prepare_model_for_kbit_training(model)

    # Low-rank adapters on the attention projections.
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    return model
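

# Note: target_modules above covers only the attention projections; adding
# "gate_proj", "up_proj", and "down_proj" would also adapt Qwen's MLP blocks,
# at the cost of more trainable parameters.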


def format_instruction(example):
    """Format a dataset example as an Alpaca-style instruction prompt."""
    instruction = example.get("instruction", "")
    input_text = example.get("input", "")
    output = example.get("output", "")

    if input_text:
        text = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
    else:
        text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"

    return {"text": text}
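
# For example, a record with no "input" field renders as:
#
#   ### Instruction:
#   Write a Python function that reverses a string.
#
#   ### Response:
#   def reverse(s): return s[::-1]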


def tokenize(example, tokenizer, max_length):
    # No padding here: the data collator below pads each batch dynamically
    # and derives the labels, so loss is not computed on pad positions.
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=max_length,
    )


def main():
    logger.info("Starting Fox1.3 training pipeline...")

    tokenizer = load_tokenizer()
    model = load_model(tokenizer)

    logger.info(f"Loading dataset: {DATASET_NAME}")
    dataset = load_dataset(DATASET_NAME, split="train")

    # Render each record as a single prompt/response string.
    dataset = dataset.map(format_instruction, remove_columns=dataset.column_names)

    dataset = dataset.map(
        lambda x: tokenize(x, tokenizer, MAX_SEQ_LENGTH),
        batched=True,
        remove_columns=["text"],
    )

    # Hold out 10% of the data for evaluation.
    dataset = dataset.train_test_split(test_size=0.1)
    train_dataset = dataset["train"]
    eval_dataset = dataset["test"]

    logger.info(f"Train size: {len(train_dataset)}, Eval size: {len(eval_dataset)}")

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        warmup_steps=100,
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        bf16=True,  # bf16 and tf32 assume an Ampere-or-newer NVIDIA GPU
        tf32=True,
        optim="paged_adamw_8bit",
        group_by_length=True,  # bucket similar lengths to cut padding waste
        report_to="none",
        push_to_hub=True,
        hub_model_id=REPO_NAME,
    )
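    # push_to_hub requires a Hugging Face login (e.g. `huggingface-cli login`
    # or an HF_TOKEN environment variable) with write access to REPO_NAME.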

    # Causal-LM collator: pads dynamically and builds labels from input_ids,
    # masking pad positions to -100.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        processing_class=tokenizer,  # so the tokenizer is saved and pushed with the adapter
    )

    logger.info("Starting training...")
    trainer.train()

    logger.info("Training complete! Saving and pushing to hub...")
    trainer.push_to_hub()

    logger.info(f"Done! Model pushed to https://huggingface.co/{REPO_NAME}")


if __name__ == "__main__":
    main()
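

# A minimal sketch (not run by this script) of loading the pushed adapter for
# inference later, assuming the push above produced a PEFT adapter repo:
#
#   from peft import PeftModel
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
#   model = PeftModel.from_pretrained(base, REPO_NAME)
#   tokenizer = AutoTokenizer.from_pretrained(REPO_NAME)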