# File size: 6,607 Bytes
# 76edade
# This script is designed to fine-tune the Qwen/Qwen2.5-Coder-3B-Instruct model
# using Q-LoRA on a custom dataset to create the "Syntax Copilot" model.
#
# Before running, make sure you have the necessary packages installed:
# pip install torch transformers datasets peft trl bitsandbytes accelerate
#
# Make sure you are logged into Hugging Face CLI to download the model:
# huggingface-cli login
import os
import torch
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TrainingArguments,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
# 1. Configuration
# Base model to download from the Hugging Face Hub.
base_model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"
# JSONL dataset file; each record is read through its "messages" field.
dataset_name = "corrected_syntax_dataset.jsonl"
# Directory name for the fine-tuned LoRA adapter.
adapter_model_name = "Syntax-Copilot-adapter"
# Directory name for the final merged model, which we call "Syntax Copilot".
final_model_name = "Syntax-Copilot"

# 2. Q-LoRA (Quantization and Low-Rank Adaptation) Configuration
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# 3. BitsAndBytes Configuration for 4-bit Quantization
use_4bit = True
# Name of a torch dtype (resolved with getattr(torch, ...)). Kept in line with
# the bf16 training flag below so the 4-bit layers compute in the same
# precision the trainer runs in, instead of mixing fp16 compute with bf16.
bnb_4bit_compute_dtype = "bfloat16"
bnb_4bit_quant_type = "nf4"
# Passed as bnb_4bit_use_double_quant (nested quantization).
use_nested_quant = False

# 4. Training Arguments
output_dir = "./training_results"
num_train_epochs = 1
# Use bf16 for better performance on modern GPUs (e.g., Ampere series)
bf16 = True
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
# -1 leaves the run length to num_train_epochs.
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 50
logging_steps = 10

# 5. SFTTrainer (Supervised Fine-tuning Trainer) Configuration
max_seq_length = 1024  # Set a reasonable max sequence length
packing = False
# Pin the whole model (the "" root module) to device index 0. This is an
# explicit placement, not automatic device selection.
device_map = {"": 0}
# --- Script Execution ---
def main():
    """Fine-tune the base model with Q-LoRA, then merge and save the result.

    Reads all settings from the module-level configuration. The steps are:
    load the JSONL dataset, load the 4-bit quantized base model and its
    tokenizer, render each example's "messages" with the chat template,
    train a LoRA adapter with SFTTrainer, save the adapter, then reload the
    base model in float16, merge the adapter into it and save the merged
    model together with the tokenizer.

    Returns:
        None. Results are written to ``adapter_model_name`` and
        ``final_model_name``; checkpoints go to ``output_dir``.
    """
    import gc  # Local import: only used to release memory before the merge.

    # Step 1: Load the dataset from the JSONL file
    print("Loading dataset...")
    dataset = load_dataset("json", data_files=dataset_name, split="train")
    print(f"Dataset loaded with {len(dataset)} examples.")

    # Step 2: Load the model and tokenizer
    print(f"Loading base model '{base_model_name}'...")
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        trust_remote_code=True,
    )
    # The generation KV cache is not needed for training.
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    # Only fall back to EOS when the tokenizer has no pad token of its own.
    # Overwriting an existing pad token with EOS makes padding and the real
    # end-of-sequence token indistinguishable, so a collator that masks
    # padding out of the labels would also mask the EOS the model must learn.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Step 3: Preprocess the dataset
    def format_chat_template(example):
        """Render one example's "messages" list into a single "text" string."""
        # The 'messages' field contains a list of dictionaries
        # (e.g., [{"role": "user", "content": "..."}]). We apply the
        # tokenizer's chat template to format this into a single string.
        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

    print("Formatting dataset with chat template...")
    formatted_dataset = dataset.map(format_chat_template)
    print("Dataset formatted.")

    # Step 4: Configure PEFT for LoRA
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    )

    # Step 5: Set up Training Arguments
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # Previously configured at module level but never forwarded, so the
        # setting had no effect.
        gradient_checkpointing=gradient_checkpointing,
        optim=optim,
        save_steps=save_steps,
        logging_steps=logging_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        fp16=False,
        bf16=bf16,
        max_grad_norm=max_grad_norm,
        max_steps=max_steps,
        warmup_ratio=warmup_ratio,
        group_by_length=group_by_length,
        lr_scheduler_type=lr_scheduler_type,
        report_to="tensorboard",
    )

    # Step 6: Initialize the SFTTrainer
    # NOTE(review): newer TRL releases expect dataset_text_field,
    # max_seq_length and packing on SFTConfig rather than on the trainer;
    # confirm against the installed TRL version.
    trainer = SFTTrainer(
        model=model,
        train_dataset=formatted_dataset,
        peft_config=peft_config,
        dataset_text_field="text",  # Use the 'text' field created during preprocessing
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=packing,
    )

    # Step 7: Train the model
    print("Starting model training...")
    trainer.train()
    print("Training complete.")

    # Step 8: Save the fine-tuned adapter model
    print(f"Saving fine-tuned adapter model to '{adapter_model_name}'...")
    trainer.model.save_pretrained(adapter_model_name)
    print("Adapter model saved.")

    # The adapter is on disk, so drop the quantized training model before a
    # second copy of the base model is loaded onto the same device.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

    # Step 9: Merge the adapter with the base model and save
    print("Merging the base model with the adapter to create the final model...")
    # Reload the base model unquantized in float16 for merging
    base_model_for_merging = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map=device_map,
        trust_remote_code=True,
    )
    # Load the PEFT model with the adapter weights
    merged_model = PeftModel.from_pretrained(base_model_for_merging, adapter_model_name)
    # Merge the adapter into the base model
    merged_model = merged_model.merge_and_unload()
    print("Model merged.")

    # Save the final merged model and tokenizer
    print(f"Saving final merged model to '{final_model_name}'...")
    merged_model.save_pretrained(final_model_name, safe_serialization=True)
    tokenizer.save_pretrained(final_model_name)
    print(f"Final model '{final_model_name}' saved successfully.")

    print("\n--- Fine-tuning process complete ---")
    print(f"LoRA adapter model is in: '{adapter_model_name}'")
    print(f"Final merged model is in: '{final_model_name}'")
# Run the fine-tuning pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()