# File size: 6,607 Bytes
#
# 76edade

# This script is designed to fine-tune the Qwen/Qwen2.5-Coder-3B-Instruct model
# using Q-LoRA on a custom dataset to create the "Syntax Copilot" model.
#
# Before running, make sure you have the necessary packages installed:
# pip install torch transformers datasets peft trl bitsandbytes accelerate
#
# Make sure you are logged into Hugging Face CLI to download the model:
# huggingface-cli login

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# 1. Configuration
# The base model to download from Hugging Face
base_model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"
# The dataset file (JSON Lines), loaded with the `json` dataset builder
dataset_name = "corrected_syntax_dataset.jsonl"
# The name for the fine-tuned adapter model
adapter_model_name = "Syntax-Copilot-adapter"
# The name for the final merged model, which we will call "Syntax Copilot"
final_model_name = "Syntax-Copilot"

# 2. Q-LoRA (Quantization and Low-Rank Adaptation) Configuration
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# 3. Precision and BitsAndBytes Configuration for 4-bit Quantization
# Use bf16 for better performance on modern GPUs (e.g., Ampere series).
# Defined here (rather than with the training arguments below) because the
# quantization compute dtype is derived from it.
bf16 = True
use_4bit = True
# Keep the 4-bit compute dtype in step with the training precision so the
# dequantized matmuls and the rest of the forward pass use the same dtype.
# The value is the name of a `torch` dtype attribute (looked up via getattr).
bnb_4bit_compute_dtype = "bfloat16" if bf16 else "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# 4. Training Arguments
# (the `bf16` flag used by the training arguments is defined in section 3)
output_dir = "./training_results"
num_train_epochs = 1
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
# -1 means "no step limit": train for `num_train_epochs` instead
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 50
logging_steps = 10

# 5. SFTTrainer (Supervised Fine-tuning Trainer) Configuration
max_seq_length = 1024  # Set a reasonable max sequence length
packing = False
# The empty-string key covers the whole model, so everything is pinned to
# GPU 0; this is an explicit placement, not automatic device selection.
device_map = {"": 0}

# --- Script Execution ---

def main():
    """Fine-tune the base model with Q-LoRA, then merge and save the result.

    The pipeline, in order:
      1. Load the JSONL dataset named by ``dataset_name``.
      2. Load the 4-bit quantized base model and its tokenizer.
      3. Render each example's ``messages`` list into a ``text`` string with
         the tokenizer's chat template.
      4. Train a LoRA adapter with ``SFTTrainer`` and save it to
         ``adapter_model_name``.
      5. Reload the base model in half precision, merge the adapter into it,
         and save the merged model and tokenizer to ``final_model_name``.

    All settings are read from the module-level configuration constants.

    Raises:
        ValueError: If the dataset has no ``messages`` column.
    """
    # Local import: only needed to release memory before the merge step.
    import gc

    # Step 1: Load the dataset from the JSONL file
    print("Loading dataset...")
    dataset = load_dataset('json', data_files=dataset_name, split="train")
    print(f"Dataset loaded with {len(dataset)} examples.")

    # Fail early with a clear message instead of a KeyError inside `map`.
    if "messages" not in dataset.column_names:
        raise ValueError(
            f"Dataset '{dataset_name}' must contain a 'messages' column; "
            f"found columns: {dataset.column_names}"
        )

    # Step 2: Load the model and tokenizer
    print(f"Loading base model '{base_model_name}'...")

    # The config stores the dtype by name; resolve it to the torch dtype object.
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )

    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        trust_remote_code=True
    )
    # The KV cache is only useful for generation, so it is switched off here.
    model.config.use_cache = False
    # NOTE(review): `pretraining_tp` looks like a Llama-specific config field;
    # presumably it has no effect on this model - confirm before removing.
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    # Only fall back to EOS when the tokenizer ships without a pad token.
    # Overwriting an existing pad token with EOS makes the two share an id, and
    # a collator that masks padding positions out of the loss would then also
    # mask the end-of-turn tokens, so the model would not learn when to stop.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Step 3: Preprocess the dataset
    def format_chat_template(example):
        """Render one example's ``messages`` list into a single ``text`` string."""
        # The 'messages' field contains a list of dictionaries (e.g., [{"role": "user", "content": "..."}])
        # We apply the tokenizer's chat template to format this into a single string
        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

    print("Formatting dataset with chat template...")
    formatted_dataset = dataset.map(format_chat_template)
    print("Dataset formatted.")

    # Step 4: Configure PEFT for LoRA
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        # Adapt both the attention projections and the MLP projections.
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ],
    )

    # Step 5: Set up Training Arguments
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # Previously configured at module level but never passed on, so the
        # setting was silently ignored.
        gradient_checkpointing=gradient_checkpointing,
        optim=optim,
        save_steps=save_steps,
        logging_steps=logging_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        fp16=False,
        bf16=bf16,
        max_grad_norm=max_grad_norm,
        max_steps=max_steps,
        warmup_ratio=warmup_ratio,
        group_by_length=group_by_length,
        lr_scheduler_type=lr_scheduler_type,
        report_to="tensorboard"
    )

    # Step 6: Initialize the SFTTrainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=formatted_dataset,
        peft_config=peft_config,
        dataset_text_field="text",  # Use the 'text' field created during preprocessing
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=packing,
    )

    # Step 7: Train the model
    print("Starting model training...")
    trainer.train()
    print("Training complete.")

    # Step 8: Save the fine-tuned adapter model
    print(f"Saving fine-tuned adapter model to '{adapter_model_name}'...")
    trainer.model.save_pretrained(adapter_model_name)
    print("Adapter model saved.")

    # The adapter is on disk now; drop the quantized model and trainer so the
    # half-precision copy loaded below does not have to share GPU memory with them.
    del trainer, model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Step 9: Merge the adapter with the base model and save
    print("Merging the base model with the adapter to create the final model...")

    # Reload the base model unquantized for merging, in the same half-precision
    # format that was used for training.
    merge_dtype = torch.bfloat16 if bf16 else torch.float16
    base_model_for_merging = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=merge_dtype,
        device_map=device_map,
        trust_remote_code=True
    )

    # Load the PEFT model with the adapter weights
    merged_model = PeftModel.from_pretrained(base_model_for_merging, adapter_model_name)
    # Merge the adapter into the base model
    merged_model = merged_model.merge_and_unload()
    print("Model merged.")

    # Save the final merged model and tokenizer
    print(f"Saving final merged model to '{final_model_name}'...")
    merged_model.save_pretrained(final_model_name, safe_serialization=True)
    tokenizer.save_pretrained(final_model_name)
    print(f"Final model '{final_model_name}' saved successfully.")

    print("\n--- Fine-tuning process complete ---")
    print(f"LoRA adapter model is in: '{adapter_model_name}'")
    print(f"Final merged model is in: '{final_model_name}'")

# Run the fine-tuning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()