# Syntax-3B-Untrained / Syntax-Copilot-Project / create_syntax_copilot.py
# Uploaded by algorythmtechnologies ("Upload 2 files", commit 76edade, verified)
# This script is designed to fine-tune the Qwen/Qwen2.5-Coder-3B-Instruct model
# using Q-LoRA on a custom dataset to create the "Syntax Copilot" model.
#
# Before running, make sure you have the necessary packages installed:
# pip install torch transformers datasets peft trl bitsandbytes accelerate
#
# Make sure you are logged into Hugging Face CLI to download the model:
# huggingface-cli login
import os
import torch
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TrainingArguments,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
# 1. Configuration
# The base model to download from Hugging Face
base_model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"
# The dataset file: JSONL where each record has a "messages" list of
# {"role": ..., "content": ...} dicts (consumed by the chat template below)
dataset_name = "corrected_syntax_dataset.jsonl"
# Output directory for the trained LoRA adapter weights
adapter_model_name = "Syntax-Copilot-adapter"
# Output directory for the final merged (base + adapter) model
final_model_name = "Syntax-Copilot"
# 2. Q-LoRA (Quantization and Low-Rank Adaptation) Configuration
lora_r = 64          # LoRA rank (dimension of the low-rank update matrices)
lora_alpha = 16      # LoRA scaling factor
lora_dropout = 0.1   # Dropout applied to the LoRA layers during training
# 3. BitsAndBytes Configuration for 4-bit Quantization
use_4bit = True
# NOTE(review): compute dtype is float16 while training runs with bf16=True
# below — consider "bfloat16" here for consistency on Ampere+ GPUs; confirm
# against the target hardware before changing.
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"      # NF4 quantization (QLoRA default)
use_nested_quant = False         # double quantization disabled
# 4. Training Arguments
output_dir = "./training_results"
num_train_epochs = 1
# Use bf16 for better performance on modern GPUs (e.g., Ampere series)
bf16 = True
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
# NOTE(review): make sure this flag is actually forwarded to
# TrainingArguments — declaring it here alone has no effect.
gradient_checkpointing = True
max_grad_norm = 0.3              # gradient clipping threshold
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"      # paged optimizer to avoid OOM spikes with QLoRA
lr_scheduler_type = "cosine"
max_steps = -1                   # -1: derive step count from num_train_epochs
warmup_ratio = 0.03
group_by_length = True           # bucket similar-length samples to reduce padding
save_steps = 50
logging_steps = 10
# 5. SFTTrainer (Supervised Fine-tuning Trainer) Configuration
max_seq_length = 1024 # Set a reasonable max sequence length
packing = False       # do not pack multiple examples into one sequence
device_map = {"": 0}  # Pin the entire model to GPU 0 (not automatic placement)
# --- Script Execution ---
def main():
    """Fine-tune the base model with Q-LoRA, then merge and save the result.

    Pipeline:
      1. Load the JSONL dataset.
      2. Load the 4-bit quantized base model and its tokenizer.
      3. Render each example's ``messages`` list into a single training
         string with the tokenizer's chat template.
      4. Train a LoRA adapter with ``SFTTrainer`` and save it.
      5. Reload the base model in float16, merge the adapter into it, and
         save the merged model plus tokenizer.

    Side effects: downloads the base model, writes checkpoints to
    ``output_dir``, and writes ``adapter_model_name`` / ``final_model_name``
    directories. Requires a CUDA GPU (``device_map`` pins everything to GPU 0).
    """
    # Step 1: Load the dataset from the JSONL file
    print("Loading dataset...")
    dataset = load_dataset('json', data_files=dataset_name, split="train")
    print(f"Dataset loaded with {len(dataset)} examples.")

    # Step 2: Load the model and tokenizer
    print(f"Loading base model '{base_model_name}'...")
    # Resolve the dtype string (e.g. "float16") to the actual torch dtype.
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        trust_remote_code=True,
    )
    # Disable the KV cache during training (incompatible with gradient
    # checkpointing) and avoid tensor-parallel-related slow paths.
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    # No dedicated pad token: reuse EOS and pad on the right so padding
    # never precedes the prompt in a causal-LM batch.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Step 3: Preprocess the dataset
    def format_chat_template(example):
        # 'messages' is a list of {"role": ..., "content": ...} dicts;
        # render it into one training string via the model's chat template.
        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

    print("Formatting dataset with chat template...")
    formatted_dataset = dataset.map(format_chat_template)
    print("Dataset formatted.")

    # Step 4: Configure PEFT for LoRA — target all attention and MLP
    # projections of the Qwen2 architecture.
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    )

    # Step 5: Set up Training Arguments
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # BUG FIX: the module-level gradient_checkpointing flag was declared
        # but never forwarded, so checkpointing was silently disabled.
        gradient_checkpointing=gradient_checkpointing,
        optim=optim,
        save_steps=save_steps,
        logging_steps=logging_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        fp16=False,
        bf16=bf16,
        max_grad_norm=max_grad_norm,
        max_steps=max_steps,
        warmup_ratio=warmup_ratio,
        group_by_length=group_by_length,
        lr_scheduler_type=lr_scheduler_type,
        report_to="tensorboard",
    )

    # Step 6: Initialize the SFTTrainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=formatted_dataset,
        peft_config=peft_config,
        dataset_text_field="text",  # Use the 'text' field created during preprocessing
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=packing,
    )

    # Step 7: Train the model
    print("Starting model training...")
    trainer.train()
    print("Training complete.")

    # Step 8: Save the fine-tuned adapter model (adapter weights only)
    print(f"Saving fine-tuned adapter model to '{adapter_model_name}'...")
    trainer.model.save_pretrained(adapter_model_name)
    print("Adapter model saved.")

    # Step 9: Merge the adapter with the base model and save
    print("Merging the base model with the adapter to create the final model...")
    # Reload the base model unquantized (float16) — merging LoRA weights
    # into 4-bit quantized tensors is not supported.
    base_model_for_merging = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map=device_map,
        trust_remote_code=True,
    )
    # Load the adapter on top of the full-precision base, then fold the
    # LoRA deltas into the base weights and drop the adapter modules.
    merged_model = PeftModel.from_pretrained(base_model_for_merging, adapter_model_name)
    merged_model = merged_model.merge_and_unload()
    print("Model merged.")

    # Save the final merged model and tokenizer
    print(f"Saving final merged model to '{final_model_name}'...")
    merged_model.save_pretrained(final_model_name, safe_serialization=True)
    tokenizer.save_pretrained(final_model_name)
    print(f"Final model '{final_model_name}' saved successfully.")

    print("\n--- Fine-tuning process complete ---")
    print(f"LoRA adapter model is in: '{adapter_model_name}'")
    print(f"Final merged model is in: '{final_model_name}'")


if __name__ == "__main__":
    main()