# This script is designed to fine-tune the Qwen/Qwen2.5-Coder-3B-Instruct model
# using Q-LoRA on a custom dataset to create the "Syntax Copilot" model.
#
# Before running, make sure you have the necessary packages installed:
#   pip install torch transformers datasets peft trl bitsandbytes accelerate
#
# Make sure you are logged into Hugging Face CLI to download the model:
#   huggingface-cli login

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# 1. Configuration
# The base model to download from Hugging Face
base_model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"
# The dataset file (JSONL with a 'messages' chat field per example)
dataset_name = "corrected_syntax_dataset.jsonl"
# The name for the fine-tuned adapter model
adapter_model_name = "Syntax-Copilot-adapter"
# The name for the final merged model, which we will call "Syntax Copilot"
final_model_name = "Syntax-Copilot"

# 2. Q-LoRA (Quantization and Low-Rank Adaptation) Configuration
lora_r = 64          # LoRA rank
lora_alpha = 16      # LoRA scaling factor
lora_dropout = 0.1   # dropout applied to LoRA layers

# 3. BitsAndBytes Configuration for 4-bit Quantization
use_4bit = True
# FIX: was "float16", which conflicted with bf16=True below. The 4-bit
# compute dtype should match the training precision (bfloat16) to avoid
# mixed-precision instability during QLoRA training.
bnb_4bit_compute_dtype = "bfloat16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# 4. Training Arguments
output_dir = "./training_results"
num_train_epochs = 1
# Use bf16 for better performance on modern GPUs (e.g., Ampere series)
bf16 = True
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1        # -1 means epochs, not a fixed step count, control training length
warmup_ratio = 0.03
group_by_length = True
save_steps = 50
logging_steps = 10
# 5. SFTTrainer (Supervised Fine-tuning Trainer) Configuration
max_seq_length = 1024  # Set a reasonable max sequence length
packing = False
device_map = {"": 0}  # Place the whole model on the first available GPU


# --- Script Execution ---
def main():
    """Run the full Q-LoRA pipeline: load data, quantize + train, save the
    adapter, then merge it back into the base model as 'Syntax Copilot'."""
    # Step 1: Load the dataset from the JSONL file
    print("Loading dataset...")
    dataset = load_dataset('json', data_files=dataset_name, split="train")
    print(f"Dataset loaded with {len(dataset)} examples.")

    # Step 2: Load the model and tokenizer
    print(f"Loading base model '{base_model_name}'...")
    # Resolve the configured dtype string (e.g. "bfloat16") to a torch dtype
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        trust_remote_code=True
    )
    # KV cache is incompatible with gradient checkpointing during training
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Step 3: Preprocess the dataset
    def format_chat_template(example):
        # The 'messages' field contains a list of dictionaries
        # (e.g., [{"role": "user", "content": "..."}]).
        # We apply the tokenizer's chat template to format this into a single string.
        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

    print("Formatting dataset with chat template...")
    formatted_dataset = dataset.map(format_chat_template)
    print("Dataset formatted.")

    # Step 4: Configure PEFT for LoRA — target all attention and MLP projections
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    )

    # Step 5: Set up Training Arguments
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # FIX: this flag was defined in the config section but never passed
        # through, so gradient checkpointing was silently disabled.
        gradient_checkpointing=gradient_checkpointing,
        optim=optim,
        save_steps=save_steps,
        logging_steps=logging_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        fp16=False,
        bf16=bf16,
        max_grad_norm=max_grad_norm,
        max_steps=max_steps,
        warmup_ratio=warmup_ratio,
        group_by_length=group_by_length,
        lr_scheduler_type=lr_scheduler_type,
        report_to="tensorboard"
    )

    # Step 6: Initialize the SFTTrainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=formatted_dataset,
        peft_config=peft_config,
        dataset_text_field="text",  # Use the 'text' field created during preprocessing
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=packing,
    )

    # Step 7: Train the model
    print("Starting model training...")
    trainer.train()
    print("Training complete.")

    # Step 8: Save the fine-tuned adapter model
    print(f"Saving fine-tuned adapter model to '{adapter_model_name}'...")
    trainer.model.save_pretrained(adapter_model_name)
    print("Adapter model saved.")

    # Step 9: Merge the adapter with the base model and save
    print("Merging the base model with the adapter to create the final model...")
    # Reload the base model in full precision (or float16) for merging —
    # the 4-bit quantized weights cannot be merged directly.
    base_model_for_merging = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map=device_map,
        trust_remote_code=True
    )
    # Load the PEFT model with the adapter weights, then fold them in
    merged_model = PeftModel.from_pretrained(base_model_for_merging, adapter_model_name)
    merged_model = merged_model.merge_and_unload()
    print("Model merged.")

    # Save the final merged model and tokenizer
    print(f"Saving final merged model to '{final_model_name}'...")
    merged_model.save_pretrained(final_model_name, safe_serialization=True)
    tokenizer.save_pretrained(final_model_name)
    print(f"Final model '{final_model_name}' saved successfully.")

    print("\n--- Fine-tuning process complete ---")
    print(f"LoRA adapter model is in: '{adapter_model_name}'")
    print(f"Final merged model is in: '{final_model_name}'")


if __name__ == "__main__":
    main()