# Syntax-3B-Untrained / Syntax-Copilot-Project / create_syntax_copilot.py
# Uploaded by algorythmtechnologies ("Upload 2 files", commit 76edade, verified)
# This script is designed to fine-tune the Qwen/Qwen2.5-Coder-3B-Instruct model
# using Q-LoRA on a custom dataset to create the "Syntax Copilot" model.
#
# Before running, make sure you have the necessary packages installed:
# pip install torch transformers datasets peft trl bitsandbytes accelerate
#
# Make sure you are logged into Hugging Face CLI to download the model:
# huggingface-cli login
import os
import torch
from datasets import load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TrainingArguments,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
# 1. Configuration
# The base model to download from Hugging Face
base_model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"
# The dataset file: JSONL where each record has a "messages" list of
# {"role": ..., "content": ...} dicts (consumed by the chat template below)
dataset_name = "corrected_syntax_dataset.jsonl"
# Output directory for the trained LoRA adapter weights
adapter_model_name = "Syntax-Copilot-adapter"
# Output directory for the final merged (base + adapter) model
final_model_name = "Syntax-Copilot"
# 2. Q-LoRA (Quantization and Low-Rank Adaptation) Configuration
lora_r = 64          # LoRA rank (dimension of the low-rank update matrices)
lora_alpha = 16      # LoRA scaling factor
lora_dropout = 0.1   # Dropout applied to the LoRA layers during training
# 3. BitsAndBytes Configuration for 4-bit Quantization
use_4bit = True
# NOTE(review): compute dtype is float16 while training runs with bf16=True
# below — consider "bfloat16" here for consistency on Ampere+ GPUs; confirm
# against the target hardware before changing.
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"      # NF4 quantization (QLoRA default)
use_nested_quant = False         # double quantization disabled
# 4. Training Arguments
output_dir = "./training_results"
num_train_epochs = 1
# Use bf16 for better performance on modern GPUs (e.g., Ampere series)
bf16 = True
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
# NOTE(review): make sure this flag is actually forwarded to
# TrainingArguments — declaring it here alone has no effect.
gradient_checkpointing = True
max_grad_norm = 0.3              # gradient clipping threshold
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"      # paged optimizer to avoid OOM spikes with QLoRA
lr_scheduler_type = "cosine"
max_steps = -1                   # -1: derive step count from num_train_epochs
warmup_ratio = 0.03
group_by_length = True           # bucket similar-length samples to reduce padding
save_steps = 50
logging_steps = 10
# 5. SFTTrainer (Supervised Fine-tuning Trainer) Configuration
max_seq_length = 1024 # Set a reasonable max sequence length
packing = False       # do not pack multiple examples into one sequence
device_map = {"": 0}  # Pin the entire model to GPU 0 (not automatic placement)
# --- Script Execution ---
def main():
    """Fine-tune the base model with Q-LoRA, then merge and save the result.

    Pipeline:
      1. Load the JSONL dataset.
      2. Load the 4-bit quantized base model and its tokenizer.
      3. Render each example's ``messages`` list into a single training
         string with the tokenizer's chat template.
      4. Train a LoRA adapter with ``SFTTrainer`` and save it.
      5. Reload the base model in float16, merge the adapter into it, and
         save the merged model plus tokenizer.

    Side effects: downloads the base model, writes checkpoints to
    ``output_dir``, and writes ``adapter_model_name`` / ``final_model_name``
    directories. Requires a CUDA GPU (``device_map`` pins everything to GPU 0).
    """
    # Step 1: Load the dataset from the JSONL file
    print("Loading dataset...")
    dataset = load_dataset('json', data_files=dataset_name, split="train")
    print(f"Dataset loaded with {len(dataset)} examples.")

    # Step 2: Load the model and tokenizer
    print(f"Loading base model '{base_model_name}'...")
    # Resolve the dtype string (e.g. "float16") to the actual torch dtype.
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        trust_remote_code=True,
    )
    # Disable the KV cache during training (incompatible with gradient
    # checkpointing) and avoid tensor-parallel-related slow paths.
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    # No dedicated pad token: reuse EOS and pad on the right so padding
    # never precedes the prompt in a causal-LM batch.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Step 3: Preprocess the dataset
    def format_chat_template(example):
        # 'messages' is a list of {"role": ..., "content": ...} dicts;
        # render it into one training string via the model's chat template.
        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

    print("Formatting dataset with chat template...")
    formatted_dataset = dataset.map(format_chat_template)
    print("Dataset formatted.")

    # Step 4: Configure PEFT for LoRA — target all attention and MLP
    # projections of the Qwen2 architecture.
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    )

    # Step 5: Set up Training Arguments
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # BUG FIX: the module-level gradient_checkpointing flag was declared
        # but never forwarded, so checkpointing was silently disabled.
        gradient_checkpointing=gradient_checkpointing,
        optim=optim,
        save_steps=save_steps,
        logging_steps=logging_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        fp16=False,
        bf16=bf16,
        max_grad_norm=max_grad_norm,
        max_steps=max_steps,
        warmup_ratio=warmup_ratio,
        group_by_length=group_by_length,
        lr_scheduler_type=lr_scheduler_type,
        report_to="tensorboard",
    )

    # Step 6: Initialize the SFTTrainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=formatted_dataset,
        peft_config=peft_config,
        dataset_text_field="text",  # Use the 'text' field created during preprocessing
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=packing,
    )

    # Step 7: Train the model
    print("Starting model training...")
    trainer.train()
    print("Training complete.")

    # Step 8: Save the fine-tuned adapter model (adapter weights only)
    print(f"Saving fine-tuned adapter model to '{adapter_model_name}'...")
    trainer.model.save_pretrained(adapter_model_name)
    print("Adapter model saved.")

    # Step 9: Merge the adapter with the base model and save
    print("Merging the base model with the adapter to create the final model...")
    # Reload the base model unquantized (float16) — merging LoRA weights
    # into 4-bit quantized tensors is not supported.
    base_model_for_merging = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map=device_map,
        trust_remote_code=True,
    )
    # Load the adapter on top of the full-precision base, then fold the
    # LoRA deltas into the base weights and drop the adapter modules.
    merged_model = PeftModel.from_pretrained(base_model_for_merging, adapter_model_name)
    merged_model = merged_model.merge_and_unload()
    print("Model merged.")

    # Save the final merged model and tokenizer
    print(f"Saving final merged model to '{final_model_name}'...")
    merged_model.save_pretrained(final_model_name, safe_serialization=True)
    tokenizer.save_pretrained(final_model_name)
    print(f"Final model '{final_model_name}' saved successfully.")

    print("\n--- Fine-tuning process complete ---")
    print(f"LoRA adapter model is in: '{adapter_model_name}'")
    print(f"Final merged model is in: '{final_model_name}'")


if __name__ == "__main__":
    main()