import os import sys import sqlite3 import pandas as pd import torch from datasets import load_dataset from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, TrainerCallback ) from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training from trl import SFTTrainer, SFTConfig class VRAMLoggerCallback(TrainerCallback): """Callback to print VRAM usage during training to monitor resource consumption.""" def on_step_end(self, args, state, control, **kwargs): if torch.cuda.is_available(): allocated = torch.cuda.memory_allocated(0) / (1024 ** 3) reserved = torch.cuda.memory_reserved(0) / (1024 ** 3) print(f" - [Step {state.global_step}] VRAM Allocated: {allocated:.2f} GB | Reserved: {reserved:.2f} GB") def train(): print("=" * 60) print("Starting Phi-3 Text-to-SQL QLoRA Fine-Tuning") print("=" * 60) # 1. Device Configuration and Hardware Checks if not torch.cuda.is_available(): print("WARNING: CUDA is NOT available. Running on CPU is extremely slow and NOT recommended.") device = "cpu" use_bf16 = False use_fp16 = False else: device = "cuda" gpu_name = torch.cuda.get_device_name(0) print(f"CUDA Device Detected: {gpu_name}") # RTX 40-series (Ada Lovelace) natively supports bfloat16 use_bf16 = torch.cuda.is_bf16_supported() use_fp16 = not use_bf16 print(f"bfloat16 Supported: {use_bf16} | float16 Fallback: {use_fp16}") # 2. Paths dataset_dir = "data" train_file = os.path.join(dataset_dir, "train_dataset.jsonl") val_file = os.path.join(dataset_dir, "test_dataset.jsonl") output_dir = "models/phi3-text-to-sql" adapter_dir = "models/phi3-text-to-sql-adapter" if not os.path.exists(train_file) or not os.path.exists(val_file): print("ERROR: Dataset files not found. Run dataset.py first.") sys.exit(1) # 3. Quantization Config (Crucial for 6GB VRAM) print("Configuring 4-bit Quantization (QLoRA)...") bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16 if use_bf16 else torch.float16 ) # 4. Load Model and Tokenizer model_id = "microsoft/Phi-3-mini-4k-instruct" print(f"Loading base model: {model_id}...") model = AutoModelForCausalLM.from_pretrained( model_id, quantization_config=bnb_config, device_map="auto", trust_remote_code=False, attn_implementation="eager" ) print("Loading tokenizer...") tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=False) # Configure padding tokens for Phi-3 (which uses EOS for padding natively in standard chat SFT) tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "right" # 5. Prepare Model for Peft Training print("Preparing model for k-bit training and enabling gradient checkpointing...") model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) # 6. Configure LoRA # We target all linear layers to maximize standard model adaptation print("Configuring LoRA parameters...") peft_config = LoraConfig( r=8, # Rank of adapter (8 is memory-efficient and very effective for code syntax) lora_alpha=16, target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj" ], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM" ) model = get_peft_model(model, peft_config) model.print_trainable_parameters() # 7. Load Datasets print("Loading formatted training datasets...") dataset = load_dataset("json", data_files={ "train": train_file, "validation": val_file }) # 8. Training Arguments # We optimize heavily for memory: batch_size=1, gradient_accumulation=4 (eff. batch size 4), paged_adamw optimizer print("Setting up memory-optimized SFT training arguments...") training_args = SFTConfig( output_dir=output_dir, num_train_epochs=3, per_device_train_batch_size=1, gradient_accumulation_steps=4, learning_rate=2e-4, weight_decay=0.01, lr_scheduler_type="cosine", warmup_ratio=0.05, logging_steps=1, eval_strategy="steps", eval_steps=5, save_strategy="steps", save_steps=10, save_total_limit=2, optim="paged_adamw_8bit" if device == "cuda" else "adamw_torch", bf16=use_bf16, fp16=use_fp16, gradient_checkpointing=True, max_grad_norm=0.3, report_to="none", ddp_find_unused_parameters=False, remove_unused_columns=False, # Crucial for TRL chat template processing max_length=512 # SFTConfig uses max_length instead of max_seq_length inside SFTTrainer constructor ) # 9. Initialize SFTTrainer # SFTTrainer natively formats 'messages' using the model's chat template print("Initializing SFTTrainer...") trainer = SFTTrainer( model=model, train_dataset=dataset["train"], eval_dataset=dataset["validation"], peft_config=None, processing_class=tokenizer, args=training_args, callbacks=[VRAMLoggerCallback()] if device == "cuda" else [] ) # 10. Execute Fine-Tuning print("Launching Fine-Tuning loop...") trainer.train() # 11. Save the Fine-Tuned Adapter print(f"Fine-tuning complete! Saving adapter to {adapter_dir}...") trainer.model.save_pretrained(adapter_dir) tokenizer.save_pretrained(adapter_dir) print("Adapter saved successfully. Ready for evaluation and deployment!") if __name__ == "__main__": train()