""" Local LoRA fine-tuning script for a small coding model. Quick start (Windows/Linux local): 1) pip install transformers datasets peft accelerate bitsandbytes huggingface_hub 2) python finetune_coding_llm_colab.py --dataset-size 8000 3) Optional upload: python finetune_coding_llm_colab.py --skip-train --upload --hf-repo your-user/your-model """ import argparse import json import os import random import torch from datasets import load_dataset from huggingface_hub import upload_folder from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments, ) DEFAULT_MODEL_NAME = "Qwen/Qwen2.5-Coder-0.5B-Instruct" DEFAULT_OUTPUT_DIR = "./model" DEFAULT_TRAIN_FILE = "train.json" HF_REPO_ID = "your-username/coding-llm-model" # Keep dataset size in the requested 5k-10k window. DATASET_SIZE = 8000 TEMPLATES = [ { "instruction": "Fix the Python code", "input": "def add(a,b) return a+b", "output": "def add(a, b): return a + b", "explanation": "Added missing colon and corrected syntax.", }, { "instruction": "Fix loop syntax", "input": "for i in range(5 print(i)", "output": "for i in range(5): print(i)", "explanation": "Added missing parenthesis and colon.", }, { "instruction": "Fix condition", "input": "if x = 10: print(x)", "output": "if x == 10: print(x)", "explanation": "Corrected assignment to comparison operator.", }, { "instruction": "Explain code", "input": "for i in range(3): print(i)", "output": "Prints numbers from 0 to 2.", "explanation": "Loop iterates from 0 to 2 and prints values.", }, { "instruction": "Write a Python function", "input": "Create a function to multiply two numbers", "output": "def multiply(a, b):\n return a * b", "explanation": "Defined a multiply function that returns the product of two inputs.", }, { "instruction": "Write a Python function", "input": "Create a function to add two numbers", "output": "def add(a, b):\n return a + b", "explanation": "Defined an add function that returns the sum of two inputs.", }, { "instruction": "Write a Python function", "input": "Create a function to subtract two numbers", "output": "def subtract(a, b):\n return a - b", "explanation": "Defined a subtract function that returns the difference between two inputs.", }, { "instruction": "Write a Python function", "input": "Create a function to divide two numbers", "output": "def divide(a, b):\n return a / b", "explanation": "Defined a divide function that returns the quotient of two inputs.", }, ] def format_training_text(template): target = { "code": template["output"], "explanation": template["explanation"], } return ( f"Instruction: {template['instruction']}\n" f"Input: {template['input']}\n" "Return only valid JSON with keys code and explanation.\n" f"JSON: {json.dumps(target, ensure_ascii=False)}\n" ) def generate_sample(): template = random.choice(TEMPLATES) text = format_training_text(template) return { "instruction": template["instruction"], "input": template["input"], "output": template["output"], "explanation": template["explanation"], "text": text, "confidence": round(random.uniform(0.9, 0.99), 2), "relevancy": round(random.uniform(0.85, 0.99), 2), } def build_dataset(train_file, size=DATASET_SIZE): dataset = [generate_sample() for _ in range(size)] with open(train_file, "w", encoding="utf-8") as f: json.dump(dataset, f, indent=2) print(f"Dataset created: {len(dataset)} samples -> {train_file}") def run_training( model_name, train_file, output_dir, epochs, batch_size, learning_rate, max_length, max_train_samples, use_4bit, ): if not os.path.exists(train_file): raise FileNotFoundError( f"Training file not found: {train_file}. Generate it with generate_dataset.py first." ) dataset = load_dataset("json", data_files=train_file) tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer.pad_token = tokenizer.eos_token def format_data(example): text = example.get("text") if not text: text = format_training_text(example) tokens = tokenizer( text, truncation=True, padding="max_length", max_length=max_length, ) tokens["labels"] = tokens["input_ids"].copy() return tokens tokenized = dataset.map( format_data, remove_columns=dataset["train"].column_names, desc="Tokenizing training dataset", ) if max_train_samples > 0: max_train_samples = min(max_train_samples, len(tokenized["train"])) tokenized["train"] = tokenized["train"].select(range(max_train_samples)) fp16_enabled = torch.cuda.is_available() quantize_4bit = use_4bit and torch.cuda.is_available() if use_4bit and not torch.cuda.is_available(): print("Warning: --use-4bit requested but CUDA not available. Falling back to standard loading.") if quantize_4bit: bnb_config = BitsAndBytesConfig(load_in_4bit=True) model = AutoModelForCausalLM.from_pretrained( model_name, quantization_config=bnb_config, device_map="auto", ) model = prepare_model_for_kbit_training(model) else: model = AutoModelForCausalLM.from_pretrained( model_name, device_map="auto" if torch.cuda.is_available() else None, ) lora_config = LoraConfig( r=8, lora_alpha=16, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM", ) model = get_peft_model(model, lora_config) training_args = TrainingArguments( output_dir=output_dir, per_device_train_batch_size=batch_size, num_train_epochs=epochs, gradient_accumulation_steps=2, logging_steps=10, save_steps=100, learning_rate=learning_rate, fp16=fp16_enabled, dataloader_pin_memory=torch.cuda.is_available(), report_to="none", ) trainer = Trainer( model=model, args=training_args, train_dataset=tokenized["train"], ) trainer.train() model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) print(f"Model and tokenizer saved to: {output_dir}") def upload_to_hf(repo_id, output_dir): if not os.path.exists(output_dir): raise FileNotFoundError( f"Model output folder not found: {output_dir}. Run training before upload." ) upload_folder( folder_path=output_dir, repo_id=repo_id, repo_type="model", ) print(f"Uploaded to Hugging Face repo: {repo_id}") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--dataset-size", type=int, default=DATASET_SIZE) parser.add_argument("--train-file", type=str, default=DEFAULT_TRAIN_FILE) parser.add_argument("--output-dir", type=str, default=DEFAULT_OUTPUT_DIR) parser.add_argument("--model-name", type=str, default=DEFAULT_MODEL_NAME) parser.add_argument("--epochs", type=float, default=1) parser.add_argument("--batch-size", type=int, default=2) parser.add_argument("--learning-rate", type=float, default=2e-4) parser.add_argument("--max-length", type=int, default=512) parser.add_argument("--max-train-samples", type=int, default=0) parser.add_argument("--use-4bit", action="store_true") parser.add_argument("--skip-dataset-gen", action="store_true") parser.add_argument("--skip-train", action="store_true") parser.add_argument("--upload", action="store_true") parser.add_argument("--hf-repo", type=str, default=HF_REPO_ID) args = parser.parse_args() if not (5000 <= args.dataset_size <= 10000): raise ValueError("dataset-size must be between 5000 and 10000") if not args.skip_dataset_gen: build_dataset(train_file=args.train_file, size=args.dataset_size) if not args.skip_train: run_training( model_name=args.model_name, train_file=args.train_file, output_dir=args.output_dir, epochs=args.epochs, batch_size=args.batch_size, learning_rate=args.learning_rate, max_length=args.max_length, max_train_samples=args.max_train_samples, use_4bit=args.use_4bit, ) if args.upload: upload_to_hf(repo_id=args.hf_repo, output_dir=args.output_dir)