Elinnos
/

codellama-fine-tuning

Model card Files Files and versions

xet

Community

Prithvik-1 committed on Nov 25, 2025

Commit

8514fc9

verified ·

1 Parent(s): eada9ff

Upload scripts/training/finetune_mistral7b.py with huggingface_hub

Browse files

Files changed (1) hide show

scripts/training/finetune_mistral7b.py +378 -0

scripts/training/finetune_mistral7b.py ADDED Viewed

	@@ -0,0 +1,378 @@

+#!/usr/bin/env python3
+"""
+Fine-tuning script for Mistral models (7B, 3B, etc.) using LoRA (Low-Rank Adaptation)
+This script uses Hugging Face Transformers, PEFT, and BitsAndBytes for efficient training.
+"""
+import os
+import torch
+from datasets import load_dataset
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TrainingArguments,
+    BitsAndBytesConfig,
+    Trainer,
+    DataCollatorForLanguageModeling
+)
+from peft import (
+    LoraConfig,
+    PeftModel,
+    get_peft_model,
+    prepare_model_for_kbit_training,
+    TaskType,
+)
+import json
+def get_device_info():
+    """Detect and return available compute device"""
+    device_info = {
+        "device": "cpu",
+        "device_type": "cpu",
+        "use_quantization": False,
+        "dtype": torch.float32
+    }
+    if torch.cuda.is_available():
+        device_info["device"] = "cuda"
+        device_info["device_type"] = "cuda"
+        device_info["use_quantization"] = True
+        device_info["dtype"] = torch.float16
+        device_info["device_count"] = torch.cuda.device_count()
+        device_info["device_name"] = torch.cuda.get_device_name(0)
+        print(f"✓ CUDA GPU detected: {device_info['device_name']} (Count: {device_info['device_count']})")
+    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+        device_info["device"] = "mps"
+        device_info["device_type"] = "mps"
+        device_info["use_quantization"] = False  # BitsAndBytes doesn't support MPS
+        device_info["dtype"] = torch.float16
+        print("✓ Apple Silicon GPU (MPS) detected")
+    else:
+        print("⚠ No GPU detected, using CPU (training will be very slow)")
+        device_info["dtype"] = torch.float32
+    return device_info
+# Defaults
+DEFAULT_BASE_MODEL = "mistralai/Mistral-7B-v0.1"
+DEFAULT_OUTPUT_DIR = "./mistral-finetuned"
+DEFAULT_DATASET_PATH = "./training_data.jsonl"  # Path to your training data
+# LoRA Configuration - Updated with increased dropout for regularization
+LORA_CONFIG = LoraConfig(
+    r=16,  # Rank
+    lora_alpha=32,  # LoRA alpha scaling parameter
+    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+    lora_dropout=0.1,  # Increased from 0.05 to 0.1 for better regularization
+    bias="none",
+    task_type=TaskType.CAUSAL_LM,
+)
+# BitsAndBytes Configuration for 4-bit quantization (CUDA only)
+def get_bitsandbytes_config():
+    """Get BitsAndBytes config if CUDA is available, otherwise None"""
+    if torch.cuda.is_available():
+        return BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+        )
+    return None
+def load_and_prepare_model(model_name: str, adapter_path: str | None = None):
+    """Load the specified Mistral model, optionally warm-starting from an existing LoRA adapter."""
+    device_info = get_device_info()
+    print(f"\nLoading model: {model_name}")
+    tokenizer_source = adapter_path if adapter_path and os.path.isdir(adapter_path) else model_name
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+    # Get quantization config (CUDA only)
+    bnb_config = get_bitsandbytes_config()
+    # Prepare model loading kwargs
+    model_kwargs = {
+        "trust_remote_code": True,
+    }
+    if bnb_config is not None:
+        # Use 4-bit quantization on CUDA
+        print("Using 4-bit quantization (CUDA)")
+        model_kwargs["quantization_config"] = bnb_config
+        model_kwargs["device_map"] = "auto"
+    elif device_info["device_type"] == "mps":
+        # Use MPS with float16
+        print(f"Using MPS device with {device_info['dtype']}")
+        model_kwargs["torch_dtype"] = device_info["dtype"]
+        model_kwargs["device_map"] = "auto"
+    else:
+        # CPU fallback
+        print("Using CPU (no quantization)")
+        model_kwargs["torch_dtype"] = torch.float32
+        model_kwargs["device_map"] = "cpu"
+    # Load base model
+    base_model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
+    # Prepare model for k-bit training (only if using quantization)
+    if bnb_config is not None:
+        base_model = prepare_model_for_kbit_training(base_model)
+    if adapter_path:
+        print(f"Loading existing LoRA adapter from: {adapter_path}")
+        model = PeftModel.from_pretrained(base_model, adapter_path, is_trainable=True)
+    else:
+        model = get_peft_model(base_model, LORA_CONFIG)
+    # Enable gradient checkpointing to save memory
+    model.gradient_checkpointing_enable()
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    total_params = sum(p.numel() for p in model.parameters())
+    print(f"Model loaded successfully!")
+    print(f"  - Device: {device_info['device']}")
+    print(f"  - Trainable parameters: {trainable_params:,}")
+    print(f"  - Total parameters: {total_params:,}")
+    print(f"  - Trainable ratio: {100 * trainable_params / total_params:.2f}%\n")
+    return model, tokenizer, device_info
+def load_training_data(file_path):
+    """Load training data from JSONL file"""
+    print(f"Loading training data from {file_path}")
+    if not os.path.exists(file_path):
+        print(f"Warning: {file_path} not found. Creating a sample dataset...")
+        # Create a sample dataset for demonstration
+        sample_data = [
+            {"instruction": "What is AI?", "response": "AI (Artificial Intelligence) is the simulation of human intelligence by machines."},
+            {"instruction": "Explain machine learning", "response": "Machine learning is a subset of AI that enables systems to learn from data."},
+        ]
+        with open(file_path, 'w') as f:
+            for item in sample_data:
+                f.write(json.dumps(item) + '\n')
+        print(f"Sample dataset created at {file_path}")
+    data = []
+    with open(file_path, 'r') as f:
+        for line in f:
+            data.append(json.loads(line))
+    return data
+def clean_completion(completion):
+    """Remove format markers from completion"""
+    if not completion:
+        return completion
+    # Remove format markers if present
+    if "### Strict JSON ###" in completion:
+        completion = completion.split("### Strict JSON ###")[1]
+    if "### End ###" in completion:
+        completion = completion.split("### End ###")[0]
+    return completion.strip()
+def format_prompt(instruction, response=None):
+    """Format training examples as prompts"""
+    # Clean response to remove format markers
+    if response:
+        response = clean_completion(response)
+    prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
+    if response:
+        prompt += f"{response}"
+    return prompt
+def tokenize_function(examples, tokenizer, max_length=512):
+    """Tokenize the training examples"""
+    texts = [format_prompt(inst, resp) for inst, resp in zip(examples["instruction"], examples["response"])]
+    tokenized = tokenizer(
+        texts,
+        truncation=True,
+        padding="max_length",
+        max_length=max_length,
+        return_tensors="pt"
+    )
+    tokenized["labels"] = tokenized["input_ids"].clone()
+    return tokenized
+def main():
+    import argparse
+    parser = argparse.ArgumentParser(description="Fine-tune Mistral models with LoRA")
+    parser.add_argument("--base-model", default=DEFAULT_BASE_MODEL, help="HF model id (e.g. mistralai/Mistral-7B-v0.1 or mistralai/Mistral-3B-v0.1)")
+    parser.add_argument("--adapter-path", default=None, help="Optional path to existing LoRA adapters to continue training")
+    parser.add_argument("--output-dir", default=DEFAULT_OUTPUT_DIR, help="Where to write the fine-tuned adapters")
+    parser.add_argument("--dataset", default=DEFAULT_DATASET_PATH, help="Path to training data JSONL")
+    parser.add_argument("--max-length", type=int, default=512, help="Max sequence length for tokenization")
+    args = parser.parse_args()
+    print("Starting Mistral Fine-tuning with LoRA")
+    print("=" * 50)
+    print(f"Base model: {args.base_model}")
+    print(f"Training data: {args.dataset}")
+    print(f"Output dir: {args.output_dir}\n")
+    # Load model and tokenizer
+    model, tokenizer, device_info = load_and_prepare_model(args.base_model, args.adapter_path)
+    # Load training data
+    training_data = load_training_data(args.dataset)
+    # Convert to dataset format
+    instructions = []
+    responses = []
+    for item in training_data:
+        if "instruction" in item:
+            instructions.append(item["instruction"])
+            responses.append(item.get("response", ""))
+        elif "prompt" in item and "completion" in item:
+            instructions.append(item["prompt"])
+            completion_value = item["completion"]
+            if isinstance(completion_value, (dict, list)):
+                responses.append(json.dumps(completion_value))
+            else:
+                responses.append(str(completion_value))
+        elif "messages" in item:
+            messages = item["messages"]
+            if not isinstance(messages, list) or len(messages) == 0:
+                raise KeyError("'messages' entries must be non-empty lists.")
+            prompt_parts = []
+            assistant_reply = None
+            for idx, message in enumerate(messages):
+                role = message.get("role", "user")
+                content = str(message.get("content", "")).strip()
+                if idx == len(messages) - 1 and role == "assistant":
+                    assistant_reply = content
+                else:
+                    role_label = role.upper()
+                    prompt_parts.append(f"{role_label}: {content}")
+            if assistant_reply is None:
+                assistant_reply = str(messages[-1].get("content", "")).strip()
+            prompt_text = "\n\n".join(part for part in prompt_parts if part)
+            instructions.append(prompt_text)
+            responses.append(assistant_reply)
+        else:
+            raise KeyError("Each training example must include either 'instruction'/'response', 'prompt'/'completion', or 'messages'.")
+    # Create a simple dataset dict
+    from datasets import Dataset
+    dataset = Dataset.from_dict({
+        "instruction": instructions,
+        "response": responses
+    })
+    # Tokenize dataset
+    print("Tokenizing dataset...")
+    tokenized_dataset = dataset.map(
+        lambda x: tokenize_function(x, tokenizer, max_length=args.max_length),
+        batched=True,
+        remove_columns=dataset.column_names
+    )
+    # Split dataset into train/validation (80/20)
+    print("Splitting dataset into train/validation (80/20)...")
+    train_val_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
+    train_dataset = train_val_split["train"]
+    val_dataset = train_val_split["test"]
+    print(f"  - Training samples: {len(train_dataset)}")
+    print(f"  - Validation samples: {len(val_dataset)}")
+    # Training arguments - adjust based on device
+    use_fp16 = device_info["device_type"] in ["cuda", "mps"]
+    # Calculate total steps and appropriate warmup
+    effective_batch_size = (2 if device_info["device_type"] != "cpu" else 1) * 4  # batch_size * gradient_accumulation
+    total_steps = (len(train_dataset) // effective_batch_size) * 3  # 3 epochs
+    warmup_steps = max(10, int(0.1 * total_steps))  # 10% warmup, minimum 10 steps
+    print(f"\nTraining Configuration:")
+    print(f"  - Total training steps: {total_steps}")
+    print(f"  - Warmup steps: {warmup_steps} ({100*warmup_steps/total_steps:.1f}% of training)")
+    training_args = TrainingArguments(
+        output_dir=args.output_dir,
+        num_train_epochs=3,
+        per_device_train_batch_size=2 if device_info["device_type"] != "cpu" else 1,
+        gradient_accumulation_steps=4,
+        warmup_steps=warmup_steps,  # Dynamic warmup (10% of total steps)
+        learning_rate=5e-5,  # Reduced from 2e-4 to prevent overfitting
+        weight_decay=0.01,  # Added L2 regularization
+        fp16=use_fp16,  # Only enable on GPU (CUDA/MPS)
+        bf16=False,  # Can enable for newer CUDA GPUs if needed
+        logging_steps=10,
+        save_steps=50,  # Save more frequently
+        eval_strategy="steps",  # Enable evaluation
+        eval_steps=50,  # Evaluate every 50 steps
+        save_total_limit=3,
+        load_best_model_at_end=True,  # Load best checkpoint based on validation loss
+        metric_for_best_model="eval_loss",
+        greater_is_better=False,
+        lr_scheduler_type="cosine",  # Cosine learning rate decay
+        max_grad_norm=1.0,  # Gradient clipping
+        report_to="none",
+        push_to_hub=False,
+        dataloader_pin_memory=device_info["device_type"] == "cuda",  # Only pin memory for CUDA
+        remove_unused_columns=False,
+    )
+    print(f"Training Configuration:")
+    print(f"  - Device: {device_info['device']}")
+    print(f"  - Mixed precision (fp16): {use_fp16}")
+    print(f"  - Batch size: {training_args.per_device_train_batch_size}")
+    print(f"  - Gradient accumulation: {training_args.gradient_accumulation_steps}")
+    print(f"  - Learning rate: {training_args.learning_rate}")
+    print(f"  - Weight decay: {training_args.weight_decay}")
+    print(f"  - LR scheduler: {training_args.lr_scheduler_type}")
+    print(f"  - Max grad norm: {training_args.max_grad_norm}")
+    print("=" * 50)
+    # Data collator
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False,
+    )
+    # Add early stopping callback
+    from transformers import EarlyStoppingCallback
+    # Create trainer with validation set and early stopping
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=val_dataset,  # Add validation set
+        data_collator=data_collator,
+        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stop if no improvement for 3 evals
+    )
+    # Train
+    print("\nStarting training...")
+    trainer.train()
+    # Save model
+    print(f"\nSaving fine-tuned model to {args.output_dir}")
+    trainer.save_model(args.output_dir)
+    tokenizer.save_pretrained(args.output_dir)
+    # Save LoRA adapters separately
+    model.save_pretrained(args.output_dir)
+    print("\nFine-tuning complete!")
+    print(f"Model saved to: {args.output_dir}")
+    print(f"To load for inference, use the inference script with: {args.output_dir}")
+if __name__ == "__main__":
+    main()