File size: 3,963 Bytes
9ec3d1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
Step 3: Setting up the model for fine-tuning with LoRA
"""

from pathlib import Path

import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_base_model(model_name: str = "Qwen/Qwen2.5-3B-Instruct"):
    """
    Load a causal-LM checkpoint and its tokenizer.

    Places the model on Apple's MPS device (float16) when available,
    otherwise keeps it on CPU (float32).

    Args:
        model_name: HuggingFace Hub id of the checkpoint to load.

    Returns:
        A ``(model, tokenizer)`` tuple ready for LoRA setup.
    """
    print(f"Loading model: {model_name}")
    print("(First run will download ~6GB to ~/.cache/huggingface/)")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Some checkpoints ship without a pad token; reuse EOS so padding
    # during tokenization/generation does not fail.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Apple Silicon (MPS) gets half precision to keep memory manageable;
    # CPU stays in float32 to avoid unsupported float16 ops.
    use_mps = torch.backends.mps.is_available()
    if use_mps:
        print("Using Apple MPS (Metal) backend")
    else:
        print("MPS not available, using CPU (this will be slow)")

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        dtype=torch.float16 if use_mps else torch.float32,
        trust_remote_code=True,
    )
    if use_mps:
        model = model.to("mps")

    return model, tokenizer


def apply_lora(model):
    """
    Wrap *model* with LoRA adapters for parameter-efficient fine-tuning.

    Adapts the attention and MLP projection layers with rank-16 updates,
    leaving the base weights frozen.

    Args:
        model: The base causal LM to adapt.

    Returns:
        The PEFT-wrapped model (only adapter weights are trainable).
    """
    print("\nApplying LoRA configuration...")

    # Attention + MLP projection layers of a Qwen-style transformer block.
    adapted_layers = [
        "q_proj",
        "v_proj",
        "k_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]

    config = LoraConfig(
        r=16,               # rank of the low-rank update matrices
        lora_alpha=32,      # scaling factor (effective scale = alpha / r)
        target_modules=adapted_layers,
        lora_dropout=0.05,  # dropout on LoRA activations for regularization
        bias="none",        # keep bias terms frozen
        task_type="CAUSAL_LM",
    )

    peft_model = get_peft_model(model, config)
    peft_model.print_trainable_parameters()

    return peft_model


def setup_for_training(model_name: str = "Qwen/Qwen2.5-3B-Instruct"):
    """
    Convenience wrapper: load the base model, then attach LoRA adapters.

    Args:
        model_name: HuggingFace Hub id of the checkpoint to load.

    Returns:
        A ``(peft_model, tokenizer)`` pair ready for the training loop.
    """
    base_model, tokenizer = load_base_model(model_name)
    return apply_lora(base_model), tokenizer


def test_inference(model, tokenizer, prompt: str):
    """
    Run a short greedy generation to verify the model produces output.

    Args:
        model: A (possibly PEFT-wrapped) causal LM.
        tokenizer: The tokenizer matching *model*.
        prompt: Text to condition the generation on.

    Returns:
        The newly generated text (prompt excluded), stripped of
        surrounding whitespace.
    """
    print(f"\nTest prompt: {prompt[:50]}...")

    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Strip the prompt at the *token* level rather than by character count:
    # decoding with skip_special_tokens=True can change the prompt's surface
    # form, so `response[len(prompt):]` may cut at the wrong position.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_len:]
    new_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    print(f"Model output: {new_text}")
    return new_text


# Run this script directly to test the setup
if __name__ == "__main__":
    separator = "=" * 60
    print(separator)
    print("Step 3: Model Setup Test")
    print(separator)

    # Sanity-check the hardware/software stack before loading anything.
    print("\n[Environment Check]")
    print(f"  MPS Available: {torch.backends.mps.is_available()}")
    print(f"  MPS Built: {torch.backends.mps.is_built()}")
    print(f"  PyTorch version: {torch.__version__}")

    # Download (or reuse cached) weights and attach the LoRA adapters.
    print("\n[Loading Model]")
    model, tokenizer = setup_for_training()

    print("\n[Status]")
    print("  ✓ Model loaded successfully")
    print("  ✓ LoRA adapters applied")
    print(f"  Device: {next(model.parameters()).device}")

    # Generate a handful of tokens to prove the forward pass works end-to-end.
    print("\n[Quick Inference Test]")
    test_prompt = "What is 2 + 2? Answer with just the number:"
    test_inference(model, tokenizer, test_prompt)

    print("\n" + separator)
    print("✓ Setup complete! Ready for training.")
    print(separator)

    # Point the user at the on-disk cache the download populated.
    print("\n[Cache Location]")
    print("  Model cached at: ~/.cache/huggingface/hub/")
    print("  (This is reused for future runs)")