rotemso23's picture
Fix correctness issues found in full project review
5500299
"""
src/model.py — Base model + LoRA setup for Phi-3-mini fine-tuning.
Loads microsoft/Phi-3-mini-4k-instruct in 4-bit quantization (BitsAndBytes),
applies a LoRA adapter via PEFT, and returns the ready-to-train model + tokenizer.
Call load_model_and_tokenizer() from train.py — do not import data.py from here.
"""
from __future__ import annotations
import torch
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import PreTrainedTokenizerBase
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Hugging Face model identifier; used as the default `model_id` of
# load_model_and_tokenizer().
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
# Hub repository name for the fine-tuned adapter. Not referenced by the code
# visible in this file — presumably consumed by the training/upload scripts;
# verify against the caller.
HUB_REPO = "rotemso23/dialogsum-phi3-lora"
# LoRA hyperparameters, passed unchanged into LoraConfig.
LORA_R = 16  # adapter rank (LoraConfig `r`)
LORA_ALPHA = 32  # LoraConfig `lora_alpha`
LORA_DROPOUT = 0.05  # LoraConfig `lora_dropout`
# Names of the modules that receive LoRA adapters (LoraConfig `target_modules`).
LORA_TARGET_MODULES = ["qkv_proj", "o_proj"]
# ---------------------------------------------------------------------------
# Model + tokenizer loading
# ---------------------------------------------------------------------------
def load_model_and_tokenizer(
    model_id: str = MODEL_ID,
    load_in_4bit: bool = True,
) -> tuple[object, PreTrainedTokenizerBase]:
    """
    Load the base causal LM (optionally 4-bit quantized) and attach a LoRA adapter.

    Steps:
    1. Load the tokenizer, force right-padding, and make sure a pad token exists.
    2. When load_in_4bit is True, build a BitsAndBytesConfig (NF4, fp16 compute,
       double quantization); otherwise no quantization config is used.
    3. Load the base model with device_map='auto'.
    4. When quantized, call prepare_model_for_kbit_training() before the adapter
       is attached.
    5. Wrap the model with a LoraConfig built from the module-level LORA_*
       constants.
    6. Return (peft_model, tokenizer).

    Both the tokenizer and the model are loaded with trust_remote_code=False,
    so no Python code shipped in the Hub repository is executed; the built-in
    transformers implementations are used instead.

    Only the LoRA adapter weights are trainable in the returned model; call
    model.print_trainable_parameters() for the exact share.

    Args:
        model_id: HuggingFace model identifier. Defaults to MODEL_ID.
        load_in_4bit: Whether to use 4-bit quantization. Set False for CPU
            testing (model will be large and slow, but functional for import
            checks). When False, prepare_model_for_kbit_training() is skipped.

    Returns:
        Tuple of (peft_model, tokenizer).
        peft_model: the object returned by get_peft_model() — the base model
            wrapped with the LoRA adapter.
        tokenizer: tokenizer with padding_side='right' and pad_token set.
    """
    # Step 1: tokenizer.
    # trust_remote_code=False keeps this consistent with the model load below:
    # nothing downloaded from the Hub is executed as code.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=False)
    tokenizer.padding_side = "right"
    # Phi-3 tokenizer already has pad_token (<|endoftext|> / id=32000).
    # Guard in case a variant doesn't:
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Step 2: 4-bit quantization config (None means "load unquantized").
    bnb_config = None
    if load_in_4bit:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",  # NF4 quantization data type
            bnb_4bit_compute_dtype=torch.float16,  # fp16 compute for speed
            bnb_4bit_use_double_quant=True,  # nested quantization of the quant constants
        )

    # Step 3: base model.
    # trust_remote_code=False: transformers 5.x natively supports Phi-3 — using
    # the built-in implementation avoids the RoPE scaling KeyError in the custom
    # modeling_phi3.py.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=False,
    )

    # Step 4: prepare for k-bit training (quantized path only).
    # Must be called BEFORE get_peft_model(). See the PEFT documentation for
    # the exact set of adjustments it performs (gradient checkpointing, dtype
    # casts of non-quantized layers).
    if load_in_4bit:
        model = prepare_model_for_kbit_training(model)

    # Step 5: LoRA adapter.
    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGET_MODULES,
        task_type=TaskType.CAUSAL_LM,
        bias="none",
    )
    model = get_peft_model(model, lora_config)
    return model, tokenizer
def print_trainable_parameters(model: object) -> None:
    """
    Print the number of trainable vs. total parameters and the trainable %.

    Counts come from ``param.numel()`` over ``model.parameters()``; a parameter
    is "trainable" when its ``requires_grad`` flag is set. A model with no
    parameters is reported as 0 / 0 with 0.0000% instead of raising.

    Figures quoted by the original author for Phi-3-mini with r=16,
    target=[qkv_proj, o_proj]:
        trainable params: ~8,388,608 (8M)
        total params:     ~3,821,079,552 (3.8B)
        trainable %:      ~0.22%
    (Exact numbers depend on the model revision.)
    NOTE(review): quantized weights may report a packed element count, so the
    total can differ from PEFT's built-in counter — confirm on a real run.

    Args:
        model: A PeftModel or any nn.Module (anything exposing ``parameters()``).
    """
    trainable = 0
    total = 0
    # Single pass over the parameters instead of walking the model twice.
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count
    # Guard the empty-model case, which would otherwise divide by zero.
    pct = 100.0 * trainable / total if total else 0.0
    print(f"trainable params : {trainable:,}")
    print(f"total params : {total:,}")
    # "%%" is an escape only for %-formatting; in an f-string it prints two
    # percent signs, so a single "%" is the correct spelling here.
    print(f"trainable % : {pct:.4f}%")
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """
    Smoke-test entry point: load the model and tokenizer, then print a summary.

    Loads environment variables via python-dotenv, builds the LoRA-wrapped
    model with the default arguments, and prints the parameter counts (both
    this module's counter and PEFT's own) followed by the tokenizer settings.
    """
    # Imported lazily so that importing this module does not require dotenv.
    from dotenv import load_dotenv

    load_dotenv()

    print(f"Loading model: {MODEL_ID}")
    print("(This downloads ~2.3 GB on first run; cached on subsequent runs)\n")
    model, tokenizer = load_model_and_tokenizer()

    # Parameter counts as computed by this module's helper.
    print("\n--- Trainable parameter count ---")
    print_trainable_parameters(model)

    # The same information as reported by PEFT itself.
    print("\n--- LoRA adapter summary ---")
    model.print_trainable_parameters()  # PEFT's built-in version

    # Tokenizer settings relevant to training.
    print("\n--- Tokenizer ---")
    print(f"vocab size : {tokenizer.vocab_size:,}")
    print(f"pad_token : {tokenizer.pad_token!r} (id={tokenizer.pad_token_id})")
    print(f"eos_token : {tokenizer.eos_token!r} (id={tokenizer.eos_token_id})")
    print(f"padding_side : {tokenizer.padding_side}")

    print("\nmodel.py OK — model and tokenizer ready for train.py")


if __name__ == "__main__":
    main()