"""
src/model.py — Base model + LoRA setup for Phi-3-mini fine-tuning.
Loads microsoft/Phi-3-mini-4k-instruct in 4-bit quantization (BitsAndBytes),
applies a LoRA adapter via PEFT, and returns the ready-to-train model + tokenizer.
Call load_model_and_tokenizer() from train.py — do not import data.py from here.
"""
from __future__ import annotations
import torch
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import PreTrainedTokenizerBase
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Default Hugging Face model identifier used by load_model_and_tokenizer().
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
# Not referenced in this file — presumably the Hub repo the trained adapter
# is pushed to / pulled from by other modules; verify against callers.
HUB_REPO = "rotemso23/dialogsum-phi3-lora"
# LoRA hyper-parameters, passed straight into LoraConfig below.
LORA_R = 16  # adapter rank (LoraConfig `r`)
LORA_ALPHA = 32  # LoraConfig `lora_alpha`
LORA_DROPOUT = 0.05  # LoraConfig `lora_dropout`
# Names of the modules that receive LoRA adapters (LoraConfig `target_modules`).
LORA_TARGET_MODULES = ["qkv_proj", "o_proj"]
# ---------------------------------------------------------------------------
# Model + tokenizer loading
# ---------------------------------------------------------------------------
def load_model_and_tokenizer(
    model_id: str = MODEL_ID,
    load_in_4bit: bool = True,
) -> tuple[object, PreTrainedTokenizerBase]:
    """
    Build the LoRA-wrapped causal LM and its tokenizer for fine-tuning.

    Pipeline:
        1. Load the tokenizer and force right-side padding; fall back to the
           EOS token as pad token when the tokenizer defines none.
        2. When ``load_in_4bit`` is true, build a BitsAndBytesConfig
           (NF4 quantization, fp16 compute dtype, double quantization).
        3. Load the base model with ``device_map="auto"`` and the built-in
           (non-remote) model implementation.
        4. When quantized, pass the model through
           prepare_model_for_kbit_training() before the adapter is attached.
        5. Wrap the model with a LoRA adapter configured from the module-level
           LORA_* constants and return it together with the tokenizer.

    Args:
        model_id: Hugging Face model identifier. Defaults to MODEL_ID.
        load_in_4bit: Whether to load the base model 4-bit quantized. When
            false, no quantization config is passed and the k-bit preparation
            step is skipped.

    Returns:
        Tuple ``(peft_model, tokenizer)``: the object returned by
        get_peft_model() and the tokenizer with ``padding_side == "right"``
        and a pad token set.
    """
    # --- tokenizer -------------------------------------------------------
    # NOTE(review): the tokenizer is loaded with trust_remote_code=True while
    # the model below uses trust_remote_code=False — confirm this asymmetry
    # is intentional.
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    tok.padding_side = "right"
    if tok.pad_token is None:
        # Safety net for tokenizer variants that ship without a pad token.
        tok.pad_token = tok.eos_token

    # --- quantization config (None means "load unquantized") --------------
    quant_cfg = (
        BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
        if load_in_4bit
        else None
    )

    # --- base model ------------------------------------------------------
    # trust_remote_code=False selects the implementation bundled with
    # transformers; the original author notes this avoids a RoPE-scaling
    # KeyError raised by the repo's custom modeling_phi3.py.
    base = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quant_cfg,
        device_map="auto",
        trust_remote_code=False,
    )

    # --- k-bit training preparation --------------------------------------
    # Has to happen before get_peft_model(); only relevant for the quantized
    # path.
    if load_in_4bit:
        base = prepare_model_for_kbit_training(base)

    # --- LoRA adapter ----------------------------------------------------
    adapter_cfg = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=LORA_TARGET_MODULES,
        bias="none",
    )
    return get_peft_model(base, adapter_cfg), tok
def print_trainable_parameters(model: object) -> None:
    """
    Print the trainable and total parameter counts and the trainable share.

    Three lines are written to stdout: the number of parameters with
    ``requires_grad`` set, the number of all parameters, and the ratio of
    the two as a percentage with four decimals. A model without any
    parameters reports 0.0000% instead of raising ZeroDivisionError.

    NOTE(review): counts come from ``numel()``; for 4-bit quantized weights
    this may reflect packed storage rather than logical parameter count, so
    the figures can differ from PEFT's own ``print_trainable_parameters()``
    — confirm before quoting them.

    Args:
        model: Any object exposing ``parameters()`` that yields items with
            ``numel()`` and ``requires_grad`` (e.g. a PeftModel or nn.Module).
    """
    trainable = 0
    total = 0
    # Single pass over the parameters: every one counts towards the total,
    # only those requiring gradients count as trainable.
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count

    # Guard the division so parameter-less modules do not crash the report.
    pct = 100.0 * trainable / total if total else 0.0

    print(f"trainable params : {trainable:,}")
    print(f"total params : {total:,}")
    # A single '%' — inside an f-string '%%' is printed literally as two
    # characters, it is not an escape sequence.
    print(f"trainable % : {pct:.4f}%")
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Smoke test: load the model + tokenizer with the defaults and print a
    # short report (parameter counts, tokenizer settings).
    from dotenv import load_dotenv

    load_dotenv()

    # NOTE(review): the download size quoted in the message is not derived
    # from anything in this file — confirm it is still accurate.
    print(
        f"Loading model: {MODEL_ID}",
        "(This downloads ~2.3 GB on first run; cached on subsequent runs)\n",
        sep="\n",
    )
    model, tokenizer = load_model_and_tokenizer()

    # This module's own counter first ...
    print("\n--- Trainable parameter count ---")
    print_trainable_parameters(model)

    # ... then the counter that ships with PEFT, for comparison.
    print("\n--- LoRA adapter summary ---")
    model.print_trainable_parameters()

    print("\n--- Tokenizer ---")
    tokenizer_report = (
        f"vocab size : {tokenizer.vocab_size:,}",
        f"pad_token : {tokenizer.pad_token!r} (id={tokenizer.pad_token_id})",
        f"eos_token : {tokenizer.eos_token!r} (id={tokenizer.eos_token_id})",
        f"padding_side : {tokenizer.padding_side}",
    )
    for row in tokenizer_report:
        print(row)

    print("\nmodel.py OK — model and tokenizer ready for train.py")
|