""" src/model.py — Base model + LoRA setup for Phi-3-mini fine-tuning. Loads microsoft/Phi-3-mini-4k-instruct in 4-bit quantization (BitsAndBytes), applies a LoRA adapter via PEFT, and returns the ready-to-train model + tokenizer. Call load_model_and_tokenizer() from train.py — do not import data.py from here. """ from __future__ import annotations import torch from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig from transformers import PreTrainedTokenizerBase # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- MODEL_ID = "microsoft/Phi-3-mini-4k-instruct" HUB_REPO = "rotemso23/dialogsum-phi3-lora" LORA_R = 16 LORA_ALPHA = 32 LORA_DROPOUT = 0.05 LORA_TARGET_MODULES = ["qkv_proj", "o_proj"] # --------------------------------------------------------------------------- # Model + tokenizer loading # --------------------------------------------------------------------------- def load_model_and_tokenizer( model_id: str = MODEL_ID, load_in_4bit: bool = True, ) -> tuple[object, PreTrainedTokenizerBase]: """ Load Phi-3-mini with 4-bit quantization and apply a LoRA adapter. Steps: 1. Load tokenizer with right-padding (required for causal LM training). 2. Build BitsAndBytesConfig for 4-bit NF4 quantization with fp16 compute. 3. Load the base model with device_map='auto' so it lands on GPU when available. 4. Call prepare_model_for_kbit_training() to enable gradient checkpointing and cast layer norms to fp32 — required before applying LoRA to a quantized model. 5. Apply LoraConfig targeting qkv_proj and o_proj attention projections. 6. Return (peft_model, tokenizer). The returned model has ~1-2% trainable parameters (the LoRA adapter weights). All base model weights are frozen and kept in 4-bit. Args: model_id: HuggingFace model identifier. Defaults to Phi-3-mini-4k-instruct. load_in_4bit: Whether to use 4-bit quantization. Set False for CPU testing (model will be large and slow, but functional for import checks). Returns: Tuple of (peft_model, tokenizer). peft_model: PeftModel wrapping the quantized base — ready for Trainer. tokenizer: AutoTokenizer with padding_side='right' and pad_token set. """ # Step 1: tokenizer tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) tokenizer.padding_side = "right" # Phi-3 tokenizer already has pad_token (<|endoftext|> / id=32000). # Guard in case a variant doesn't: if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token # Step 2: 4-bit quantization config if load_in_4bit: bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", # NF4 is optimal for LLM weights bnb_4bit_compute_dtype=torch.float16, # fp16 compute for speed bnb_4bit_use_double_quant=True, # nested quantization saves ~0.4 bits/param ) else: bnb_config = None # Step 3: base model # trust_remote_code=False: transformers 5.x natively supports Phi-3 — using the # built-in implementation avoids the RoPE scaling KeyError in the custom modeling_phi3.py. model = AutoModelForCausalLM.from_pretrained( model_id, quantization_config=bnb_config, device_map="auto", trust_remote_code=False, ) # Step 4: prepare for k-bit training # Enables gradient checkpointing, casts layer norms to fp32, disables cache. # Must be called BEFORE get_peft_model(). if load_in_4bit: model = prepare_model_for_kbit_training(model) # Step 5: LoRA adapter lora_config = LoraConfig( r=LORA_R, lora_alpha=LORA_ALPHA, lora_dropout=LORA_DROPOUT, target_modules=LORA_TARGET_MODULES, task_type=TaskType.CAUSAL_LM, bias="none", ) model = get_peft_model(model, lora_config) return model, tokenizer def print_trainable_parameters(model: object) -> None: """ Print the number of trainable vs. total parameters and the trainable %. Expected output for Phi-3-mini with r=16, target=[qkv_proj, o_proj]: trainable params: ~8,388,608 (8M) total params: ~3,821,079,552 (3.8B) trainable %: ~0.22% (Exact numbers depend on the model revision.) Args: model: A PeftModel or any nn.Module. """ trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) total = sum(p.numel() for p in model.parameters()) pct = 100.0 * trainable / total print(f"trainable params : {trainable:,}") print(f"total params : {total:,}") print(f"trainable %% : {pct:.4f}%%") # --------------------------------------------------------------------------- # CLI entry point # --------------------------------------------------------------------------- if __name__ == "__main__": from dotenv import load_dotenv load_dotenv() print(f"Loading model: {MODEL_ID}") print("(This downloads ~2.3 GB on first run; cached on subsequent runs)\n") model, tokenizer = load_model_and_tokenizer() print("\n--- Trainable parameter count ---") print_trainable_parameters(model) print("\n--- LoRA adapter summary ---") model.print_trainable_parameters() # PEFT's built-in version print("\n--- Tokenizer ---") print(f"vocab size : {tokenizer.vocab_size:,}") print(f"pad_token : {tokenizer.pad_token!r} (id={tokenizer.pad_token_id})") print(f"eos_token : {tokenizer.eos_token!r} (id={tokenizer.eos_token_id})") print(f"padding_side : {tokenizer.padding_side}") print("\nmodel.py OK — model and tokenizer ready for train.py")