# NOTE(review): the lines that were here ("Spaces: Sleeping") were residue from a
# Hugging Face Spaces page scrape, not part of the module — kept only as a comment.
| """LoRA-tuned model loading and inference optimization.""" | |
| import torch | |
| import sys | |
| from pathlib import Path | |
| sys.path.append(str(Path(__file__).parent.parent)) | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from app.config import Config | |
| # Optional imports for GPU features | |
| try: | |
| from transformers import BitsAndBytesConfig | |
| QUANTIZATION_AVAILABLE = True | |
| except ImportError: | |
| QUANTIZATION_AVAILABLE = False | |
| try: | |
| from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training | |
| LORA_AVAILABLE = True | |
| except ImportError: | |
| LORA_AVAILABLE = False | |
class OptimizedModelLoader:
    """Load and optimize an LLM with 8-bit quantization, GPU offloading, and LoRA.

    All methods are stateless and therefore declared as staticmethods.
    Optional features degrade gracefully: quantization requires CUDA plus
    bitsandbytes (QUANTIZATION_AVAILABLE), LoRA requires CUDA plus peft
    (LORA_AVAILABLE); when unavailable the plain model is used unchanged.
    """

    @staticmethod
    def load_model(model_name: str = Config.MODEL_NAME, use_lora: bool = False):
        """Load a causal LM and its tokenizer with the best optimizations available.

        Args:
            model_name: Hugging Face model identifier. Defaults to the
                project-wide ``Config.MODEL_NAME`` (bound at class-definition
                time — presumably a constant; verify if Config is mutable).
            use_lora: When True, wrap the model with a fresh LoRA adapter.
                Only honored on CUDA with peft installed.

        Returns:
            Tuple ``(model, tokenizer)``.
        """
        # Pick the compute device once; it drives every optimization below.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # 8-bit quantization needs both a CUDA device and bitsandbytes.
        bnb_config = None
        if Config.ENABLE_QUANTIZATION and device == "cuda" and QUANTIZATION_AVAILABLE:
            bnb_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0,
                llm_int8_skip_modules=None,
                # Let layers that don't fit on the GPU fall back to fp32 on CPU.
                llm_int8_enable_fp32_cpu_offload=True,
            )

        # Load tokenizer; many causal-LM tokenizers ship without a pad token,
        # so reuse EOS to keep batched generation from failing.
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load the base model. device_map="auto" lets accelerate place/shard
        # layers across available GPUs; on CPU we skip it and place manually.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto" if device == "cuda" else None,
            quantization_config=bnb_config,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )

        # Without device_map the weights load on CPU; make placement explicit.
        if device == "cpu":
            model = model.to(device)

        # Optionally attach a fresh LoRA adapter (GPU + peft only).
        if use_lora and device == "cuda" and LORA_AVAILABLE:
            model = OptimizedModelLoader._apply_lora(model)

        return model, tokenizer

    @staticmethod
    def _apply_lora(model):
        """Wrap *model* with a fresh (untrained) LoRA adapter for fine-tuning.

        Returns the wrapped PEFT model, or *model* unchanged when peft is
        not installed.
        """
        if not LORA_AVAILABLE:
            return model

        # Freeze base weights and cast norms/embeddings as required before
        # training a k-bit (quantized) model.
        model = prepare_model_for_kbit_training(model)

        # Adapter hyperparameters come from the project config.
        lora_config = LoraConfig(
            r=Config.LORA_R,
            lora_alpha=Config.LORA_ALPHA,
            target_modules=Config.LORA_TARGET_MODULES,
            lora_dropout=Config.LORA_DROPOUT,
            bias="none",
            task_type="CAUSAL_LM",
        )

        model = get_peft_model(model, lora_config)
        # Log the adapter's trainable-parameter fraction for visibility.
        model.print_trainable_parameters()
        return model

    @staticmethod
    def load_lora_checkpoint(model, checkpoint_path: str):
        """Attach fine-tuned LoRA weights from *checkpoint_path* onto *model*.

        Returns *model* unchanged when peft is not installed.
        """
        if not LORA_AVAILABLE:
            return model
        return PeftModel.from_pretrained(model, checkpoint_path)