# NOTE(review): the lines that were here ("Spaces: Sleeping") were residue from a
# Hugging Face Spaces page scrape, not part of the module — kept only as a comment.
| """LoRA-tuned model loading and inference optimization.""" | |
| import torch | |
| import sys | |
| from pathlib import Path | |
| sys.path.append(str(Path(__file__).parent.parent)) | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from app.config import Config | |
| # Optional imports for GPU features | |
| try: | |
| from transformers import BitsAndBytesConfig | |
| QUANTIZATION_AVAILABLE = True | |
| except ImportError: | |
| QUANTIZATION_AVAILABLE = False | |
| try: | |
| from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training | |
| LORA_AVAILABLE = True | |
| except ImportError: | |
| LORA_AVAILABLE = False | |
class OptimizedModelLoader:
    """Load and optimize an LLM with 8-bit quantization, GPU offloading, and LoRA.

    All methods are stateless and therefore declared as staticmethods.
    Optional features degrade gracefully: quantization requires CUDA plus
    bitsandbytes (QUANTIZATION_AVAILABLE), LoRA requires CUDA plus peft
    (LORA_AVAILABLE); when unavailable the plain model is used unchanged.
    """

    @staticmethod
    def load_model(model_name: str = Config.MODEL_NAME, use_lora: bool = False):
        """Load a causal LM and its tokenizer with the best optimizations available.

        Args:
            model_name: Hugging Face model identifier. Defaults to the
                project-wide ``Config.MODEL_NAME`` (bound at class-definition
                time — presumably a constant; verify if Config is mutable).
            use_lora: When True, wrap the model with a fresh LoRA adapter.
                Only honored on CUDA with peft installed.

        Returns:
            Tuple ``(model, tokenizer)``.
        """
        # Pick the compute device once; it drives every optimization below.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # 8-bit quantization needs both a CUDA device and bitsandbytes.
        bnb_config = None
        if Config.ENABLE_QUANTIZATION and device == "cuda" and QUANTIZATION_AVAILABLE:
            bnb_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0,
                llm_int8_skip_modules=None,
                # Let layers that don't fit on the GPU fall back to fp32 on CPU.
                llm_int8_enable_fp32_cpu_offload=True,
            )

        # Load tokenizer; many causal-LM tokenizers ship without a pad token,
        # so reuse EOS to keep batched generation from failing.
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load the base model. device_map="auto" lets accelerate place/shard
        # layers across available GPUs; on CPU we skip it and place manually.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto" if device == "cuda" else None,
            quantization_config=bnb_config,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )

        # Without device_map the weights load on CPU; make placement explicit.
        if device == "cpu":
            model = model.to(device)

        # Optionally attach a fresh LoRA adapter (GPU + peft only).
        if use_lora and device == "cuda" and LORA_AVAILABLE:
            model = OptimizedModelLoader._apply_lora(model)

        return model, tokenizer

    @staticmethod
    def _apply_lora(model):
        """Wrap *model* with a fresh (untrained) LoRA adapter for fine-tuning.

        Returns the wrapped PEFT model, or *model* unchanged when peft is
        not installed.
        """
        if not LORA_AVAILABLE:
            return model

        # Freeze base weights and cast norms/embeddings as required before
        # training a k-bit (quantized) model.
        model = prepare_model_for_kbit_training(model)

        # Adapter hyperparameters come from the project config.
        lora_config = LoraConfig(
            r=Config.LORA_R,
            lora_alpha=Config.LORA_ALPHA,
            target_modules=Config.LORA_TARGET_MODULES,
            lora_dropout=Config.LORA_DROPOUT,
            bias="none",
            task_type="CAUSAL_LM",
        )

        model = get_peft_model(model, lora_config)
        # Log the adapter's trainable-parameter fraction for visibility.
        model.print_trainable_parameters()
        return model

    @staticmethod
    def load_lora_checkpoint(model, checkpoint_path: str):
        """Attach fine-tuned LoRA weights from *checkpoint_path* onto *model*.

        Returns *model* unchanged when peft is not installed.
        """
        if not LORA_AVAILABLE:
            return model
        return PeftModel.from_pretrained(model, checkpoint_path)