"""LoRA-tuned model loading and inference optimization."""
import torch
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent))

from transformers import AutoModelForCausalLM, AutoTokenizer
from app.config import Config

# Optional imports for GPU features
try:
    from transformers import BitsAndBytesConfig
    QUANTIZATION_AVAILABLE = True
except ImportError:
    QUANTIZATION_AVAILABLE = False

try:
    from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training
    LORA_AVAILABLE = True
except ImportError:
    LORA_AVAILABLE = False


class OptimizedModelLoader:
    """Load and optimize an LLM with quantization, GPU offloading, and LoRA.

    All methods are static; the class is only a namespace. Optional features
    (8-bit quantization, LoRA) are skipped when CUDA or the optional import
    they rely on is unavailable, and the plain model is returned instead.
    """

    @staticmethod
    def load_model(model_name: str = Config.MODEL_NAME, use_lora: bool = False):
        """Load a causal LM and its tokenizer with the available optimizations.

        Args:
            model_name: Model identifier or path passed to ``from_pretrained``.
            use_lora: Wrap the loaded model with new LoRA adapters. Honoured
                only when CUDA is available and ``peft`` could be imported;
                otherwise a ``RuntimeWarning`` is emitted and the base model
                is returned unchanged.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        import warnings

        device = "cuda" if torch.cuda.is_available() else "cpu"
        on_gpu = device == "cuda"

        # 8-bit quantization is configured only when it is enabled in Config,
        # a GPU is present, and BitsAndBytesConfig could be imported.
        bnb_config = None
        if Config.ENABLE_QUANTIZATION and on_gpu and QUANTIZATION_AVAILABLE:
            bnb_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0,
                llm_int8_skip_modules=None,
                llm_int8_enable_fp32_cpu_offload=True,
            )

        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        # Fall back to EOS as the padding token when the tokenizer has none.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # NOTE(security): trust_remote_code=True lets the model repository
        # execute its own Python code at load time. Only use with model
        # sources you trust.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto" if on_gpu else None,
            quantization_config=bnb_config,
            torch_dtype=torch.float16 if on_gpu else torch.float32,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )

        # Without a device_map (CPU path) the model is placed explicitly.
        if not on_gpu:
            model = model.to(device)

        if use_lora:
            if on_gpu and LORA_AVAILABLE:
                model = OptimizedModelLoader._apply_lora(model)
            else:
                # The caller explicitly asked for LoRA; do not drop the
                # request silently.
                reason = (
                    "peft could not be imported"
                    if not LORA_AVAILABLE
                    else "CUDA is not available"
                )
                warnings.warn(
                    f"use_lora=True ignored: {reason}; returning the base model.",
                    RuntimeWarning,
                    stacklevel=2,
                )

        return model, tokenizer

    @staticmethod
    def _apply_lora(model):
        """Wrap ``model`` with LoRA adapters configured from ``Config``.

        Args:
            model: The loaded base model.

        Returns:
            The PEFT-wrapped model, or ``model`` unchanged when ``peft``
            could not be imported.
        """
        if not LORA_AVAILABLE:
            return model

        # prepare_model_for_kbit_training is meant for quantized models. On a
        # regular half-precision model it would upcast the weights to fp32
        # and inflate memory use, so it is applied only when the model
        # reports that it was loaded in 8-bit or 4-bit.
        is_quantized = getattr(model, "is_loaded_in_8bit", False) or getattr(
            model, "is_loaded_in_4bit", False
        )
        if is_quantized:
            model = prepare_model_for_kbit_training(model)

        lora_config = LoraConfig(
            r=Config.LORA_R,
            lora_alpha=Config.LORA_ALPHA,
            target_modules=Config.LORA_TARGET_MODULES,
            lora_dropout=Config.LORA_DROPOUT,
            bias="none",
            task_type="CAUSAL_LM",
        )

        model = get_peft_model(model, lora_config)
        # Prints the trainable/total parameter counts to stdout.
        model.print_trainable_parameters()

        return model

    @staticmethod
    def load_lora_checkpoint(model, checkpoint_path: str):
        """Load fine-tuned LoRA weights onto ``model``.

        Args:
            model: The base model the adapter is attached to.
            checkpoint_path: Adapter location passed to
                ``PeftModel.from_pretrained``.

        Returns:
            The ``PeftModel`` wrapping ``model``. When ``peft`` could not be
            imported, a ``RuntimeWarning`` is emitted and ``model`` is
            returned without the adapter.
        """
        if not LORA_AVAILABLE:
            import warnings

            # Make the fallback visible: the caller would otherwise assume
            # the fine-tuned weights are active.
            warnings.warn(
                "peft could not be imported; LoRA checkpoint "
                f"{checkpoint_path!r} was not loaded.",
                RuntimeWarning,
                stacklevel=2,
            )
            return model
        return PeftModel.from_pretrained(model, checkpoint_path)