import torch
import os

from transformers import MllamaForConditionalGeneration, AutoProcessor
from peft import PeftModel

# Use Hugging Face model IDs
BASE_MODEL_ID = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit"
LORA_MODEL_ID = "netprtony/Llama-3.2-11B-Vision-PokemonCard-OCR-LoRA"


def load_model_and_tokenizer(use_lora=True):
    """
    Load the base model and apply LoRA adapter for inference from Hugging Face.

    Args:
        use_lora: Whether to load and apply the LoRA adapter on top of the
            base model.

    Returns:
        model: The model in eval mode, ready for inference (LoRA-wrapped when
            the adapter loaded successfully).
        processor: The multimodal processor (tokenizer + image preprocessor).

    Raises:
        Exception: Re-raises any error hit while loading the base model or
            processor. A LoRA load failure is NOT raised — it only prints a
            warning and falls back to the base model (best-effort by design).
    """
    try:
        # Check device availability
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🖥️ Using device: {device}")
        if device == "cpu":
            print("⚠️ Warning: Running on CPU. This will be very slow. GPU strongly recommended.")

        # Load base model from Hugging Face.
        # NOTE(review): BASE_MODEL_ID is a bnb-4bit checkpoint, so the repo's
        # embedded quantization config presumably governs the quantized weight
        # dtype; torch_dtype here should only affect non-quantized modules —
        # confirm against the checkpoint's config.
        print("📥 Loading base model from Hugging Face...")
        print(f"📌 Model: {BASE_MODEL_ID}")
        model = MllamaForConditionalGeneration.from_pretrained(
            BASE_MODEL_ID,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto" if device == "cuda" else "cpu",
            trust_remote_code=True,
        )

        # Load processor (tokenizer) from Hugging Face
        print("📥 Loading processor from Hugging Face...")
        processor = AutoProcessor.from_pretrained(
            BASE_MODEL_ID,
            trust_remote_code=True,
        )

        # Load LoRA adapter from Hugging Face if requested
        if use_lora:
            print("📥 Loading LoRA adapter from Hugging Face...")
            print(f"📌 LoRA Model: {LORA_MODEL_ID}")
            try:
                model = PeftModel.from_pretrained(model, LORA_MODEL_ID)
                print("✅ LoRA adapter loaded successfully!")
            except Exception as lora_error:
                # Deliberate best-effort: warn and continue with the base
                # model rather than aborting the whole load.
                print(f"⚠️ Warning: Could not load LoRA adapter: {str(lora_error)}")
                print("📌 Using base model without fine-tuning")
        else:
            print("📌 Using base model without LoRA adapter")

        # Set to eval mode
        model.eval()
        print("✅ Model loaded successfully!")
        return model, processor

    except Exception as e:
        # Top-level boundary: log with traceback, then re-raise for the caller.
        print(f"❌ Error loading model: {str(e)}")
        import traceback
        traceback.print_exc()
        raise


def prepare_inputs(image, processor, device, text=None):
    """
    Prepare inputs for the model from the image using the processor.

    Args:
        image: PIL Image.
        processor: The processor (tokenizer) returned by
            load_model_and_tokenizer.
        device: Device to move tensors to (e.g. "cuda" or "cpu").
        text: Optional prompt string passed through to the processor.
            NOTE(review): Mllama processors generally require a text prompt
            containing the `<|image|>` token for generation; the image-only
            call may raise at runtime — verify against the processor version
            in use, and prefer passing an explicit prompt.

    Returns:
        inputs: Prepared inputs for the model, moved to `device`.
    """
    if text is None:
        # Backward-compatible path: identical to the original image-only call.
        inputs = processor(images=image, return_tensors="pt")
    else:
        inputs = processor(images=image, text=text, return_tensors="pt")
    return inputs.to(device)