import base64
from io import BytesIO

import requests
import torch
from peft import PeftModel
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration


class EndpointHandler:
    """Inference handler serving a LLaVA base model with LoRA adapters.

    The instance is constructed once with the local path of the adapter
    repository and is then called with a request dict per inference.
    """

    def __init__(self, path=""):
        """Load the processor, the 4-bit base model and the LoRA adapters.

        Args:
            path: Local path to the LoRA adapter repository.
        """
        # NOTE(review): verify this repo id resolves on the Hub; the
        # HF-format LLaVA-1.5 checkpoint appears to be published as
        # "llava-hf/llava-1.5-7b-hf" -- TODO confirm before deploying.
        base_model_id = "llava-hf/llava-v1.5-7b"
        lora_model_path = path

        print("Loading processor...")
        # NOTE(review): trust_remote_code=True allows execution of code
        # shipped with the checkpoint; it is kept to preserve behaviour but
        # is presumably unnecessary for a natively supported architecture.
        self.processor = AutoProcessor.from_pretrained(
            base_model_id,
            trust_remote_code=True,
        )

        print("Loading base model...")
        # 4-bit weights with float16 compute; device_map="auto" lets the
        # loader decide device placement.
        self.model = LlavaForConditionalGeneration.from_pretrained(
            base_model_id,
            load_in_4bit=True,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )

        print(f"Loading and merging LoRA adapters from: {lora_model_path}...")
        # The adapters are attached as a PEFT wrapper around the base model;
        # they are NOT merged into the base weights (no merge_and_unload).
        self.model = PeftModel.from_pretrained(self.model, lora_model_path)
        print("✅ Model and adapters loaded successfully.")

    def __call__(self, data: dict) -> dict:
        """Generate a text response for one base64-encoded image.

        Args:
            data: Request payload. Recognised keys:
                "prompt" (optional, defaults to a generic description
                request), "image_b64" (required, base64-encoded image
                bytes) and "max_new_tokens" (optional, default 200).
                The dict is read but not modified.

        Returns:
            {"generated_text": <str>} on success, or {"error": <str>} when
            the image is missing/undecodable or max_new_tokens is invalid.
        """
        # Read with .get() so the caller's payload is left untouched.
        prompt_text = data.get("prompt", "Describe the image in detail.")
        image_b64 = data.get("image_b64")
        max_new_tokens = data.get("max_new_tokens", 200)

        if not image_b64:
            return {"error": "No image provided. Please use the 'image_b64' key."}

        # JSON clients may send the limit as a string or float.
        try:
            max_new_tokens = int(max_new_tokens)
        except (TypeError, ValueError):
            return {"error": "'max_new_tokens' must be an integer."}
        if max_new_tokens < 1:
            return {"error": "'max_new_tokens' must be a positive integer."}

        try:
            image_bytes = base64.b64decode(image_b64)
            # Image.open is lazy; convert() forces the actual decode inside
            # this try block and normalises RGBA/palette/greyscale inputs
            # to the 3-channel mode the image processor expects.
            image = Image.open(BytesIO(image_bytes)).convert("RGB")
        except Exception as e:
            return {"error": f"Failed to decode or open base64 image: {e}"}

        # The <image> placeholder marks where the image features are
        # inserted into the prompt; without it the image is not used.
        prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"

        inputs = self.processor(text=prompt, images=image, return_tensors="pt")
        # Follow the model's actual placement instead of assuming "cuda",
        # and cast floating-point inputs (pixel values) to the compute dtype.
        inputs = inputs.to(self.model.device, torch.float16)

        with torch.no_grad():
            output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)

        # Decode only the newly generated tokens. Splitting the full text on
        # "ASSISTANT:" breaks when the prompt or the answer contains it.
        prompt_length = inputs["input_ids"].shape[1]
        generated_ids = output[0][prompt_length:]
        assistant_response = self.processor.decode(
            generated_ids,
            skip_special_tokens=True,
        ).strip()

        return {"generated_text": assistant_response}