import base64
from io import BytesIO

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration


class EndpointHandler:
    """Custom inference handler serving a merged LLaVA checkpoint.

    The handler accepts either an image + prompt (multimodal) or a prompt
    alone (text-only) and returns the assistant's generated reply.
    """

    DEFAULT_PROMPT = "Describe the image in detail."
    DEFAULT_MAX_NEW_TOKENS = 200
    # Placeholder used when the processor does not expose its own image token.
    FALLBACK_IMAGE_TOKEN = "<image>"

    def __init__(self, path=""):
        """Load the processor and the 4-bit quantized model from ``path``.

        Args:
            path: Directory containing the complete, merged model files.
        """
        print("Loading model and processor from local path...")
        self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
        self.model = LlavaForConditionalGeneration.from_pretrained(
            path,
            load_in_4bit=True,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        print("✅ Model loaded successfully.")

    def __call__(self, data: dict) -> dict:
        """Run one generation request.

        Args:
            data: Request body. The payload is read from ``data["inputs"]``
                when present, otherwise from ``data`` itself. A payload that
                is a plain string is treated as the prompt. Recognised
                payload keys:

                * ``prompt``: instruction text (defaults to
                  ``DEFAULT_PROMPT``).
                * ``image_b64``: optional base64-encoded image.
                * ``max_new_tokens``: generation budget (defaults to
                  ``DEFAULT_MAX_NEW_TOKENS``).

        Returns:
            ``{"generated_text": <reply>}`` on success, or
            ``{"error": <message>}`` when the request is malformed.
        """
        # Read with .get() so the caller's request dict is left untouched.
        payload = data.get("inputs", data)
        if isinstance(payload, str):
            payload = {"prompt": payload}
        elif not isinstance(payload, dict):
            return {"error": "'inputs' must be a JSON object or a prompt string."}

        prompt_text = payload.get("prompt")
        if prompt_text is None:
            prompt_text = self.DEFAULT_PROMPT

        try:
            max_new_tokens = int(
                payload.get("max_new_tokens", self.DEFAULT_MAX_NEW_TOKENS)
            )
        except (TypeError, ValueError):
            return {"error": "max_new_tokens must be an integer."}
        if max_new_tokens < 1:
            return {"error": "max_new_tokens must be a positive integer."}

        image = None
        image_b64 = payload.get("image_b64")
        if image_b64:
            # Request boundary: any decode failure is reported to the client
            # instead of crashing the endpoint.
            try:
                image = self._decode_image(image_b64)
            except Exception as e:
                return {"error": f"Failed to decode or open base64 image: {e}"}

        inputs = self._build_inputs(prompt_text, image)

        with torch.no_grad():
            output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)

        # generate() returns prompt tokens followed by the new tokens; decode
        # only the continuation so a literal "ASSISTANT:" inside the reply (or
        # inside the user's prompt) cannot truncate the answer.
        prompt_length = inputs["input_ids"].shape[1]
        assistant_response = self.processor.decode(
            output[0][prompt_length:], skip_special_tokens=True
        ).strip()
        return {"generated_text": assistant_response}

    @staticmethod
    def _decode_image(image_b64):
        """Decode base64 data into a fully loaded RGB PIL image."""
        image_bytes = base64.b64decode(image_b64)
        # Image.open() is lazy; convert() forces the decode now (so corrupt
        # data fails here) and normalises RGBA/palette/grayscale inputs to the
        # 3-channel format the image processor expects.
        return Image.open(BytesIO(image_bytes)).convert("RGB")

    def _build_inputs(self, prompt_text, image):
        """Build model inputs on the model's device for either request type."""
        if image is None:
            # --- Text-only ---
            print("Processing text-only request...")
            prompt = f"USER: {prompt_text} ASSISTANT:"
            # No pixel_values are needed: the model generates from text alone
            # when no image features are supplied, so only tokenize.
            inputs = self.processor.tokenizer(prompt, return_tensors="pt")
            return inputs.to(self.model.device)

        # --- Multimodal (image + text) ---
        print("Processing multimodal request...")
        image_token = (
            getattr(self.processor, "image_token", None) or self.FALLBACK_IMAGE_TOKEN
        )
        # The prompt must contain the image placeholder so the model knows
        # where to splice in the image features; add it unless the caller
        # already supplied one.
        if image_token in prompt_text:
            prompt = f"USER: {prompt_text} ASSISTANT:"
        else:
            prompt = f"USER: {image_token}\n{prompt_text} ASSISTANT:"
        inputs = self.processor(text=prompt, images=image, return_tensors="pt")
        # Move to the model device and cast floating tensors (pixel_values)
        # to the model dtype; integer tensors such as input_ids are unaffected.
        return inputs.to(self.model.device, self.model.dtype)