Abdulmateen
/

llava-merged

4-bit precision

Model card Files Files and versions

Abdulmateen committed on Aug 3, 2025

Commit

5a65a07

·

verified ·

1 Parent(s): 2eccf7e

Update handler.py

Files changed (1) hide show

handler.py +11 -9

handler.py CHANGED Viewed

@@ -34,30 +34,32 @@ class EndpointHandler:
                 return {"error": f"Failed to decode or open base64 image: {e}"}
         if image is not None:
-            # --- Case 1: Multimodal (Image + Text) - This part is correct ---
             print("Processing multimodal request...")
             prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"
             inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(self.model.device)
         else:
-            # --- Case 2: Text-Only - This is the corrected logic ---
             print("Processing text-only request...")
             prompt = f"USER: {prompt_text} ASSISTANT:"
             # First, process the text to get input_ids
             inputs = self.processor(text=prompt, return_tensors="pt")
-            # Second, create a dummy image tensor with the correct shape, type, and device.
-            # This placeholder satisfies the model's input requirements.
             image_processor = self.processor.image_processor
             dummy_pixel_values = torch.zeros(
                 (
                     1,
-                    image_processor.num_channels,
-                    image_processor.size['height'],
-                    image_processor.size['width']
                 ),
-                dtype=self.model.dtype,      # Use the model's dtype (e.g., float16)
-                device=self.model.device    # Put the tensor on the same device as the model
             )
             # Add the dummy tensor to the inputs dictionary

                 return {"error": f"Failed to decode or open base64 image: {e}"}
         if image is not None:
+            # --- Case 1: Multimodal (Image + Text) ---
             print("Processing multimodal request...")
             prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"
             inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(self.model.device)
         else:
+            # --- Case 2: Text-Only - CORRECTED LOGIC ---
             print("Processing text-only request...")
             prompt = f"USER: {prompt_text} ASSISTANT:"
             # First, process the text to get input_ids
             inputs = self.processor(text=prompt, return_tensors="pt")
+            # --- THE FIX: Get image dimensions from the processor's .config ---
             image_processor = self.processor.image_processor
+            config = image_processor.config
+            # Create a dummy image tensor using the correct config values
             dummy_pixel_values = torch.zeros(
                 (
                     1,
+                    config.num_channels,
+                    config.crop_size['height'],
+                    config.crop_size['width']
                 ),
+                dtype=self.model.dtype,
+                device=self.model.device
             )
             # Add the dummy tensor to the inputs dictionary