Update handler.py

handler.py (+32 −20)
```diff
@@ -6,9 +6,7 @@ from io import BytesIO
 
 class EndpointHandler:
     def __init__(self, path=""):
-        # The 'path' is
-        # No internet access is needed here.
-
+        # The 'path' is a self-contained directory with the complete, merged model.
         print("Loading model and processor from local path...")
         self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
        self.model = LlavaForConditionalGeneration.from_pretrained(
```
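For reference, a minimal sketch of how a handler like this is typically driven. It is not part of the commit, and it assumes the Hugging Face Inference Endpoints custom-handler convention, where the runtime instantiates `EndpointHandler` with the local model directory and calls it once per request; the path below is an assumption.

```python
# Hypothetical smoke test for the handler above (not part of the commit).
# Assumes the custom-handler convention: the runtime passes the local model
# directory to __init__ and a JSON-decoded dict to __call__.
from handler import EndpointHandler

handler = EndpointHandler(path="/repository")  # assumed model directory; adjust locally
result = handler({"inputs": {"prompt": "Say hello."}})  # dict in, dict out
print(result)
```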
```diff
@@ -21,25 +19,39 @@ class EndpointHandler:
         print("✅ Model loaded successfully.")
 
     def __call__(self, data: dict) -> dict:
-        # … (previous __call__ body, old lines 24-39, not captured in this view)
+        # FIX 1: Correctly handle the payload, whether it's wrapped in "inputs" or not.
+        payload = data.pop("inputs", data)
+
+        # Extract data from the payload
+        prompt_text = payload.pop("prompt", "Describe the image in detail.")
+        image_b64 = payload.pop("image_b64", None)
+        max_new_tokens = payload.pop("max_new_tokens", 200)
+        image = None
+
+        # Try to process an image only if it was provided
+        if image_b64:
+            try:
+                image_bytes = base64.b64decode(image_b64)
+                image = Image.open(BytesIO(image_bytes))
+            except Exception as e:
+                return {"error": f"Failed to decode or open base64 image: {e}"}
+
+        # FIX 2: Use separate logic for multimodal and text-only requests.
+        if image is not None:
+            # --- Case 1: Multimodal (Image + Text) ---
+            print("Processing multimodal request...")
+            prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"
+            inputs = self.processor(text=prompt, images=image, return_tensors="pt").to("cuda")
+        else:
+            # --- Case 2: Text-Only ---
+            print("Processing text-only request...")
+            prompt = f"USER: {prompt_text} ASSISTANT:"
+            inputs = self.processor(text=prompt, return_tensors="pt").to("cuda")
+
+        # Generate the output
         with torch.no_grad():
             output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
-        # … (old line 42, not captured in this view)
+
         full_response = self.processor.decode(output[0], skip_special_tokens=True)
         assistant_response = full_response.split("ASSISTANT:")[-1].strip()
```
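To make the new request handling concrete, here is a client-side sketch of the payload shapes the rewritten `__call__` accepts. Everything in it is illustrative; the prompts and values are placeholders, not from the commit.

```python
# Client-side sketch of the payloads the new __call__ accepts.
# Illustrative only; prompts and values are placeholders.
import base64
from io import BytesIO
from PIL import Image

# Build a tiny in-memory image so the example runs without local files;
# a real client would read and encode an actual image file.
buf = BytesIO()
Image.new("RGB", (8, 8), color="red").save(buf, format="JPEG")
image_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

# Shape 1: fields wrapped under "inputs", as many HF clients send them.
wrapped = {
    "inputs": {
        "prompt": "What is in this picture?",
        "image_b64": image_b64,
        "max_new_tokens": 150,
    }
}

# Shape 2: the same fields at the top level. data.pop("inputs", data)
# makes both shapes resolve to the same payload dict inside the handler.
flat = {
    "prompt": "What is in this picture?",
    "image_b64": image_b64,
    "max_new_tokens": 150,
}

# Omitting "image_b64" entirely routes the request through the
# text-only branch added by FIX 2.
text_only = {"inputs": {"prompt": "Summarize the model in one sentence."}}
```

Because `data.pop("inputs", data)` falls back to the dict itself when no `"inputs"` key is present, both shapes yield the same fields, and a request without `image_b64` is handled by the text-only branch rather than failing on a missing image.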