Abdulmateen
/

llava-merged

@@ -7,33 +7,27 @@ import base64
 class EndpointHandler:
     def __init__(self, path=""):
-        # The 'path' argument is now the path to your single, merged model repository
         print(f"Loading processor and model from: {path}...")
-        # --- SIMPLIFIED LOADING ---
-        # No more base model or PeftModel. Load everything directly.
-        self.processor = AutoProcessor.from_pretrained(path, revision="a272c74")
         self.model = LlavaForConditionalGeneration.from_pretrained(
             path,
             load_in_4bit=True,
             torch_dtype=torch.float16,
             device_map="auto"
         )
-        # --- END OF SIMPLIFICATION ---
         print("✅ Model loaded successfully.")
     def __call__(self, data: dict) -> dict:
-        # The inference logic remains the same
         payload = data.pop("inputs", data)
         prompt_text = payload.pop("prompt", "Describe the image in detail.")
         image_url = payload.pop("image_url", None)
         image_b64 = payload.pop("image_b64", None)
         max_new_tokens = payload.pop("max_new_tokens", 200)
-        # Load image from either a URL or a base64 string
         if image_url:
             try:
                 response = requests.get(image_url)
@@ -47,21 +41,26 @@ class EndpointHandler:
                 image = Image.open(BytesIO(image_bytes))
             except Exception as e:
                 return {"error": f"Failed to decode base64 image: {e}"}
-        else:
-            return {"error": "No image provided. Please use 'image_url' or 'image_b64'."}
-        # Format the prompt for LLaVA
-        prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"
-        # Process inputs
-        inputs = self.processor(text=prompt, images=image, return_tensors="pt").to("cuda")
-        # Generate a response
-        with torch.no_grad():
             output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
-        # Decode and clean up the response
-        full_response = self.processor.decode(output[0], skip_special_tokens=True)
         assistant_response = full_response.split("ASSISTANT:")[-1].strip()
-        return {"generated_text": assistant_response}

 class EndpointHandler:
     def __init__(self, path=""):
+        # This part remains the same
         print(f"Loading processor and model from: {path}...")
+        self.processor = AutoProcessor.from_pretrained(path) # Removed revision for broader compatibility
         self.model = LlavaForConditionalGeneration.from_pretrained(
             path,
             load_in_4bit=True,
             torch_dtype=torch.float16,
             device_map="auto"
         )
         print("✅ Model loaded successfully.")
     def __call__(self, data: dict) -> dict:
         payload = data.pop("inputs", data)
         prompt_text = payload.pop("prompt", "Describe the image in detail.")
         image_url = payload.pop("image_url", None)
         image_b64 = payload.pop("image_b64", None)
         max_new_tokens = payload.pop("max_new_tokens", 200)
+        image = None
+        # Try to load an image if provided
         if image_url:
             try:
                 response = requests.get(image_url)
                 image = Image.open(BytesIO(image_bytes))
             except Exception as e:
                 return {"error": f"Failed to decode base64 image: {e}"}
+        # --- NEW LOGIC: Check if an image is present ---
+        if image is not None:
+            # --- Case 1: Multimodal (Image + Text) ---
+            print("Processing multimodal request...")
+            prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"
+            inputs = self.processor(text=prompt, images=image, return_tensors="pt").to("cuda")
+            output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+            full_response = self.processor.decode(output[0], skip_special_tokens=True)
+        else:
+            # --- Case 2: Text-Only ---
+            print("Processing text-only request...")
+            prompt = f"USER: {prompt_text} ASSISTANT:"
+            # Note: We do NOT pass the 'images' argument here
+            inputs = self.processor(text=prompt, return_tensors="pt").to("cuda")
+            # Note: We do NOT pass the 'images' keyword to generate()
             output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+            full_response = self.processor.decode(output[0], skip_special_tokens=True)
+        # Clean up the response to get only the assistant's part
         assistant_response = full_response.split("ASSISTANT:")[-1].strip()
+        return {"generated_text": assistant_response}