Abdulmateen committed on
Commit
fb6cf01
·
verified ·
1 Parent(s): 79d0c5d

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +27 -26
handler.py CHANGED
@@ -7,55 +7,56 @@ import base64
7
 
8
class EndpointHandler:
    """Inference Endpoints handler for LLaVA-1.5 with LoRA adapters.

    Loads the base ``llava-hf/llava-1.5-7b-hf`` model in 4-bit, then applies
    LoRA adapter weights found in the endpoint repository at *path*.
    Serves both multimodal (image + text) and text-only chat requests.
    """

    def __init__(self, path=""):
        # Base checkpoint is fixed; only the adapters come from `path`.
        base_model_id = "llava-hf/llava-1.5-7b-hf"

        print("Loading processor...")
        self.processor = AutoProcessor.from_pretrained(base_model_id)

        print("Loading base model...")
        self.model = LlavaForConditionalGeneration.from_pretrained(
            base_model_id,
            load_in_4bit=True,          # bitsandbytes 4-bit quantization
            torch_dtype=torch.float16,
            device_map="auto",
        )

        print(f"Loading LoRA adapters from repository path: {path}...")
        # Assumes the repo at `path` contains the LoRA adapter files.
        self.model.load_adapter(path)
        print("✅ Model and adapters loaded successfully.")

    def __call__(self, data: dict) -> dict:
        """Handle one request.

        Expected payload keys (all optional):
          - prompt (str): user message; default "What can you do?".
          - image_b64 (str): base64-encoded image for multimodal requests.
          - max_new_tokens (int): generation budget, default 200.

        Returns ``{"generated_text": ...}`` or ``{"error": ...}``.
        """
        payload = data.pop("inputs", data)

        prompt_text = payload.pop("prompt", "What can you do?")
        image_b64 = payload.pop("image_b64", None)
        max_new_tokens = payload.pop("max_new_tokens", 200)

        image = None
        if image_b64:
            try:
                image_bytes = base64.b64decode(image_b64)
                # convert("RGB") normalizes palette/RGBA/grayscale inputs.
                image = Image.open(BytesIO(image_bytes)).convert("RGB")
            except Exception as e:
                return {"error": f"Failed to decode base64 image: {e}"}

        # LLaVA-1.5 chat template: the <image> token must precede the text
        # for multimodal requests and be absent for text-only ones.
        if image is not None:
            # --- Case 1: Multimodal (Image + Text) request ---
            prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"
            inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(self.model.device)
            output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        else:
            # --- Case 2: Text-only request ---
            prompt = f"USER: {prompt_text} ASSISTANT:"
            inputs = self.processor(text=prompt, return_tensors="pt").to(self.model.device)
            output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)

        # Decode and keep only the text after the final "ASSISTANT:" marker.
        full_response = self.processor.decode(output[0], skip_special_tokens=True)
        assistant_response = full_response.split("ASSISTANT:")[-1].strip()

        return {"generated_text": assistant_response}
 
7
 
8
class EndpointHandler:
    """Inference Endpoints handler for a fine-tuned LLaVA-1.5 checkpoint.

    Loads the processor and 4-bit quantized model directly from the
    endpoint repository at *path* (merged weights, no separate adapter
    step) and serves both multimodal (image + text) and text-only
    chat requests.
    """

    def __init__(self, path=""):
        # `path` must contain both the processor files and the model weights.
        print(f"Loading processor and model from: {path}...")
        self.processor = AutoProcessor.from_pretrained(path)
        self.model = LlavaForConditionalGeneration.from_pretrained(
            path,
            load_in_4bit=True,          # bitsandbytes 4-bit quantization
            torch_dtype=torch.float16,
            device_map="auto",
        )
        print("✅ Model loaded successfully.")

    def __call__(self, data: dict) -> dict:
        """Handle one request.

        Expected payload keys (all optional):
          - prompt (str): user message; default asks for an image description.
          - image_url (str): URL of an image to fetch (takes precedence).
          - image_b64 (str): base64-encoded image, used when no URL is given.
          - max_new_tokens (int): generation budget, default 200.

        Returns ``{"generated_text": ...}`` or ``{"error": ...}``.
        """
        payload = data.pop("inputs", data)

        prompt_text = payload.pop("prompt", "Describe the image in detail.")
        image_url = payload.pop("image_url", None)
        image_b64 = payload.pop("image_b64", None)
        max_new_tokens = payload.pop("max_new_tokens", 200)

        image = None
        # Try to load an image if provided; URL wins over base64.
        if image_url:
            try:
                # Bounded timeout so a dead URL cannot hang the endpoint.
                response = requests.get(image_url, timeout=30)
                response.raise_for_status()
                # convert("RGB") normalizes palette/RGBA/grayscale inputs,
                # which the LLaVA processor does not accept uniformly.
                image = Image.open(BytesIO(response.content)).convert("RGB")
            except Exception as e:
                return {"error": f"Failed to load image from URL: {e}"}
        elif image_b64:
            try:
                image_bytes = base64.b64decode(image_b64)
                image = Image.open(BytesIO(image_bytes)).convert("RGB")
            except Exception as e:
                return {"error": f"Failed to decode base64 image: {e}"}

        # LLaVA-1.5 chat template: the <image> token must precede the text
        # for multimodal requests and be absent for text-only ones.
        if image is not None:
            # --- Case 1: Multimodal (Image + Text) ---
            print("Processing multimodal request...")
            prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"
            inputs = self.processor(text=prompt, images=image, return_tensors="pt")
        else:
            # --- Case 2: Text-Only ---
            print("Processing text-only request...")
            prompt = f"USER: {prompt_text} ASSISTANT:"
            inputs = self.processor(text=prompt, return_tensors="pt")

        # Move inputs to the model's actual device: with device_map="auto"
        # the weights may not live on "cuda" (hard-coding "cuda" crashes on
        # CPU-only hosts and multi-device placements).
        inputs = inputs.to(self.model.device)
        output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        full_response = self.processor.decode(output[0], skip_special_tokens=True)

        # Everything after the final "ASSISTANT:" marker is the reply.
        assistant_response = full_response.split("ASSISTANT:")[-1].strip()
        return {"generated_text": assistant_response}