Update handler.py
handler.py · +42 −42 · CHANGED

The update switches __init__ to load the LLaVA base model in 4-bit and apply the repository's LoRA adapters with PEFT, and reworks __call__ to accept a base64-encoded image alongside the text prompt (the URL-based image path is removed).
Previous handler.py (removed; lines not visible in this diff view are elided as "..."):

import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
from PIL import Image
import requests
from io import BytesIO
import base64

class EndpointHandler:
    def __init__(self, path=""):
        ...
        self.model = LlavaForConditionalGeneration.from_pretrained(
            ...
            load_in_4bit=True,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        ...

    def __call__(self, data: dict) -> dict:
        ...
        image = None
        # Try to load an image if provided
        if image_url:
            try:
                response = requests.get(image_url)
                response.raise_for_status()
                image = Image.open(BytesIO(response.content))
            except Exception as e:
                return {"error": f"Failed to load image from URL: {e}"}
        elif image_b64:
            try:
                image_bytes = base64.b64decode(image_b64)
                image = Image.open(BytesIO(image_bytes))
            except Exception as e:
                return {"error": f"Failed to decode base64 image: {e}"}

        # Check if an image is present and choose the correct logic path
        if image is not None:
            # --- Case 1: Multimodal (Image + Text) ---
            print("Processing multimodal request...")
            prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"
            inputs = self.processor(text=prompt, images=image, return_tensors="pt").to("cuda")
            output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
            full_response = self.processor.decode(output[0], skip_special_tokens=True)

        ...
            output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
            full_response = self.processor.decode(output[0], skip_special_tokens=True)

        assistant_response = full_response.split("ASSISTANT:")[-1].strip()
        return {"generated_text": assistant_response}
Updated handler.py:

import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
from peft import PeftModel
from PIL import Image
import requests
from io import BytesIO
import base64

class EndpointHandler:
    def __init__(self, path=""):
        # path is the local path to your LoRA adapter repository

        # 1. Define the base model ID
        base_model_id = "llava-hf/llava-v1.5-7b"

        # The path to your LoRA adapters is the local path provided
        lora_model_path = path

        print("Loading processor...")
        # ADDED: trust_remote_code=True is required for custom models
        self.processor = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=True)

        print("Loading base model...")
        # Load the base model in 4-bit and add trust_remote_code=True
        self.model = LlavaForConditionalGeneration.from_pretrained(
            base_model_id,
            load_in_4bit=True,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )

        print(f"Loading and merging LoRA adapters from: {lora_model_path}...")
        # Load and merge your LoRA adapters onto the base model
        self.model = PeftModel.from_pretrained(self.model, lora_model_path)
        print("✅ Model and adapters loaded successfully.")

    def __call__(self, data: dict) -> dict:
        prompt_text = data.pop("prompt", "Describe the image in detail.")
        image_b64 = data.pop("image_b64", None)
        max_new_tokens = data.pop("max_new_tokens", 200)

        if not image_b64:
            return {"error": "No image provided. Please use the 'image_b64' key."}

        try:
            image_bytes = base64.b64decode(image_b64)
            image = Image.open(BytesIO(image_bytes))
        except Exception as e:
            return {"error": f"Failed to decode or open base64 image: {e}"}

        prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"

        inputs = self.processor(text=prompt, images=image, return_tensors="pt").to("cuda")

        with torch.no_grad():
            output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)

        full_response = self.processor.decode(output[0], skip_special_tokens=True)
        assistant_response = full_response.split("ASSISTANT:")[-1].strip()

        return {"generated_text": assistant_response}
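For a quick sanity check, the updated handler can be exercised locally before deployment. The sketch below is illustrative only: the adapter directory and image file are placeholder names, and it assumes handler.py is importable from the working directory and that a CUDA GPU with enough memory for the 4-bit base model is available.

import base64

from handler import EndpointHandler

# Placeholder adapter directory, for illustration only.
handler = EndpointHandler(path="./my-lora-adapter")

# Placeholder image file; __call__ expects it base64-encoded as a UTF-8 string.
with open("example.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

# __call__ pops "prompt", "image_b64" and "max_new_tokens" from the request dict.
result = handler({
    "prompt": "Describe the image in detail.",
    "image_b64": image_b64,
    "max_new_tokens": 200,
})
print(result)

Two small caveats on the loading code above: PeftModel.from_pretrained attaches the adapter weights rather than merging them into the base model, which is fine for inference even though the log message says "merging"; and on recent transformers releases, passing load_in_4bit=True directly to from_pretrained may emit a deprecation warning suggesting quantization_config=BitsAndBytesConfig(load_in_4bit=True) instead.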