SebastianRuff committed on
Commit
5258acf
·
verified ·
1 Parent(s): 354cb05

Add custom handler.py for inference logic

Browse files
Files changed (1) hide show
  1. handler.py +51 -0
handler.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
5
class EndpointHandler:
    """Custom Inference Endpoints handler: caption an image fetched from a URL.

    Loads a captioning model (prompted with task tokens such as
    "<MORE_DETAILED_CAPTION>") and serves requests of the form
    ``{"image_url": ..., "task_prompt": ...}``.
    """

    def __init__(self, model_dir):
        """Load model and processor from *model_dir*.

        Selects GPU only when one is actually available, so the same handler
        also runs on CPU-only endpoints (the original hard-coded ``.cuda()``).
        """
        # NOTE(review): trust_remote_code=True executes code shipped inside the
        # model repository — acceptable only for repositories you control.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = (
            AutoModelForCausalLM.from_pretrained(
                model_dir,
                trust_remote_code=True,
            )
            .eval()
            .to(self.device)
        )
        self.processor = AutoProcessor.from_pretrained(
            model_dir,
            trust_remote_code=True,
        )

    def __call__(self, data):
        """Handle a single inference request.

        Expected keys in *data*:
          - "image_url" (required): URL of the image to caption.
          - "task_prompt" (optional): task token; defaults to
            "<MORE_DETAILED_CAPTION>".

        Returns ``{"caption": <text>}`` on success, or ``{"error": <message>}``
        when the URL is missing or the image cannot be fetched/decoded.
        """
        task_prompt = data.get("task_prompt", "<MORE_DETAILED_CAPTION>")
        image_url = data.get("image_url")

        # Fail fast with a clear message instead of crashing inside
        # requests.get(None).
        if not image_url:
            return {"error": "Missing required field 'image_url'."}

        try:
            image = self.load_image(image_url)
        except (requests.RequestException, OSError) as exc:
            # RequestException: network/HTTP failures; OSError: PIL could not
            # decode the payload (e.g. the server returned an error page).
            return {"error": f"Could not load image from URL: {exc}"}

        inputs = self.processor(
            text=task_prompt,
            images=image,
            return_tensors="pt",
        ).to(self.device)

        # Inference only — skip autograd bookkeeping.
        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=1024,
                num_beams=3,
            )

        generated_text = self.processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0]
        return {"caption": generated_text}

    def load_image(self, image_url):
        """Fetch *image_url* and return it as an RGB PIL image.

        Raises ``requests.RequestException`` on network/HTTP errors and
        ``OSError`` if the response body is not a decodable image.
        """
        # timeout: a hung remote host must not stall the endpoint worker.
        # raise_for_status: turn 404/500 into exceptions rather than letting
        # PIL try to decode an HTML error page.
        response = requests.get(image_url, stream=True, timeout=10)
        response.raise_for_status()
        return Image.open(response.raw).convert("RGB")