Florence2

@@ -3,17 +3,18 @@ from PIL import Image
 import requests
 import torch
 class EndpointHandler:
     def __init__(self, model_dir):
-        # Check if a GPU is available; use CPU if not
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        # Load the model with trust_remote_code=True
         self.model = AutoModelForCausalLM.from_pretrained(
             model_dir,
             trust_remote_code=True
         ).eval().to(device)  # Dynamically move to the correct device
         self.processor = AutoProcessor.from_pretrained(
             model_dir,
             trust_remote_code=True
@@ -21,37 +22,50 @@ class EndpointHandler:
         self.device = device
     def __call__(self, data):
-        # Extract inputs from the request data
-        task_prompt = data.get("task_prompt", "<MORE_DETAILED_CAPTION>")
-        image_url = data.get("image_url")
-        # Load and process the image
-        image = self.load_image(image_url)
-        # Prepare inputs for the model
-        inputs = self.processor(
-            text=task_prompt,
-            images=image,
-            return_tensors="pt"
-        ).to(self.device)  # Use the correct device
-        # Generate output
-        generated_ids = self.model.generate(
-            input_ids=inputs["input_ids"],
-            pixel_values=inputs["pixel_values"],
-            max_new_tokens=1024,
-            num_beams=3,
-        )
-        # Decode and post-process the output
-        generated_text = self.processor.batch_decode(
-            generated_ids,
-            skip_special_tokens=True
-        )[0]
-        return {"caption": generated_text}
     def load_image(self, image_url):
-        # Load image from the provided URL
-        image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
-        return image

 import requests
 import torch
 class EndpointHandler:
     def __init__(self, model_dir):
+        # Check if GPU is available, otherwise use CPU
         device = "cuda" if torch.cuda.is_available() else "cpu"
+        # Load the Florence model and processor
         self.model = AutoModelForCausalLM.from_pretrained(
             model_dir,
             trust_remote_code=True
         ).eval().to(device)  # Dynamically move to the correct device
         self.processor = AutoProcessor.from_pretrained(
             model_dir,
             trust_remote_code=True
         self.device = device
     def __call__(self, data):
+        """Caption the image referenced by ``data["url"]``.
+
+        Reads ``task_prompt`` (default ``"<MORE_DETAILED_CAPTION>"``) and
+        ``url`` from the request dict, fetches the image through
+        ``self.load_image`` and generates text with 3-beam search.
+
+        Returns:
+            ``{"caption": <text>}`` on success, or ``{"error": <message>}``
+            when any step raises an ``Exception`` (it is not re-raised).
+        """
+        try:
+            # Extract inputs from the request data
+            task_prompt = data.get("task_prompt", "<MORE_DETAILED_CAPTION>")
+            image_url = data.get("url")  # Match the key sent from n8n
+            # Check the type first so a non-string value (number, list, ...)
+            # yields the intended validation message rather than an
+            # AttributeError from ``startswith``; require a real HTTP(S) scheme.
+            if not isinstance(image_url, str) or not image_url.startswith(
+                ("http://", "https://")
+            ):
+                raise ValueError("Invalid or missing 'url' field. Please provide a valid image URL.")
+            # Load and process the image
+            image = self.load_image(image_url)
+            # Prepare inputs for the Florence model
+            inputs = self.processor(
+                text=task_prompt,
+                images=image,
+                return_tensors="pt"
+            ).to(self.device)
+            # Generate detailed caption using Florence
+            generated_ids = self.model.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=512,  # Adjust token limit for detailed captions
+                num_beams=3,         # Use beam search for better captions
+                early_stopping=True  # End the search once num_beams candidates are complete
+            )
+            # Decode the generated text; only the first sequence is returned
+            generated_text = self.processor.batch_decode(
+                generated_ids,
+                skip_special_tokens=True
+            )[0]
+            return {"caption": generated_text}
+        except Exception as e:
+            # Top-level request boundary: report the failure in the response
+            # body instead of propagating it to the serving framework.
+            return {"error": str(e)}
     def load_image(self, image_url):
+        """Download ``image_url`` and return it as an RGB PIL image.
+
+        Raises:
+            ValueError: if the request fails, the server answers with an
+                error status, or the payload cannot be opened as an image.
+        """
+        try:
+            # requests applies no timeout by default, so one unresponsive host
+            # could block the worker forever; (connect, read) seconds.
+            # The context manager releases the streamed connection afterwards.
+            with requests.get(image_url, stream=True, timeout=(5, 30)) as response:
+                response.raise_for_status()  # Raise an error for failed requests
+                # The raw stream is not content-decoded by default; enable it
+                # so gzip/deflate bodies do not reach PIL still compressed.
+                response.raw.decode_content = True
+                # convert() reads the full pixel data, so the returned image
+                # stays usable after the connection is closed.
+                return Image.open(response.raw).convert("RGB")
+        except Exception as e:
+            raise ValueError(f"Failed to load image from URL: {image_url}. Error: {e}") from e