Update handler.py
handler.py: CHANGED (+27, -28)
@@ -8,7 +8,6 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 class EndpointHandler():
     def __init__(self, path=""):
-        # load the optimized model
         self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
         self.model = BlipForConditionalGeneration.from_pretrained(
             "Salesforce/blip-image-captioning-large"
@@ -17,31 +16,31 @@
         self.model = self.model.to(device)
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Args:
-            data (dict):
-                Should contain:
-                    - 'images': List[bytes] of images.
-                    - 'texts': List[str] of associated texts. (Optional for unconditional captioning)
-        Return:
-            A dict with key "captions" and associated list of generated captions.
-        """
         images = data.get("images")
[... 18 removed lines of the previous implementation; their content was not preserved in this view ...]
+        # Check if images is None or empty and handle it appropriately
+        if not images:
+            return {"captions": []}
+
+        # Default to "a photography of" if texts not provided
+        texts = data.get("texts", ["a photography of"] * len(images))
+
+        try:
+            raw_images = [Image.open(BytesIO(_img)).convert("RGB") for _img in images]
+            processed_inputs = [
+                self.processor(img, txt, return_tensors="pt") for img, txt in zip(raw_images, texts)
+            ]
+            processed_inputs = {
+                "pixel_values": torch.cat([inp["pixel_values"] for inp in processed_inputs], dim=0).to(device),
+                "input_ids": torch.cat([inp["input_ids"] for inp in processed_inputs], dim=0).to(device),
+                "attention_mask": torch.cat([inp["attention_mask"] for inp in processed_inputs], dim=0).to(device)
+            }
+
+            with torch.no_grad():
+                out = self.model.generate(**processed_inputs)
+
+            captions = self.processor.batch_decode(out, skip_special_tokens=True)
+            return {"captions": captions}
+        except Exception as e:
+            # Handle or log the exception and optionally return an error message
+            print(f"Error during processing: {str(e)}")
+            return {"error": str(e)}
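One design note on the added batching: torch.cat along dim=0 requires every per-pair input_ids tensor to have the same sequence length, which holds for the shared default prompt but would raise a size-mismatch error for mixed-length "texts". If that case matters, the processor's standard list-plus-padding interface sidesteps it; below is a sketch under that assumption, where caption_batch is a hypothetical helper name, not part of the commit:

from io import BytesIO
from typing import List

import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

def caption_batch(processor: BlipProcessor,
                  model: BlipForConditionalGeneration,
                  images: List[bytes],
                  texts: List[str],
                  device: torch.device) -> List[str]:
    # Decode the raw bytes into RGB PIL images, as the handler does.
    raw_images = [Image.open(BytesIO(b)).convert("RGB") for b in images]
    # One processor call batches everything; padding=True pads input_ids and
    # attention_mask to the longest prompt, so unequal lengths batch safely.
    inputs = processor(images=raw_images, text=texts, padding=True,
                       return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(**inputs)
    return processor.batch_decode(out, skip_special_tokens=True)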