v2 version of handler.py

Added the info about the inputs as additional parameters.

- handler.py +18 -15
- test_handler.py +7 -3
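In Inference Endpoints terms, the JSON body of a request is what arrives as `data` in `EndpointHandler.__call__`, so this change defines the new request shape. A minimal sketch of posting the v2 payload to a deployment (the endpoint URL and token are placeholders, not real values):

```python
import requests

# Hypothetical deployment values; substitute your own endpoint URL and token
API_URL = "https://my-blip-endpoint.endpoints.huggingface.cloud"
HEADERS = {"Authorization": "Bearer hf_xxx", "Content-Type": "application/json"}

# v2 payload shape: the actual inputs are nested under an "inputs" key
payload = {
    "inputs": {
        "image_url": "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg",
        "prompt": "a photography of",
    }
}

response = requests.post(API_URL, headers=HEADERS, json=payload)
print(response.json())  # expected shape: {"caption": "..."}
```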
handler.py
CHANGED

```diff
@@ -10,32 +10,35 @@ class EndpointHandler():
         self.processor = BlipProcessor.from_pretrained(path)
         self.model = BlipForConditionalGeneration.from_pretrained(path).to("cuda" if torch.cuda.is_available() else "cpu")
 
-    def __call__(self, data:
+    def __call__(self, data: Any) -> Dict[str, str]:
         """
+        Args:
+            data (:obj:):
+                includes the input data and the parameters for the inference.
         Return:
-            A :obj:`
+            A :obj:`dict`:. The object returned should be a dict like {"caption": "Generated caption for the image"} containing:
+                - "caption": The generated caption as a string.
         """
-        #
+        # Extract inputs and parameters
+        inputs = data.pop("inputs", data)
+        parameters = data.pop("parameters", {"mode": "image"})
+
+        # Get image URL and prompt from the inputs
+        image_url = inputs.get("image_url")
+        prompt = inputs.get("prompt", "")  # Optional prompt for conditional captioning
 
         # Load image from URL and ensure RGB format
         image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
 
-        #
+        # Process inputs with or without a prompt
         if prompt:
-            inputs = self.processor(image, prompt, return_tensors="pt").to(self.model.device)
+            processed_inputs = self.processor(image, prompt, return_tensors="pt").to(self.model.device)
         else:
-            inputs = self.processor(image, return_tensors="pt").to(self.model.device)
+            processed_inputs = self.processor(image, return_tensors="pt").to(self.model.device)
 
         # Generate caption
-        out = self.model.generate(**inputs)
+        out = self.model.generate(**processed_inputs)
         caption = self.processor.decode(out[0], skip_special_tokens=True)
 
         # Return the generated caption
-        return
+        return {"caption": caption}
```
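Two details of the new extraction step are worth calling out: `data.pop("inputs", data)` falls back to the whole payload when no "inputs" key is present, so flat v1-style payloads still resolve; and `parameters` is popped with a `{"mode": "image"}` default but is not consumed anywhere in the hunk above. A standalone sketch of just that extraction logic (the URLs are hypothetical):

```python
# Reproduces only the payload-extraction pattern from __call__; no model involved.
def extract(data: dict) -> tuple:
    inputs = data.pop("inputs", data)  # nested "inputs" wins; flat payload is the fallback
    parameters = data.pop("parameters", {"mode": "image"})
    return inputs.get("image_url"), inputs.get("prompt", ""), parameters

# v2 nested payload
print(extract({"inputs": {"image_url": "https://example.com/cat.jpg", "prompt": "a photo of"}}))
# -> ('https://example.com/cat.jpg', 'a photo of', {'mode': 'image'})

# v1-style flat payload still works via the fallback
print(extract({"image_url": "https://example.com/cat.jpg"}))
# -> ('https://example.com/cat.jpg', '', {'mode': 'image'})
```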
test_handler.py
CHANGED

```diff
@@ -6,13 +6,17 @@ my_handler = EndpointHandler()
 
 # Sample payload for conditional captioning
 conditional_payload = {
-    "
+    "inputs": {
+        "image_url": "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg",
+        "prompt": "a photography of"
+    }
 }
 
 # Sample payload for unconditional captioning
 unconditional_payload = {
-    "
+    "inputs": {
+        "image_url": "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
+    }
 }
 
 # Run the handler for both cases and print the outputs
```
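The hunk ends at that last comment, so the run-and-print statements themselves are not part of the diff. A plausible sketch of the continuation, reusing the file's own names (`my_handler` and the two payloads); the exact statements are an assumption:

```python
# Assumed continuation of test_handler.py (not shown in the diff above)
print(my_handler(conditional_payload))    # expected shape: {"caption": "a photography of ..."}
print(my_handler(unconditional_payload))  # expected shape: {"caption": "..."}
```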