Upload ModularPipeline

Browse files

Files changed (4) hide show

README.md +3 -0
block.py +398 -0
modular_config.json +2 -2
modular_model_index.json +33 -0

README.md CHANGED Viewed

	@@ -0,0 +1,3 @@

+---
+library_name: diffusers
+---

block.py ADDED Viewed

	@@ -0,0 +1,398 @@

+from typing import List, Union
+import numpy as np
+import torch
+from diffusers.modular_pipelines import (
+    ComponentSpec,
+    InputParam,
+    ModularPipelineBlocks,
+    OutputParam,
+    PipelineState,
+)
+from PIL import Image, ImageDraw
+from transformers import AutoProcessor, Florence2ForConditionalGeneration
+class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
+    @property
+    def expected_components(self):
+        return [
+            ComponentSpec(
+                name="image_annotator",
+                type_hint=Florence2ForConditionalGeneration,
+                repo="florence-community/Florence-2-base-ft",
+            ),
+            ComponentSpec(
+                name="image_annotator_processor",
+                type_hint=AutoProcessor,
+                repo="florence-community/Florence-2-base-ft",
+            ),
+        ]
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(
+                "image",
+                type_hint=Union[Image.Image, List[Image.Image]],
+                required=True,
+                description="Image(s) to annotate",
+            ),
+            InputParam(
+                "annotation_task",
+                type_hint=Union[str, List[str]],
+                default="<REFERRING_EXPRESSION_SEGMENTATION>",
+                description="""Annotation Task to perform on the image.
+                Supported Tasks:
+                <OD>
+                <REFERRING_EXPRESSION_SEGMENTATION>
+                <CAPTION>
+                <DETAILED_CAPTION>
+                <MORE_DETAILED_CAPTION>
+                <DENSE_REGION_CAPTION>
+                <REGION_PROPOSAL>
+                <CAPTION_TO_PHRASE_GROUNDING>
+                <OPEN_VOCABULARY_DETECTION>
+                <OCR>
+                <OCR_WITH_REGION>
+                """,
+            ),
+            InputParam(
+                "annotation_prompt",
+                type_hint=Union[str, List[str]],
+                required=True,
+                description="""Annotation Prompt to provide more context to the task.
+                Can be used to detect or segment out specific elements in the image
+                """,
+            ),
+            InputParam(
+                "annotation_output_type",
+                type_hint=str,
+                default="mask_image",
+                description="""Output type from annotation predictions. Availabe options are
+                annotation:
+                    - raw annotation predictions from the model based on task type.
+                mask_image:
+                    -black and white mask image for the given image based on the task type
+                mask_overlay:
+                    - white mask overlayed on the original image
+                bounding_box:
+                    - bounding boxes drawn on the original image
+                """,
+            ),
+            InputParam(
+                "annotation_overlay",
+                type_hint=bool,
+                required=True,
+                default=False,
+                description="",
+            ),
+            InputParam(
+                "fill",
+                type_hint=str,
+                default="white",
+                description="",
+            ),
+        ]
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "annotations",
+                type_hint=dict,
+                description="Annotations Predictions for input Image(s)",
+            ),
+            OutputParam(
+                "images",
+                type_hint=Image,
+                description="Annotated input Image(s)",
+            ),
+        ]
+    def get_annotations(self, components, images, prompts, task):
+        task_prompts = [task + prompt for prompt in prompts]
+        inputs = components.image_annotator_processor(
+            text=task_prompts, images=images, return_tensors="pt"
+        ).to(components.image_annotator.device, components.image_annotator.dtype)
+        generated_ids = components.image_annotator.generate(
+            input_ids=inputs["input_ids"],
+            pixel_values=inputs["pixel_values"],
+            max_new_tokens=1024,
+            early_stopping=False,
+            do_sample=False,
+            num_beams=3,
+        )
+        annotations = components.image_annotator_processor.batch_decode(
+            generated_ids, skip_special_tokens=False
+        )
+        outputs = []
+        for image, annotation in zip(images, annotations):
+            outputs.append(
+                components.image_annotator_processor.post_process_generation(
+                    annotation, task=task, image_size=(image.width, image.height)
+                )
+            )
+        return outputs
+    def _iter_polygon_point_sets(self, poly):
+        """
+        Yields lists of (x, y) points for all simple polygons found in `poly`.
+        Supports formats:
+          - [x1, y1, x2, y2, ...]
+          - [[x, y], [x, y], ...]
+          - [xs, ys]
+          - dict {'x': xs, 'y': ys}
+          - nested lists containing any of the above
+        """
+        if poly is None:
+            return
+        def is_num(v):
+            return isinstance(v, (int, float, np.number))
+        # dict {'x': [...], 'y': [...]}
+        if isinstance(poly, dict) and "x" in poly and "y" in poly:
+            xs, ys = poly["x"], poly["y"]
+            if (
+                isinstance(xs, (list, tuple))
+                and isinstance(ys, (list, tuple))
+                and len(xs) == len(ys)
+            ):
+                pts = list(zip(xs, ys))
+                if len(pts) >= 3:
+                    yield pts
+            return
+        if isinstance(poly, (list, tuple)):
+            # flat numeric [x1, y1, ...]
+            if all(is_num(v) for v in poly):
+                coords = list(poly)
+                if len(coords) >= 6 and len(coords) % 2 == 0:
+                    yield list(zip(coords[0::2], coords[1::2]))
+                return
+            # list of pairs [[x, y], ...]
+            if all(
+                isinstance(v, (list, tuple))
+                and len(v) == 2
+                and all(is_num(n) for n in v)
+                for v in poly
+            ):
+                if len(poly) >= 3:
+                    yield [tuple(v) for v in poly]
+                return
+            # [xs, ys]
+            if len(poly) == 2 and all(isinstance(v, (list, tuple)) for v in poly):
+                xs, ys = poly
+                try:
+                    if len(xs) == len(ys) and len(xs) >= 3:
+                        yield list(zip(xs, ys))
+                        return
+                except TypeError:
+                    pass
+            # nested: recurse into parts
+            for part in poly:
+                yield from self._iter_polygon_point_sets(part)
+        # other types are ignored
+    def prepare_mask(self, images, annotations, overlay=False, fill="white"):
+        masks = []
+        for image, annotation in zip(images, annotations):
+            mask_image = image.copy() if overlay else Image.new("L", image.size, 0)
+            draw = ImageDraw.Draw(mask_image)
+            # use a safe fill for grayscale masks
+            mask_fill = fill
+            if not overlay and isinstance(fill, str):
+                # for "L" mode, white -> 255
+                mask_fill = 255
+            for _, _annotation in annotation.items():
+                if "polygons" in _annotation:
+                    for poly in _annotation["polygons"]:
+                        for pts in self._iter_polygon_point_sets(poly):
+                            if len(pts) < 3:
+                                continue
+                            # clip to image bounds and flatten
+                            flat = []
+                            for x, y in pts:
+                                xi = int(round(max(0, min(image.width - 1, x))))
+                                yi = int(round(max(0, min(image.height - 1, y))))
+                                flat.extend([xi, yi])
+                            draw.polygon(flat, fill=mask_fill)
+                elif "bboxes" in _annotation:
+                    for bbox in _annotation["bboxes"]:
+                        flat = np.array(bbox).flatten().tolist()
+                        if len(flat) == 4:
+                            x0, y0, x1, y1 = flat
+                            draw.rectangle(
+                                (
+                                    int(round(x0)),
+                                    int(round(y0)),
+                                    int(round(x1)),
+                                    int(round(y1)),
+                                ),
+                                fill=mask_fill,
+                            )
+                elif "quad_boxes" in _annotation:
+                    for quad in _annotation["quad_boxes"]:
+                        for pts in self._iter_polygon_point_sets(quad):
+                            if len(pts) < 3:
+                                continue
+                            flat = []
+                            for x, y in pts:
+                                xi = int(round(max(0, min(image.width - 1, x))))
+                                yi = int(round(max(0, min(image.height - 1, y))))
+                                flat.extend([xi, yi])
+                            draw.polygon(flat, fill=mask_fill)
+            masks.append(mask_image)
+        return masks
+    def prepare_bounding_boxes(self, images, annotations):
+        outputs = []
+        for image, annotation in zip(images, annotations):
+            image_copy = image.copy()
+            draw = ImageDraw.Draw(image_copy)
+            for _, _annotation in annotation.items():
+                # Standard axis-aligned boxes
+                bboxes = _annotation.get("bboxes", [])
+                labels = _annotation.get("labels", [])
+                if len(labels) == 0:
+                    labels = _annotation.get("bboxes_labels", [])
+                for i, bbox in enumerate(bboxes):
+                    flat = np.array(bbox).flatten().tolist()
+                    if len(flat) != 4:
+                        continue
+                    x0, y0, x1, y1 = flat
+                    draw.rectangle(
+                        (
+                            int(round(x0)),
+                            int(round(y0)),
+                            int(round(x1)),
+                            int(round(y1)),
+                        ),
+                        outline="red",
+                        width=3,
+                    )
+                    label = labels[i] if i < len(labels) else ""
+                    if label:
+                        text_y = max(0, int(y0) - 20)
+                        draw.text((int(x0), text_y), label, fill="red")
+                # Quadrilateral boxes (draw as polygons)
+                quad_boxes = _annotation.get("quad_boxes", [])
+                qlabels = _annotation.get("labels", [])
+                for i, quad in enumerate(quad_boxes):
+                    for pts in self._iter_polygon_point_sets(quad):
+                        if len(pts) < 3:
+                            continue
+                        flat = []
+                        xs, ys = [], []
+                        for x, y in pts:
+                            xi = int(round(max(0, min(image.width - 1, x))))
+                            yi = int(round(max(0, min(image.height - 1, y))))
+                            flat.extend([xi, yi])
+                            xs.append(xi)
+                            ys.append(yi)
+                        # Outline polygon
+                        try:
+                            draw.polygon(flat, outline="red", width=3)
+                        except TypeError:
+                            # Pillow without width for polygon
+                            draw.polygon(flat, outline="red")
+                        # Optional label at centroid (inside the quad)
+                        label = qlabels[i] if i < len(qlabels) else ""
+                        if label:
+                            cx = int(round(sum(xs) / len(xs)))
+                            cy = int(round(sum(ys) / len(ys)))
+                            cx = max(0, min(image.width - 1, cx))
+                            cy = max(0, min(image.height - 1, cy))
+                            draw.text((cx, cy), label, fill="red")
+            outputs.append(image_copy)
+        return outputs
+    def prepare_inputs(self, images, prompts):
+        prompts = prompts or ""
+        if isinstance(images, Image.Image):
+            images = [images]
+        if isinstance(prompts, str):
+            prompts = [prompts]
+        if len(images) != len(prompts):
+            raise ValueError("Number of images and annotation prompts must match.")
+        return images, prompts
+    @torch.no_grad()
+    def __call__(self, components, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        skip_image = False
+        # these don't require a prompt and fail if one is given
+        if (
+            block_state.annotation_task == "<OD>"
+            or block_state.annotation_task == "<DENSE_REGION_CAPTION>"
+            or block_state.annotation_task == "<REGION_PROPOSAL>"
+            or block_state.annotation_task == "<OCR_WITH_REGION>"
+        ):
+            block_state.annotation_prompt = ""
+            block_state.annotation_output_type = "bounding_box"
+        # these don't require a prompt and doesn't ouput an image
+        elif (
+            block_state.annotation_task == "<CAPTION>"
+            or block_state.annotation_task == "<DETAILED_CAPTION>"
+            or block_state.annotation_task == "<MORE_DETAILED_CAPTION>"
+            or block_state.annotation_task == "<OCR>"
+        ):
+            block_state.annotation_prompt = ""
+            skip_image = True
+        images, annotation_task_prompt = self.prepare_inputs(
+            block_state.image, block_state.annotation_prompt
+        )
+        task = block_state.annotation_task
+        fill = block_state.fill
+        annotations = self.get_annotations(
+            components, images, annotation_task_prompt, task
+        )
+        block_state.annotations = annotations
+        block_state.images = None
+        if not skip_image:
+            if block_state.annotation_output_type == "mask_image":
+                block_state.images = self.prepare_mask(images, annotations)
+            if block_state.annotation_output_type == "mask_overlay":
+                block_state.images = self.prepare_mask(
+                    images, annotations, overlay=True, fill=fill
+                )
+            elif block_state.annotation_output_type == "bounding_box":
+                block_state.images = self.prepare_bounding_boxes(images, annotations)
+        self.set_block_state(state, block_state)
+        return components, state

modular_config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "_class_name": "Florence2ImageAnnotatorBlock",
-  "_diffusers_version": "0.37.0.dev0",
   "auto_map": {
     "ModularPipelineBlocks": "block.Florence2ImageAnnotatorBlock"
   }
-}

 {
   "_class_name": "Florence2ImageAnnotatorBlock",
+  "_diffusers_version": "0.35.1",
   "auto_map": {
     "ModularPipelineBlocks": "block.Florence2ImageAnnotatorBlock"
   }
+}

modular_model_index.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_blocks_class_name": "Florence2ImageAnnotatorBlock",
+  "_class_name": "ModularPipeline",
+  "_diffusers_version": "0.37.0.dev0",
+  "image_annotator": [
+    null,
+    null,
+    {
+      "pretrained_model_name_or_path": "florence-community/Florence-2-base-ft",
+      "revision": null,
+      "subfolder": "",
+      "type_hint": [
+        "transformers",
+        "Florence2ForConditionalGeneration"
+      ],
+      "variant": null
+    }
+  ],
+  "image_annotator_processor": [
+    null,
+    null,
+    {
+      "pretrained_model_name_or_path": "florence-community/Florence-2-base-ft",
+      "revision": null,
+      "subfolder": "",
+      "type_hint": [
+        "transformers",
+        "AutoProcessor"
+      ],
+      "variant": null
+    }
+  ]
+}