OzzyGT HF Staff commited on
Commit
259d55d
·
1 Parent(s): 1c2f921

yiyi clone

Browse files
Files changed (5) hide show
  1. README.md +83 -0
  2. block.py +226 -0
  3. mellon_config.json +66 -0
  4. modular_config.json +7 -0
  5. modular_model_index.json +33 -0
README.md CHANGED
@@ -1,3 +1,86 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ A modular custom block that can be dynamically loaded in Mellon!
6
+
7
+ The Mellon param map is saved like this for now; we will make it much easier soon!
8
+
9
+ ```py
10
+ from diffusers.modular_pipelines.mellon_node_utils import MellonNodeConfig, MellonParam
11
+
12
+ SUPPORTED_ANNOTATION_TASKS = [
13
+ "<OD>",
14
+ "<REFERRING_EXPRESSION_SEGMENTATION>",
15
+ "<CAPTION>",
16
+ "<DETAILED_CAPTION>",
17
+ "<MORE_DETAILED_CAPTION>",
18
+ "<DENSE_REGION_CAPTION>",
19
+ "<CAPTION_TO_PHRASE_GROUNDING>",
20
+ "<OPEN_VOCABULARY_DETECTION>",
21
+ ]
22
+
23
+ SUPPORTED_ANNOTATION_OUTPUT_TYPES = [
24
+ "mask_image",
25
+ "bounding_box",
26
+ "mask_overlay",
27
+ ]
28
+
29
+ node_config = MellonNodeConfig(
30
+ inputs= [
31
+ "image",
32
+ MellonParam(name="annotation_task", label="Annotation Task", type="string", options=SUPPORTED_ANNOTATION_TASKS, value="<CAPTION_TO_PHRASE_GROUNDING>"),
33
+ MellonParam(name="annotation_prompt", label="Annotation Prompt", type="string", default="", display="textarea"),
34
+ MellonParam(
35
+ name="annotation_output_type",
36
+ label="Annotation Output Type",
37
+ type="string",
38
+ options=SUPPORTED_ANNOTATION_OUTPUT_TYPES,
39
+ value="bounding_box",
40
+ onChange={
41
+ "mask_image": ["mask_image"],
42
+ "bounding_box": [],
43
+ "mask_overlay": [],
44
+ }),
45
+ ],
46
+ model_inputs= [],
47
+ outputs= [
48
+ MellonParam(name="images", label="Images", type="image", display="output"),
49
+ MellonParam(name="annotations", label="Annotations", type="string", display="output"),
50
+ MellonParam(name="mask_image", label="Mask Image", type="image", display="output"),
51
+ ],
52
+ blocks_names= ["Florence2ImageAnnotatorBlock"],
53
+ node_type="custom",
54
+ )
55
+
56
+ node_config.save_mellon_config("YiYiXu/florence-2-block", push_to_hub=True)
57
+ ```
58
+
59
+ to run the block for bbox
60
+
61
+ ```py
62
+ import torch
63
+ from diffusers.modular_pipelines import ModularPipeline
64
+ from diffusers.utils import load_image
65
+
66
+ repo_id = "YiYiXu/florence-2-block"
67
+ # fetch the Florence2 image annotator block that will create our mask
68
+ pipe = ModularPipeline.from_pretrained(repo_id, trust_remote_code=True)
69
+ pipe.load_components(torch_dtype=torch.float16)
70
+ pipe.to("cuda")
71
+
72
+
73
+
74
+ image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true")
75
+ image = image.resize((1024, 1024))
76
+
77
+ annotation_task = '<CAPTION_TO_PHRASE_GROUNDING>'
78
+ annotation_prompt = "car"
79
+
80
+ output = pipe(
81
+ image=image,
82
+ annotation_task=annotation_task,
83
+ annotation_prompt=annotation_prompt,
84
+ annotation_output_type="bounding_box",
85
+ ).images[0].save("output.png")
86
+ ```
block.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ from diffusers.modular_pipelines import (
6
+ ComponentSpec,
7
+ InputParam,
8
+ ModularPipelineBlocks,
9
+ OutputParam,
10
+ PipelineState,
11
+ )
12
+ from PIL import Image, ImageDraw
13
+ from transformers import Florence2ForConditionalGeneration, AutoProcessor
14
+
15
+
16
+ class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
17
+ @property
18
+ def expected_components(self):
19
+ return [
20
+ ComponentSpec(
21
+ name="image_annotator",
22
+ type_hint=Florence2ForConditionalGeneration,
23
+ repo="florence-community/Florence-2-base-ft",
24
+ ),
25
+ ComponentSpec(
26
+ name="image_annotator_processor",
27
+ type_hint=AutoProcessor,
28
+ repo="florence-community/Florence-2-base-ft",
29
+ ),
30
+ ]
31
+
32
+ @property
33
+ def inputs(self) -> List[InputParam]:
34
+ return [
35
+ InputParam(
36
+ "image",
37
+ type_hint=Union[Image.Image, List[Image.Image]],
38
+ required=True,
39
+ description="Image(s) to annotate",
40
+ ),
41
+ InputParam(
42
+ "annotation_task",
43
+ type_hint=Union[str, List[str]],
44
+ default="<REFERRING_EXPRESSION_SEGMENTATION>",
45
+ description="""Annotation Task to perform on the image.
46
+ Supported Tasks:
47
+
48
+ <OD>
49
+ <REFERRING_EXPRESSION_SEGMENTATION>
50
+ <CAPTION>
51
+ <DETAILED_CAPTION>
52
+ <MORE_DETAILED_CAPTION>
53
+ <DENSE_REGION_CAPTION>
54
+ <CAPTION_TO_PHRASE_GROUNDING>
55
+ <OPEN_VOCABULARY_DETECTION>
56
+
57
+ """,
58
+ ),
59
+ InputParam(
60
+ "annotation_prompt",
61
+ type_hint=Union[str, List[str]],
62
+ required=True,
63
+ description="""Annotation Prompt to provide more context to the task.
64
+ Can be used to detect or segment out specific elements in the image
65
+ """,
66
+ ),
67
+ InputParam(
68
+ "annotation_output_type",
69
+ type_hint=str,
70
+ default="mask_image",
71
+ description="""Output type from annotation predictions. Availabe options are
72
+ annotation:
73
+ - raw annotation predictions from the model based on task type.
74
+ mask_image:
75
+ -black and white mask image for the given image based on the task type
76
+ mask_overlay:
77
+ - white mask overlayed on the original image
78
+ bounding_box:
79
+ - bounding boxes drawn on the original image
80
+ """,
81
+ ),
82
+ InputParam(
83
+ "annotation_overlay",
84
+ type_hint=bool,
85
+ required=True,
86
+ default=False,
87
+ description="",
88
+ ),
89
+ InputParam(
90
+ "fill",
91
+ type_hint=str,
92
+ default="white",
93
+ description="",
94
+ ),
95
+ ]
96
+
97
+ @property
98
+ def intermediate_outputs(self) -> List[OutputParam]:
99
+ return [
100
+ OutputParam(
101
+ "mask_image",
102
+ type_hint=Image,
103
+ description="Inpainting Mask for input Image(s)",
104
+ ),
105
+ OutputParam(
106
+ "annotations",
107
+ type_hint=dict,
108
+ description="Annotations Predictions for input Image(s)",
109
+ ),
110
+ OutputParam(
111
+ "images",
112
+ type_hint=Image,
113
+ description="Annotated input Image(s)",
114
+ ),
115
+ ]
116
+
117
+ def get_annotations(self, components, images, prompts, task):
118
+ task_prompts = [task + prompt for prompt in prompts]
119
+
120
+ inputs = components.image_annotator_processor(
121
+ text=task_prompts, images=images, return_tensors="pt"
122
+ ).to(components.image_annotator.device, components.image_annotator.dtype)
123
+
124
+ generated_ids = components.image_annotator.generate(
125
+ input_ids=inputs["input_ids"],
126
+ pixel_values=inputs["pixel_values"],
127
+ max_new_tokens=1024,
128
+ early_stopping=False,
129
+ do_sample=False,
130
+ num_beams=3,
131
+ )
132
+ annotations = components.image_annotator_processor.batch_decode(
133
+ generated_ids, skip_special_tokens=False
134
+ )
135
+
136
+ outputs = []
137
+ for image, annotation in zip(images, annotations):
138
+ outputs.append(
139
+ components.image_annotator_processor.post_process_generation(
140
+ annotation, task=task, image_size=(image.width, image.height)
141
+ )
142
+ )
143
+
144
+ return outputs
145
+
146
+ def prepare_mask(self, images, annotations, overlay=False, fill="white"):
147
+ masks = []
148
+ for image, annotation in zip(images, annotations):
149
+ mask_image = image.copy() if overlay else Image.new("L", image.size, 0)
150
+ draw = ImageDraw.Draw(mask_image)
151
+
152
+ for _, _annotation in annotation.items():
153
+ if "polygons" in _annotation:
154
+ for polygon in _annotation["polygons"]:
155
+ polygon = np.array(polygon).reshape(-1, 2)
156
+ if len(polygon) < 3:
157
+ continue
158
+ polygon = polygon.reshape(-1).tolist()
159
+ draw.polygon(polygon, fill=fill)
160
+
161
+ elif "bboxes" in _annotation:
162
+ bbox = _annotation["bboxes"]
163
+ draw.rectangle(bbox, fill="white")
164
+
165
+ masks.append(mask_image)
166
+
167
+ return masks
168
+
169
+ def prepare_bounding_boxes(self, images, annotations):
170
+ outputs = []
171
+ for image, annotation in zip(images, annotations):
172
+ image_copy = image.copy()
173
+ draw = ImageDraw.Draw(image_copy)
174
+ for _, _annotation in annotation.items():
175
+ bbox = _annotation["bboxes"][0]
176
+ label = _annotation["labels"][0]
177
+
178
+ draw.rectangle(bbox, outline="red", width=3)
179
+ draw.text((bbox[0], bbox[1] - 20), label, fill="red")
180
+
181
+ outputs.append(image_copy)
182
+
183
+ return outputs
184
+
185
+ def prepare_inputs(self, images, prompts):
186
+ prompts = prompts or ""
187
+
188
+ if isinstance(images, Image.Image):
189
+ images = [images]
190
+ if isinstance(prompts, str):
191
+ prompts = [prompts]
192
+
193
+ if len(images) != len(prompts):
194
+ raise ValueError("Number of images and annotation prompts must match.")
195
+
196
+ return images, prompts
197
+
198
+ @torch.no_grad()
199
+ def __call__(self, components, state: PipelineState) -> PipelineState:
200
+ block_state = self.get_block_state(state)
201
+ images, annotation_task_prompt = self.prepare_inputs(
202
+ block_state.image, block_state.annotation_prompt
203
+ )
204
+ task = block_state.annotation_task
205
+ fill = block_state.fill
206
+
207
+ annotations = self.get_annotations(
208
+ components, images, annotation_task_prompt, task
209
+ )
210
+ block_state.annotations = annotations
211
+ if block_state.annotation_output_type == "mask_image":
212
+ block_state.mask_image = self.prepare_mask(images, annotations)
213
+ else:
214
+ block_state.mask_image = None
215
+
216
+ if block_state.annotation_output_type == "mask_overlay":
217
+ block_state.images = self.prepare_mask(
218
+ images, annotations, overlay=True, fill=fill
219
+ )
220
+
221
+ elif block_state.annotation_output_type == "bounding_box":
222
+ block_state.images = self.prepare_bounding_boxes(images, annotations)
223
+
224
+ self.set_block_state(state, block_state)
225
+
226
+ return components, state
mellon_config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "blocks_names": [
3
+ "Florence2ImageAnnotatorBlock"
4
+ ],
5
+ "node_type": "custom",
6
+ "params": {
7
+ "annotation_output_type": {
8
+ "label": "Annotation Output Type",
9
+ "onChange": {
10
+ "bounding_box": [],
11
+ "mask_image": [
12
+ "mask_image"
13
+ ],
14
+ "mask_overlay": []
15
+ },
16
+ "options": [
17
+ "mask_image",
18
+ "bounding_box",
19
+ "mask_overlay"
20
+ ],
21
+ "type": "string",
22
+ "value": "bounding_box"
23
+ },
24
+ "annotation_prompt": {
25
+ "default": "",
26
+ "display": "textarea",
27
+ "label": "Annotation Prompt",
28
+ "type": "string"
29
+ },
30
+ "annotation_task": {
31
+ "label": "Annotation Task",
32
+ "options": [
33
+ "<OD>",
34
+ "<REFERRING_EXPRESSION_SEGMENTATION>",
35
+ "<CAPTION>",
36
+ "<DETAILED_CAPTION>",
37
+ "<MORE_DETAILED_CAPTION>",
38
+ "<DENSE_REGION_CAPTION>",
39
+ "<CAPTION_TO_PHRASE_GROUNDING>",
40
+ "<OPEN_VOCABULARY_DETECTION>"
41
+ ],
42
+ "type": "string",
43
+ "value": "<CAPTION_TO_PHRASE_GROUNDING>"
44
+ },
45
+ "annotations": {
46
+ "display": "output",
47
+ "label": "Annotations",
48
+ "type": "string"
49
+ },
50
+ "image": {
51
+ "display": "input",
52
+ "label": "Image",
53
+ "type": "image"
54
+ },
55
+ "images": {
56
+ "display": "output",
57
+ "label": "Images",
58
+ "type": "image"
59
+ },
60
+ "mask_image": {
61
+ "display": "output",
62
+ "label": "Mask Image",
63
+ "type": "image"
64
+ }
65
+ }
66
+ }
modular_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "Florence2ImageAnnotatorBlock",
3
+ "_diffusers_version": "0.35.1",
4
+ "auto_map": {
5
+ "ModularPipelineBlocks": "block.Florence2ImageAnnotatorBlock"
6
+ }
7
+ }
modular_model_index.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_blocks_class_name": "Florence2ImageAnnotatorBlock",
3
+ "_class_name": "ModularPipeline",
4
+ "_diffusers_version": "0.36.0.dev0",
5
+ "image_annotator": [
6
+ "transformers",
7
+ "Florence2ForConditionalGeneration",
8
+ {
9
+ "repo": "florence-community/Florence-2-base-ft",
10
+ "revision": null,
11
+ "subfolder": "",
12
+ "type_hint": [
13
+ "transformers",
14
+ "Florence2ForConditionalGeneration"
15
+ ],
16
+ "variant": null
17
+ }
18
+ ],
19
+ "image_annotator_processor": [
20
+ "transformers",
21
+ "Florence2Processor",
22
+ {
23
+ "repo": "florence-community/Florence-2-base-ft",
24
+ "revision": null,
25
+ "subfolder": "",
26
+ "type_hint": [
27
+ "transformers",
28
+ "AutoProcessor"
29
+ ],
30
+ "variant": null
31
+ }
32
+ ]
33
+ }