OzzyGT HF Staff commited on
Commit
259d55d
·
1 Parent(s): 1c2f921

yiyi clone

Browse files
Files changed (5) hide show
  1. README.md +83 -0
  2. block.py +226 -0
  3. mellon_config.json +66 -0
  4. modular_config.json +7 -0
  5. modular_model_index.json +33 -0
README.md CHANGED
@@ -1,3 +1,86 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ A modular custom block that can be dynamically loaded in Mellon!
6
+
7
+ The Mellon param map is saved like this for now; we will make it much easier soon!
8
+
9
+ ```py
10
+ from diffusers.modular_pipelines.mellon_node_utils import MellonNodeConfig, MellonParam
11
+
12
+ SUPPORTED_ANNOTATION_TASKS = [
13
+ "<OD>",
14
+ "<REFERRING_EXPRESSION_SEGMENTATION>",
15
+ "<CAPTION>",
16
+ "<DETAILED_CAPTION>",
17
+ "<MORE_DETAILED_CAPTION>",
18
+ "<DENSE_REGION_CAPTION>",
19
+ "<CAPTION_TO_PHRASE_GROUNDING>",
20
+ "<OPEN_VOCABULARY_DETECTION>",
21
+ ]
22
+
23
+ SUPPORTED_ANNOTATION_OUTPUT_TYPES = [
24
+ "mask_image",
25
+ "bounding_box",
26
+ "mask_overlay",
27
+ ]
28
+
29
+ node_config = MellonNodeConfig(
30
+ inputs= [
31
+ "image",
32
+ MellonParam(name="annotation_task", label="Annotation Task", type="string", options=SUPPORTED_ANNOTATION_TASKS, value="<CAPTION_TO_PHRASE_GROUNDING>"),
33
+ MellonParam(name="annotation_prompt", label="Annotation Prompt", type="string", default="", display="textarea"),
34
+ MellonParam(
35
+ name="annotation_output_type",
36
+ label="Annotation Output Type",
37
+ type="string",
38
+ options=SUPPORTED_ANNOTATION_OUTPUT_TYPES,
39
+ value="bounding_box",
40
+ onChange={
41
+ "mask_image": ["mask_image"],
42
+ "bounding_box": [],
43
+ "mask_overlay": [],
44
+ }),
45
+ ],
46
+ model_inputs= [],
47
+ outputs= [
48
+ MellonParam(name="images", label="Images", type="image", display="output"),
49
+ MellonParam(name="annotations", label="Annotations", type="string", display="output"),
50
+ MellonParam(name="mask_image", label="Mask Image", type="image", display="output"),
51
+ ],
52
+ blocks_names= ["Florence2ImageAnnotatorBlock"],
53
+ node_type="custom",
54
+ )
55
+
56
+ node_config.save_mellon_config("YiYiXu/florence-2-block", push_to_hub=True)
57
+ ```
58
+
59
+ to run the block for bbox
60
+
61
+ ```py
62
+ import torch
63
+ from diffusers.modular_pipelines import ModularPipeline
64
+ from diffusers.utils import load_image
65
+
66
+ repo_id = "YiYiXu/florence-2-block"
67
+ # fetch the Florence2 image annotator block that will create our mask
68
+ pipe = ModularPipeline.from_pretrained(repo_id, trust_remote_code=True)
69
+ pipe.load_components(torch_dtype=torch.float16)
70
+ pipe.to("cuda")
71
+
72
+
73
+
74
+ image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true")
75
+ image = image.resize((1024, 1024))
76
+
77
+ annotation_task = '<CAPTION_TO_PHRASE_GROUNDING>'
78
+ annotation_prompt = "car"
79
+
80
+ output = pipe(
81
+ image=image,
82
+ annotation_task=annotation_task,
83
+ annotation_prompt=annotation_prompt,
84
+ annotation_output_type="bounding_box",
85
+ ).images[0].save("output.png")
86
+ ```
block.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ from diffusers.modular_pipelines import (
6
+ ComponentSpec,
7
+ InputParam,
8
+ ModularPipelineBlocks,
9
+ OutputParam,
10
+ PipelineState,
11
+ )
12
+ from PIL import Image, ImageDraw
13
+ from transformers import Florence2ForConditionalGeneration, AutoProcessor
14
+
15
+
16
+ class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
17
+ @property
18
+ def expected_components(self):
19
+ return [
20
+ ComponentSpec(
21
+ name="image_annotator",
22
+ type_hint=Florence2ForConditionalGeneration,
23
+ repo="florence-community/Florence-2-base-ft",
24
+ ),
25
+ ComponentSpec(
26
+ name="image_annotator_processor",
27
+ type_hint=AutoProcessor,
28
+ repo="florence-community/Florence-2-base-ft",
29
+ ),
30
+ ]
31
+
32
+ @property
33
+ def inputs(self) -> List[InputParam]:
34
+ return [
35
+ InputParam(
36
+ "image",
37
+ type_hint=Union[Image.Image, List[Image.Image]],
38
+ required=True,
39
+ description="Image(s) to annotate",
40
+ ),
41
+ InputParam(
42
+ "annotation_task",
43
+ type_hint=Union[str, List[str]],
44
+ default="<REFERRING_EXPRESSION_SEGMENTATION>",
45
+ description="""Annotation Task to perform on the image.
46
+ Supported Tasks:
47
+
48
+ <OD>
49
+ <REFERRING_EXPRESSION_SEGMENTATION>
50
+ <CAPTION>
51
+ <DETAILED_CAPTION>
52
+ <MORE_DETAILED_CAPTION>
53
+ <DENSE_REGION_CAPTION>
54
+ <CAPTION_TO_PHRASE_GROUNDING>
55
+ <OPEN_VOCABULARY_DETECTION>
56
+
57
+ """,
58
+ ),
59
+ InputParam(
60
+ "annotation_prompt",
61
+ type_hint=Union[str, List[str]],
62
+ required=True,
63
+ description="""Annotation Prompt to provide more context to the task.
64
+ Can be used to detect or segment out specific elements in the image
65
+ """,
66
+ ),
67
+ InputParam(
68
+ "annotation_output_type",
69
+ type_hint=str,
70
+ default="mask_image",
71
+ description="""Output type from annotation predictions. Availabe options are
72
+ annotation:
73
+ - raw annotation predictions from the model based on task type.
74
+ mask_image:
75
+ -black and white mask image for the given image based on the task type
76
+ mask_overlay:
77
+ - white mask overlayed on the original image
78
+ bounding_box:
79
+ - bounding boxes drawn on the original image
80
+ """,
81
+ ),
82
+ InputParam(
83
+ "annotation_overlay",
84
+ type_hint=bool,
85
+ required=True,
86
+ default=False,
87
+ description="",
88
+ ),
89
+ InputParam(
90
+ "fill",
91
+ type_hint=str,
92
+ default="white",
93
+ description="",
94
+ ),
95
+ ]
96
+
97
+ @property
98
+ def intermediate_outputs(self) -> List[OutputParam]:
99
+ return [
100
+ OutputParam(
101
+ "mask_image",
102
+ type_hint=Image,
103
+ description="Inpainting Mask for input Image(s)",
104
+ ),
105
+ OutputParam(
106
+ "annotations",
107
+ type_hint=dict,
108
+ description="Annotations Predictions for input Image(s)",
109
+ ),
110
+ OutputParam(
111
+ "images",
112
+ type_hint=Image,
113
+ description="Annotated input Image(s)",
114
+ ),
115
+ ]
116
+
117
+ def get_annotations(self, components, images, prompts, task):
118
+ task_prompts = [task + prompt for prompt in prompts]
119
+
120
+ inputs = components.image_annotator_processor(
121
+ text=task_prompts, images=images, return_tensors="pt"
122
+ ).to(components.image_annotator.device, components.image_annotator.dtype)
123
+
124
+ generated_ids = components.image_annotator.generate(
125
+ input_ids=inputs["input_ids"],
126
+ pixel_values=inputs["pixel_values"],
127
+ max_new_tokens=1024,
128
+ early_stopping=False,
129
+ do_sample=False,
130
+ num_beams=3,
131
+ )
132
+ annotations = components.image_annotator_processor.batch_decode(
133
+ generated_ids, skip_special_tokens=False
134
+ )
135
+
136
+ outputs = []
137
+ for image, annotation in zip(images, annotations):
138
+ outputs.append(
139
+ components.image_annotator_processor.post_process_generation(
140
+ annotation, task=task, image_size=(image.width, image.height)
141
+ )
142
+ )
143
+
144
+ return outputs
145
+
146
+ def prepare_mask(self, images, annotations, overlay=False, fill="white"):
147
+ masks = []
148
+ for image, annotation in zip(images, annotations):
149
+ mask_image = image.copy() if overlay else Image.new("L", image.size, 0)
150
+ draw = ImageDraw.Draw(mask_image)
151
+
152
+ for _, _annotation in annotation.items():
153
+ if "polygons" in _annotation:
154
+ for polygon in _annotation["polygons"]:
155
+ polygon = np.array(polygon).reshape(-1, 2)
156
+ if len(polygon) < 3:
157
+ continue
158
+ polygon = polygon.reshape(-1).tolist()
159
+ draw.polygon(polygon, fill=fill)
160
+
161
+ elif "bboxes" in _annotation:
162
+ bbox = _annotation["bboxes"]
163
+ draw.rectangle(bbox, fill="white")
164
+
165
+ masks.append(mask_image)
166
+
167
+ return masks
168
+
169
+ def prepare_bounding_boxes(self, images, annotations):
170
+ outputs = []
171
+ for image, annotation in zip(images, annotations):
172
+ image_copy = image.copy()
173
+ draw = ImageDraw.Draw(image_copy)
174
+ for _, _annotation in annotation.items():
175
+ bbox = _annotation["bboxes"][0]
176
+ label = _annotation["labels"][0]
177
+
178
+ draw.rectangle(bbox, outline="red", width=3)
179
+ draw.text((bbox[0], bbox[1] - 20), label, fill="red")
180
+
181
+ outputs.append(image_copy)
182
+
183
+ return outputs
184
+
185
+ def prepare_inputs(self, images, prompts):
186
+ prompts = prompts or ""
187
+
188
+ if isinstance(images, Image.Image):
189
+ images = [images]
190
+ if isinstance(prompts, str):
191
+ prompts = [prompts]
192
+
193
+ if len(images) != len(prompts):
194
+ raise ValueError("Number of images and annotation prompts must match.")
195
+
196
+ return images, prompts
197
+
198
+ @torch.no_grad()
199
+ def __call__(self, components, state: PipelineState) -> PipelineState:
200
+ block_state = self.get_block_state(state)
201
+ images, annotation_task_prompt = self.prepare_inputs(
202
+ block_state.image, block_state.annotation_prompt
203
+ )
204
+ task = block_state.annotation_task
205
+ fill = block_state.fill
206
+
207
+ annotations = self.get_annotations(
208
+ components, images, annotation_task_prompt, task
209
+ )
210
+ block_state.annotations = annotations
211
+ if block_state.annotation_output_type == "mask_image":
212
+ block_state.mask_image = self.prepare_mask(images, annotations)
213
+ else:
214
+ block_state.mask_image = None
215
+
216
+ if block_state.annotation_output_type == "mask_overlay":
217
+ block_state.images = self.prepare_mask(
218
+ images, annotations, overlay=True, fill=fill
219
+ )
220
+
221
+ elif block_state.annotation_output_type == "bounding_box":
222
+ block_state.images = self.prepare_bounding_boxes(images, annotations)
223
+
224
+ self.set_block_state(state, block_state)
225
+
226
+ return components, state
mellon_config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "blocks_names": [
3
+ "Florence2ImageAnnotatorBlock"
4
+ ],
5
+ "node_type": "custom",
6
+ "params": {
7
+ "annotation_output_type": {
8
+ "label": "Annotation Output Type",
9
+ "onChange": {
10
+ "bounding_box": [],
11
+ "mask_image": [
12
+ "mask_image"
13
+ ],
14
+ "mask_overlay": []
15
+ },
16
+ "options": [
17
+ "mask_image",
18
+ "bounding_box",
19
+ "mask_overlay"
20
+ ],
21
+ "type": "string",
22
+ "value": "bounding_box"
23
+ },
24
+ "annotation_prompt": {
25
+ "default": "",
26
+ "display": "textarea",
27
+ "label": "Annotation Prompt",
28
+ "type": "string"
29
+ },
30
+ "annotation_task": {
31
+ "label": "Annotation Task",
32
+ "options": [
33
+ "<OD>",
34
+ "<REFERRING_EXPRESSION_SEGMENTATION>",
35
+ "<CAPTION>",
36
+ "<DETAILED_CAPTION>",
37
+ "<MORE_DETAILED_CAPTION>",
38
+ "<DENSE_REGION_CAPTION>",
39
+ "<CAPTION_TO_PHRASE_GROUNDING>",
40
+ "<OPEN_VOCABULARY_DETECTION>"
41
+ ],
42
+ "type": "string",
43
+ "value": "<CAPTION_TO_PHRASE_GROUNDING>"
44
+ },
45
+ "annotations": {
46
+ "display": "output",
47
+ "label": "Annotations",
48
+ "type": "string"
49
+ },
50
+ "image": {
51
+ "display": "input",
52
+ "label": "Image",
53
+ "type": "image"
54
+ },
55
+ "images": {
56
+ "display": "output",
57
+ "label": "Images",
58
+ "type": "image"
59
+ },
60
+ "mask_image": {
61
+ "display": "output",
62
+ "label": "Mask Image",
63
+ "type": "image"
64
+ }
65
+ }
66
+ }
modular_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "Florence2ImageAnnotatorBlock",
3
+ "_diffusers_version": "0.35.1",
4
+ "auto_map": {
5
+ "ModularPipelineBlocks": "block.Florence2ImageAnnotatorBlock"
6
+ }
7
+ }
modular_model_index.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_blocks_class_name": "Florence2ImageAnnotatorBlock",
3
+ "_class_name": "ModularPipeline",
4
+ "_diffusers_version": "0.36.0.dev0",
5
+ "image_annotator": [
6
+ "transformers",
7
+ "Florence2ForConditionalGeneration",
8
+ {
9
+ "repo": "florence-community/Florence-2-base-ft",
10
+ "revision": null,
11
+ "subfolder": "",
12
+ "type_hint": [
13
+ "transformers",
14
+ "Florence2ForConditionalGeneration"
15
+ ],
16
+ "variant": null
17
+ }
18
+ ],
19
+ "image_annotator_processor": [
20
+ "transformers",
21
+ "Florence2Processor",
22
+ {
23
+ "repo": "florence-community/Florence-2-base-ft",
24
+ "revision": null,
25
+ "subfolder": "",
26
+ "type_hint": [
27
+ "transformers",
28
+ "AutoProcessor"
29
+ ],
30
+ "variant": null
31
+ }
32
+ ]
33
+ }