Spaces:

justinj92
/

florence-2

Runtime error

App Files Files Community

justinj92 committed on Jan 14, 2025

Commit

39bd209

verified ·

1 Parent(s): 5c757cb

Upload 7 files

Browse files

Files changed (7) hide show

app.py +171 -0
requirements.txt +7 -0
utils/__init__.py +0 -0
utils/annotate.py +17 -0
utils/imports.py +13 -0
utils/models.py +73 -0
utils/tasks.py +79 -0

app.py ADDED Viewed

	@@ -0,0 +1,171 @@

+from typing import Tuple, Optional
+import gradio as gr
+import spaces
+import supervision as sv
+import torch
+from PIL import Image
+from gradio_image_prompter import ImagePrompter
+from utils.annotate import annotate_with_boxes
+from utils.models import load_models, run_inference, CHECKPOINTS, \
+    pre_process_region_task_input, post_process_region_output
+from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
+    CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
+    MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
+    IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
+    TEXTBOX_OUTPUT_TASK_NAMES, IMAGE_TO_IMAGE_TASK_NAMES, IMAGE_TO_TEXT_TASK_NAMES, \
+    IMAGE_PROMPT_TO_IMAGE_TASK_NAMES, REGION_PROPOSAL_TASK_NAME, \
+    DENSE_REGION_CAPTION_TASK_NAME
+MARKDOWN = """
+# Florence-2 🔥
+Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the
+MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities
+across tasks such as captioning, object detection, grounding, and segmentation.
+The model takes images and task prompts as input, generating the desired results in
+text format. It uses a DaViT vision encoder to convert images into visual token
+embeddings. These are then concatenated with BERT-generated text embeddings and
+processed by a transformer-based multi-modal encoder-decoder to generate the response.
+"""
+EXAMPLES = [
+    ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+    ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+    ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+    ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+    ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+    ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+]
+# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+DEVICE = "cuda"
+MODELS, PROCESSORS = load_models(DEVICE)
+@spaces.GPU
+def process(
+    checkpoint_dropdown,
+    task_dropdown,
+    image_input,
+    image_prompter_input
+) -> Tuple[Optional[Image.Image], Optional[str]]:
+    model = MODELS[checkpoint_dropdown]
+    processor = PROCESSORS[checkpoint_dropdown]
+    task = TASKS[task_dropdown]
+    if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
+        _, response = run_inference(
+            model, processor, DEVICE, image_input, task)
+        detections = sv.Detections.from_lmm(
+            lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
+        return annotate_with_boxes(image_input, detections), None
+    elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
+        _, response = run_inference(
+            model, processor, DEVICE, image_input, task)
+        return None, response[task]
+    elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
+        detections_list = []
+        print(image_prompter_input)
+        image_input = image_prompter_input["image"]
+        for prompt in image_prompter_input["points"]:
+            text = pre_process_region_task_input(
+                prompt=prompt,
+                resolution_wh=image_input.size
+            )
+            _, response = run_inference(
+                model, processor, DEVICE, image_input, task, text)
+            detections = sv.Detections.from_lmm(
+                lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
+            detections_list.append(detections)
+        detections = sv.Detections.merge(detections_list=detections_list)
+        detections = post_process_region_output(
+            detections=detections, resolution_wh=image_input.size)
+        return annotate_with_boxes(image_input, detections), None
+with gr.Blocks() as demo:
+    gr.Markdown(MARKDOWN)
+    with gr.Row():
+        checkpoint_dropdown_component = gr.Dropdown(
+            choices=CHECKPOINTS,
+            value=CHECKPOINTS[0],
+            label="Model", info="Select a Florence 2 model to use.",
+            interactive=True
+        )
+        task_dropdown_component = gr.Dropdown(
+            choices=TASK_NAMES,
+            value=TASK_NAMES[0],
+            label="Task", info="Select a task to perform with the model.",
+            interactive=True
+        )
+    with gr.Row():
+        with gr.Column():
+            image_input_component = gr.Image(
+                type='pil', label='Upload image')
+            image_prompter_input_component = ImagePrompter(
+                type='pil', label='Image prompt', visible=False)
+            submit_button_component = gr.Button(value='Submit', variant='primary')
+        with gr.Column():
+            image_output_component = gr.Image(type='pil', label='Image Output')
+            text_output_component = gr.Textbox(label='Caption Output', visible=False)
+    with gr.Row():
+        gr.Examples(
+            fn=process,
+            examples=EXAMPLES,
+            inputs=[
+                checkpoint_dropdown_component,
+                task_dropdown_component,
+                image_input_component,
+                image_prompter_input_component
+            ],
+            outputs=[
+                image_output_component,
+                text_output_component
+            ],
+            run_on_click=True
+        )
+    def on_dropdown_change(text):
+        return [
+            gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
+            ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
+            gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
+            gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
+        ]
+    task_dropdown_component.change(
+        on_dropdown_change,
+        inputs=[task_dropdown_component],
+        outputs=[
+            image_input_component,
+            image_prompter_input_component,
+            image_output_component,
+            text_output_component
+        ]
+    )
+    submit_button_component.click(
+        fn=process,
+        inputs=[
+            checkpoint_dropdown_component,
+            task_dropdown_component,
+            image_input_component,
+            image_prompter_input_component
+        ],
+        outputs=[
+            image_output_component,
+            text_output_component
+        ]
+    )
+demo.launch(debug=False, show_error=True, max_threads=1)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+einops
+spaces
+timm
+gradio
+transformers
+gradio-image-prompter
+supervision==0.22.0rc1

utils/__init__.py ADDED Viewed

File without changes

utils/annotate.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import supervision as sv
+from PIL import Image
+def annotate_with_boxes(image: Image, detections: sv.Detections) -> Image:
+    annotated_image = image.copy()
+    thickness = sv.calculate_optimal_line_thickness(resolution_wh=image.size)
+    text_scale = sv.calculate_optimal_text_scale(resolution_wh=image.size)
+    bounding_box_annotator = sv.BoundingBoxAnnotator(
+        color_lookup=sv.ColorLookup.INDEX, thickness=thickness)
+    label_annotator = sv.LabelAnnotator(
+        color_lookup=sv.ColorLookup.INDEX,
+        text_scale=text_scale,
+        text_thickness=thickness)
+    annotated_image = bounding_box_annotator.annotate(annotated_image, detections)
+    annotated_image = label_annotator.annotate(annotated_image, detections)
+    return annotated_image

utils/imports.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import os
+from typing import Union
+from transformers.dynamic_module_utils import get_imports
+def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+    """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+    if not str(filename).endswith("/modeling_florence2.py"):
+        return get_imports(filename)
+    imports = get_imports(filename)
+    imports.remove("flash_attn")
+    return imports

utils/models.py ADDED Viewed

	@@ -0,0 +1,73 @@

+from typing import Tuple, Dict, Any, List
+from unittest.mock import patch
+import numpy as np
+import supervision as sv
+import torch
+from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
+from utils.imports import fixed_get_imports
+# Florence-2 checkpoint identifiers passed to `from_pretrained` by load_models().
+# Every entry is loaded; the first one is the default choice of the model
+# dropdown in app.py (CHECKPOINTS[0]).
+CHECKPOINTS = [
+    "microsoft/Florence-2-large-ft",
+    "microsoft/Florence-2-large",
+    "microsoft/Florence-2-base-ft",
+    "microsoft/Florence-2-base",
+]
+def load_models(device: torch.device) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+        models = {}
+        processors = {}
+        for checkpoint in CHECKPOINTS:
+            models[checkpoint] = AutoModelForCausalLM.from_pretrained(
+                checkpoint, trust_remote_code=True).to(device).eval()
+            processors[checkpoint] = AutoProcessor.from_pretrained(
+                checkpoint, trust_remote_code=True)
+    return models, processors
+def run_inference(
+    model: Any,
+    processor: Any,
+    device: torch.device,
+    image: Image,
+    task: str,
+    text: str = ""
+) -> Tuple[str, Dict]:
+    prompt = task + text
+    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+    generated_ids = model.generate(
+        input_ids=inputs["input_ids"],
+        pixel_values=inputs["pixel_values"],
+        max_new_tokens=1024,
+        num_beams=3
+    )
+    generated_text = processor.batch_decode(
+        generated_ids, skip_special_tokens=False)[0]
+    response = processor.post_process_generation(
+        generated_text, task=task, image_size=image.size)
+    return generated_text, response
+def pre_process_region_task_input(
+    prompt: List[float],
+    resolution_wh: Tuple[int, int]
+) -> str:
+    x1, y1, _, x2, y2, _ = prompt
+    w, h = resolution_wh
+    box = np.array([x1, y1, x2, y2])
+    box /= np.array([w, h, w, h])
+    box *= 1000
+    return "".join([f"<loc_{int(coordinate)}>" for coordinate in box])
+def post_process_region_output(
+    detections: sv.Detections,
+    resolution_wh: Tuple[int, int]
+) -> sv.Detections:
+    w, h = resolution_wh
+    detections.xyxy = (detections.xyxy / 1000 * np.array([w, h, w, h])).astype(np.int32)
+    return detections

utils/tasks.py ADDED Viewed

	@@ -0,0 +1,79 @@

+OBJECT_DETECTION_TASK_NAME = "Object Detection"
+REGION_PROPOSAL_TASK_NAME = "Region Proposal"
+DENSE_REGION_CAPTION_TASK_NAME = "Dense Region Caption"
+CAPTION_TASK_NAME = "Caption"
+DETAILED_CAPTION_TASK_NAME = "Detailed Caption"
+MORE_DETAILED_CAPTION_TASK_NAME = "More Detailed Caption"
+OCR_TASK_NAME = "OCR"
+OCR_WITH_REGION_TASK_NAME = "OCR with Region"
+REGION_TO_CATEGORY_TASK_NAME = "Region to Category"
+REGION_TO_DESCRIPTION_TASK_NAME = "Region to Description"
+TASK_NAMES = [
+    OBJECT_DETECTION_TASK_NAME,
+    REGION_PROPOSAL_TASK_NAME,
+    DENSE_REGION_CAPTION_TASK_NAME,
+    CAPTION_TASK_NAME,
+    DETAILED_CAPTION_TASK_NAME,
+    MORE_DETAILED_CAPTION_TASK_NAME,
+    OCR_TASK_NAME,
+    OCR_WITH_REGION_TASK_NAME,
+    REGION_TO_CATEGORY_TASK_NAME,
+    REGION_TO_DESCRIPTION_TASK_NAME
+]
+TASKS = {
+    OBJECT_DETECTION_TASK_NAME: "<OD>",
+    REGION_PROPOSAL_TASK_NAME: "<REGION_PROPOSAL>",
+    DENSE_REGION_CAPTION_TASK_NAME: "<DENSE_REGION_CAPTION>",
+    CAPTION_TASK_NAME: "<CAPTION>",
+    DETAILED_CAPTION_TASK_NAME: "<DETAILED_CAPTION>",
+    MORE_DETAILED_CAPTION_TASK_NAME: "<MORE_DETAILED_CAPTION>",
+    OCR_TASK_NAME: "<OCR>",
+    OCR_WITH_REGION_TASK_NAME: "<OCR_WITH_REGION>",
+    REGION_TO_CATEGORY_TASK_NAME: "<REGION_TO_CATEGORY>",
+    REGION_TO_DESCRIPTION_TASK_NAME: "<REGION_TO_DESCRIPTION>"
+}
+IMAGE_INPUT_TASK_NAMES = [
+    OBJECT_DETECTION_TASK_NAME,
+    REGION_PROPOSAL_TASK_NAME,
+    DENSE_REGION_CAPTION_TASK_NAME,
+    CAPTION_TASK_NAME,
+    DETAILED_CAPTION_TASK_NAME,
+    MORE_DETAILED_CAPTION_TASK_NAME,
+    OCR_TASK_NAME,
+    OCR_WITH_REGION_TASK_NAME,
+]
+IMAGE_PROMPTER_INPUT_TASK_NAMES = [
+    REGION_TO_CATEGORY_TASK_NAME,
+    REGION_TO_DESCRIPTION_TASK_NAME
+]
+IMAGE_OUTPUT_TASK_NAMES = [
+    OBJECT_DETECTION_TASK_NAME,
+    REGION_PROPOSAL_TASK_NAME,
+    DENSE_REGION_CAPTION_TASK_NAME,
+    OCR_WITH_REGION_TASK_NAME,
+    REGION_TO_CATEGORY_TASK_NAME,
+    REGION_TO_DESCRIPTION_TASK_NAME
+]
+TEXTBOX_OUTPUT_TASK_NAMES = [
+    CAPTION_TASK_NAME,
+    DETAILED_CAPTION_TASK_NAME,
+    MORE_DETAILED_CAPTION_TASK_NAME,
+    OCR_TASK_NAME
+]
+IMAGE_TO_IMAGE_TASK_NAMES = [
+    OBJECT_DETECTION_TASK_NAME,
+    OCR_WITH_REGION_TASK_NAME,
+    REGION_PROPOSAL_TASK_NAME,
+    DENSE_REGION_CAPTION_TASK_NAME
+]
+IMAGE_TO_TEXT_TASK_NAMES = [
+    CAPTION_TASK_NAME,
+    DETAILED_CAPTION_TASK_NAME,
+    MORE_DETAILED_CAPTION_TASK_NAME,
+    OCR_TASK_NAME
+]
+IMAGE_PROMPT_TO_IMAGE_TASK_NAMES = [
+    REGION_TO_CATEGORY_TASK_NAME,
+    REGION_TO_DESCRIPTION_TASK_NAME
+]