Kosmos-2-API

Running on Zero

App Files Files Community

[Admin maintenance] Migrate to ZeroGPU

by multimodalart HF Staff - opened May 25

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+76

-72

Files changed (1) hide show

app.py +76 -72

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 import random
 import numpy as np
@@ -208,92 +209,95 @@ def draw_entity_boxes_on_image(image, entities, show=False, save_path=None, enti
     return pil_image
-def main():
-    ckpt = "microsoft/kosmos-2-patch14-224"
-    model = AutoModelForVision2Seq.from_pretrained(ckpt).to("cuda")
-    processor = AutoProcessor.from_pretrained(ckpt)
-    def generate_predictions(image_input, text_input):
-        """
-        Generate a grounded image description and annotated entity boxes with Kosmos-2.
-        Use this tool when you need to describe an image and identify grounded visual entities.
-        Args:
-            image_input (PIL.Image.Image): Input image to describe and ground.
-            text_input (str): Description mode, either "Brief" or "Detailed".
-        Returns:
-            tuple: Annotated image, highlighted generated description, and serialized entity data.
-        """
-        # Save the image and load it again to match the original Kosmos-2 demo.
-        # (https://github.com/microsoft/unilm/blob/f4695ed0244a275201fff00bee495f76670fbe70/kosmos-2/demo/gradio_app.py#L345-L346)
-        user_image_path = "/tmp/user_input_test_image.jpg"
-        image_input.save(user_image_path)
-        # This might give different results from the original argument `image_input`
-        image_input = Image.open(user_image_path)
-        if text_input == "Brief":
-            text_input = "An image of"
-        elif text_input == "Detailed":
-            text_input = "Describe this image in detail:"
-        else:
-            text_input = f"{text_input}"
-        inputs = processor(text=text_input, images=image_input, return_tensors="pt").to(
-            "cuda"
-        )
-        generated_ids = model.generate(
-            pixel_values=inputs["pixel_values"],
-            input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],
-            image_embeds=None,
-            image_embeds_position_mask=inputs["image_embeds_position_mask"],
-            use_cache=True,
-            max_new_tokens=128,
-        )
-        generated_text = processor.batch_decode(
-            generated_ids, skip_special_tokens=True
-        )[0]
-        # By default, the generated text is cleanup and the entities are extracted.
-        processed_text, entities = processor.post_process_generation(generated_text)
-        annotated_image = draw_entity_boxes_on_image(image_input, entities, show=False)
-        color_id = -1
-        entity_info = []
-        filtered_entities = []
-        for entity in entities:
-            entity_name, (start, end), bboxes = entity
-            if start == end:
-                # skip bounding bbox without a `phrase` associated
-                continue
-            color_id += 1
-            entity_info.append(((start, end), color_id))
-            filtered_entities.append(entity)
-        colored_text = []
-        prev_start = 0
-        end = 0
-        for idx, ((start, end), color_id) in enumerate(entity_info):
-            if start > prev_start:
-                colored_text.append((processed_text[prev_start:start], None))
-            colored_text.append((processed_text[start:end], f"{color_id}"))
-            prev_start = end
-        if end < len(processed_text):
-            colored_text.append((processed_text[end : len(processed_text)], None))
-        return annotated_image, colored_text, str(filtered_entities)
     term_of_use = """
 ### Terms of use

+import spaces
 import gradio as gr
 import random
 import numpy as np
     return pil_image
+# Load the Kosmos-2 checkpoint and its processor once at import time; `generate_predictions`
+# below reads these module-level objects on every call instead of reloading them.
+# NOTE(review): moving the model to "cuda" at module level is presumably the ZeroGPU
+# convention (GPU is attached only inside `@spaces.GPU` functions) — confirm against the `spaces` docs.
+ckpt = "microsoft/kosmos-2-patch14-224"
+model = AutoModelForVision2Seq.from_pretrained(ckpt).to("cuda")
+processor = AutoProcessor.from_pretrained(ckpt)
+@spaces.GPU
+def generate_predictions(image_input, text_input):
+    """
+    Generate a grounded image description and annotated entity boxes with Kosmos-2.
+    Use this tool when you need to describe an image and identify grounded visual entities.
+    Args:
+        image_input (PIL.Image.Image): Input image to describe and ground.
+        text_input (str): Description mode, either "Brief" or "Detailed".
+    Returns:
+        tuple: Annotated image, highlighted generated description, and serialized entity data.
+    """
+    # Save the image and load it again to match the original Kosmos-2 demo.
+    # (https://github.com/microsoft/unilm/blob/f4695ed0244a275201fff00bee495f76670fbe70/kosmos-2/demo/gradio_app.py#L345-L346)
+    user_image_path = "/tmp/user_input_test_image.jpg"
+    image_input.save(user_image_path)
+    # This might give different results from the original argument `image_input`
+    image_input = Image.open(user_image_path)
+    if text_input == "Brief":
+        text_input = "An image of"
+    elif text_input == "Detailed":
+        text_input = "Describe this image in detail:"
+    else:
+        text_input = f"{text_input}"
+    inputs = processor(text=text_input, images=image_input, return_tensors="pt").to(
+        "cuda"
+    )
+    generated_ids = model.generate(
+        pixel_values=inputs["pixel_values"],
+        input_ids=inputs["input_ids"],
+        attention_mask=inputs["attention_mask"],
+        image_embeds=None,
+        image_embeds_position_mask=inputs["image_embeds_position_mask"],
+        use_cache=True,
+        max_new_tokens=128,
+    )
+    generated_text = processor.batch_decode(
+        generated_ids, skip_special_tokens=True
+    )[0]
+    # By default, the generated text is cleanup and the entities are extracted.
+    processed_text, entities = processor.post_process_generation(generated_text)
+    annotated_image = draw_entity_boxes_on_image(image_input, entities, show=False)
+    color_id = -1
+    entity_info = []
+    filtered_entities = []
+    for entity in entities:
+        entity_name, (start, end), bboxes = entity
+        if start == end:
+            # skip bounding bbox without a `phrase` associated
+            continue
+        color_id += 1
+        entity_info.append(((start, end), color_id))
+        filtered_entities.append(entity)
+    colored_text = []
+    prev_start = 0
+    end = 0
+    for idx, ((start, end), color_id) in enumerate(entity_info):
+        if start > prev_start:
+            colored_text.append((processed_text[prev_start:start], None))
+        colored_text.append((processed_text[start:end], f"{color_id}"))
+        prev_start = end
+    if end < len(processed_text):
+        colored_text.append((processed_text[end : len(processed_text)], None))
+    return annotated_image, colored_text, str(filtered_entities)
+def main():
     term_of_use = """
 ### Terms of use