grounded-vqa

Paused

App Files Community

vikhyatk committed on Sep 19, 2025

Commit

8d0b547

verified ·

1 Parent(s): 7e87351

Update app.py

Browse files

Files changed (1) hide show

app.py +3 -151

app.py CHANGED Viewed

@@ -42,165 +42,17 @@ os.environ["HF_TOKEN"] = os.environ.get("TOKEN_FROM_SECRET") or True
 moondream = AutoModelForCausalLM.from_pretrained(
     "vikhyatk/moondream-next",
     trust_remote_code=True,
-    torch_dtype=torch.bfloat16,
     device_map={"": "cuda"},
     revision=REVISION
 )
 moondream.eval()
-def convert_to_entities(text, coords):
-    """
-    Converts a string with special markers into an entity representation.
-    Markers:
-    - <|coord|> pairs indicate coordinate markers
-    - <|start_ground_points|> indicates the start of grounding
-    - <|start_ground_text|> indicates the start of a ground term
-    - <|end_ground|> indicates the end of a ground term
-    Returns:
-    - Dictionary with cleaned text and entities with their character positions
-    """
-    # Initialize variables
-    cleaned_text = ""
-    entities = []
-    entity = []
-    # Track current position in cleaned text
-    current_pos = 0
-    # Track if we're currently processing an entity
-    in_entity = False
-    entity_start = 0
-    i = 0
-    while i < len(text):
-        # Check for markers
-        if text[i : i + 9] == "<|coord|>":
-            i += 9
-            entity.append(coords.pop(0))
-            continue
-        elif text[i : i + 23] == "<|start_ground_points|>":
-            in_entity = True
-            entity_start = current_pos
-            i += 23
-            continue
-        elif text[i : i + 21] == "<|start_ground_text|>":
-            entity_start = current_pos
-            i += 21
-            continue
-        elif text[i : i + 14] == "<|end_ground|>":
-            # Store entity position
-            entities.append(
-                {
-                    "entity": json.dumps(entity),
-                    "start": entity_start,
-                    "end": current_pos,
-                }
-            )
-            entity = []
-            in_entity = False
-            i += 14
-            continue
-        # Add character to cleaned text
-        cleaned_text += text[i]
-        current_pos += 1
-        i += 1
-    return {"text": cleaned_text, "entities": entities}
-@spaces.GPU(duration=30)
-def answer_question(img, prompt, reasoning):
-    buffer = ""
-    resp = moondream.query(img, prompt, stream=True, reasoning=reasoning)
-    reasoning_text = resp["reasoning"]["text"] if reasoning else "[reasoning disabled]"
-    entities = [
-        {"start": g["start_idx"], "end": g["end_idx"], "entity": json.dumps(g["points"])}
-        for g in resp["reasoning"]["grounding"]
-    ] if reasoning else []
-    for new_text in resp["answer"]:
-        buffer += new_text
-        yield buffer.strip(), {"text": reasoning_text, "entities": entities}
-@spaces.GPU(duration=10)
-def caption(img, mode):
-    if img is None:
-        yield ""
-        return
-    buffer = ""
-    if mode == "Short":
-        l = "short"
-    elif mode == "Long":
-        l = "long"
-    else:
-        l = "normal"
-    for t in moondream.caption(img, length=l, stream=True)["caption"]:
-        buffer += t
-        yield buffer.strip()
-@spaces.GPU(duration=10)
-def detect(img, object, eos_bias):
-    if img is None:
-        yield "", gr.update(visible=False, value=None)
-        return
-    eos_bias = float(eos_bias)
-    objs = moondream.detect(img, object, settings={"eos_bias": eos_bias})["objects"]
-    w, h = img.size
-    if w > 768 or h > 768:
-        img = Resize(768)(img)
-        w, h = img.size
-    draw_image = ImageDraw.Draw(img)
-    for o in objs:
-        draw_image.rectangle(
-            (o["x_min"] * w, o["y_min"] * h, o["x_max"] * w, o["y_max"] * h),
-            outline="red",
-            width=3,
-        )
-    yield {"text": f"{len(objs)} detected", "entities": []}, gr.update(
-        visible=True, value=img
-    )
-@spaces.GPU(duration=10)
-def point(img, object):
-    if img is None:
-        yield "", gr.update(visible=False, value=None)
-        return
-    w, h = img.size
-    if w > 768 or h > 768:
-        img = Resize(768)(img)
-        w, h = img.size
-    objs = moondream.point(img, object, settings={"max_objects": 200})["points"]
-    draw_image = ImageDraw.Draw(img)
-    for o in objs:
-        draw_image.ellipse(
-            (o["x"] * w - 5, o["y"] * h - 5, o["x"] * w + 5, o["y"] * h + 5),
-            fill="red",
-            outline="blue",
-            width=2,
-        )
-    yield {"text": f"{len(objs)} detected", "entities": []}, gr.update(
-        visible=True, value=img
-    )
 @spaces.GPU(duration=10)
 def localized_query(img, x, y, question):
     if img is None:
-        yield "", {"text": "", "entities": []}, gr.update(visible=False, value=None)
         return
     answer = moondream.query(img, question, spatial_refs=[(x, y)])["answer"]
@@ -277,7 +129,7 @@ with gr.Blocks(title="moondream vl (new)", css=css, js=js) as demo:
         with gr.Column():
             output = gr.Markdown(label="Response", elem_classes=["output-text"], line_breaks=True)
-            ann = gr.Image(visible=False)
 demo.queue().launch()

 moondream = AutoModelForCausalLM.from_pretrained(
     "vikhyatk/moondream-next",
     trust_remote_code=True,
+    dtype=torch.bfloat16,
     device_map={"": "cuda"},
     revision=REVISION
 )
 moondream.eval()
 @spaces.GPU(duration=10)
 def localized_query(img, x, y, question):
     if img is None:
+        yield "", gr.update(visible=False, value=None)
         return
     answer = moondream.query(img, question, spatial_refs=[(x, y)])["answer"]
         with gr.Column():
             output = gr.Markdown(label="Response", elem_classes=["output-text"], line_breaks=True)
+            ann = gr.Image(visible=False, watermark="Click on the image on the right, not here")
 demo.queue().launch()