prithivMLmods committed
Commit 95b8274 · verified · 1 Parent(s): 3196312
Files changed (1)
  1. app.py +377 -0
app.py ADDED
@@ -0,0 +1,377 @@
import os
import re
import torch
import gradio as gr
import numpy as np
from PIL import Image, ImageDraw
from transformers import AutoProcessor, AutoModelForImageTextToText
from typing import List, Tuple, Dict, Any

# -----------------------------------------------------------------------------
# 1. Model Setup
# -----------------------------------------------------------------------------

MODEL_ID = "allenai/Molmo2-4B"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

print(f"Loading {MODEL_ID} on {DEVICE}...")

# Load the processor. It takes neither a dtype nor a device_map; those
# kwargs belong on the model only.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)

# Load the model in bf16 on GPU (fp32 on CPU) and let accelerate place it.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=DTYPE,
    device_map="auto",
)

print("Model loaded successfully.")

# -----------------------------------------------------------------------------
# 2. Parsing Utilities (regexes for Molmo point/track output)
# -----------------------------------------------------------------------------

COORD_REGEX = re.compile(r"<(?:points|tracks).*? coords=\"([0-9\t:;, .]+)\"/?>")
FRAME_REGEX = re.compile(r"(?:^|\t|:|,|;)([0-9\.]+) ([0-9\. ]+)")
POINTS_REGEX = re.compile(r"([0-9]+) ([0-9]{3,4}) ([0-9]{3,4})")
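
# A minimal sketch of the string these regexes target, reverse-engineered from
# the parsing code rather than taken from the Molmo2 docs, so treat the exact
# attribute layout as an assumption:
#
#   <points coords="1 1 500 500 2 250 750"/>
#
# COORD_REGEX captures the coords attribute, FRAME_REGEX splits it into
# (frame_id, "idx x y ...") groups, and POINTS_REGEX reads each point as a
# point index followed by x and y scaled to a 0-1000 grid.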

def _points_from_num_str(text, image_w, image_h):
    for points in POINTS_REGEX.finditer(text):
        ix, x, y = points.group(1), points.group(2), points.group(3)
        # The points format assumes coordinates are scaled by 1000
        x, y = float(x) / 1000 * image_w, float(y) / 1000 * image_h
        if 0 <= x <= image_w and 0 <= y <= image_h:
            yield ix, x, y

def extract_multi_image_points(text, image_w, image_h, extract_ids=False):
    """Extract pointing coordinates for images."""
    all_points = []
    # Handle lists of dimensions for multi-image inputs
    if isinstance(image_w, (list, tuple)) and isinstance(image_h, (list, tuple)):
        assert len(image_w) == len(image_h)
        diff_res = True
    else:
        diff_res = False

    for coord in COORD_REGEX.finditer(text):
        for point_grp in FRAME_REGEX.finditer(coord.group(1)):
            frame_id_raw = point_grp.group(1)
            # Molmo 1-indexes images in a multi-image context
            frame_id = int(frame_id_raw) if diff_res else float(frame_id_raw)

            if diff_res:
                # Safety check for the image index
                idx_access = frame_id - 1
                if idx_access < 0 or idx_access >= len(image_w):
                    continue
                w, h = image_w[idx_access], image_h[idx_access]
            else:
                w, h = image_w, image_h

            for idx, x, y in _points_from_num_str(point_grp.group(2), w, h):
                if extract_ids:
                    all_points.append((frame_id, idx, x, y))
                else:
                    all_points.append((frame_id, x, y))
    return all_points
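
# Hedged sanity check (the coords string is a constructed sample, not real
# model output); on a single 1000x1000 image this should yield:
#
#   >>> extract_multi_image_points('<points coords="1 1 500 500 2 250 750"/>', 1000, 1000)
#   [(1.0, 500.0, 500.0), (1.0, 250.0, 750.0)]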

def extract_video_points(text, image_w, image_h, extract_ids=False):
    """Extract video pointing coordinates."""
    all_points = []
    for coord in COORD_REGEX.finditer(text):
        for point_grp in FRAME_REGEX.finditer(coord.group(1)):
            frame_id = float(point_grp.group(1))
            w, h = image_w, image_h
            for idx, x, y in _points_from_num_str(point_grp.group(2), w, h):
                if extract_ids:
                    all_points.append((frame_id, idx, x, y))
                else:
                    all_points.append((frame_id, x, y))
    return all_points
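
# Companion sketch for the video variant, again with a constructed sample:
# frame ids may be fractional timestamps, and entries are ";"-separated with
# no space after the semicolon (a space there would break FRAME_REGEX).
#
#   >>> extract_video_points('<tracks coords="0.5 1 500 500;1.5 1 520 510"/>', 1000, 1000)
#   [(0.5, 500.0, 500.0), (1.5, 520.0, 510.0)]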

# -----------------------------------------------------------------------------
# 3. Video Utilities (Standalone implementation)
# -----------------------------------------------------------------------------

def process_vision_info_custom(messages: List[Dict]) -> Tuple[Any, List[Any], Dict[str, Any]]:
    """
    Standalone replacement for molmo_utils.process_vision_info using Decord.
    Handles loading and sampling video frames.
    """
    try:
        from decord import VideoReader, cpu
    except ImportError:
        raise ImportError("Please run `pip install decord` to handle video inputs.")

    videos = []

    # Iterate through messages to find video content
    for msg in messages:
        if "content" not in msg:
            continue
        for content_item in msg["content"]:
            if content_item.get("type") == "video":
                video_path = content_item.get("video")

                # Load the video
                vr = VideoReader(video_path, ctx=cpu(0))
                total_frames = len(vr)
                fps = vr.get_avg_fps()
                width = vr[0].shape[1]
                height = vr[0].shape[0]

                # Uniformly sample up to 64 frames and hand the processor
                # plain PIL images; the Molmo2 processor handles the rest.
                num_frames_to_sample = 64
                if total_frames > num_frames_to_sample:
                    indices = np.linspace(0, total_frames - 1, num_frames_to_sample).astype(int)
                else:
                    indices = np.arange(total_frames)

                frames = vr.get_batch(indices).asnumpy()
                pil_frames = [Image.fromarray(f) for f in frames]

                video_metadata = {
                    "fps": fps,
                    "total_frames": total_frames,
                    "width": width,
                    "height": height,
                }

                videos.append((pil_frames, video_metadata))

    # Frames and metadata are returned separately and passed to the processor
    # explicitly, so video_kwargs stays empty. (The original put
    # {"videos": videos} here, which would collide with the explicit
    # videos= argument at the call site.)
    video_kwargs: Dict[str, Any] = {}
    return None, videos, video_kwargs
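
# Rough sense of the sampling above: a 640-frame clip yields 64 indices from
# np.linspace(0, 639, 64), roughly every 10th frame, always including the
# first and last frames. Shorter clips are passed through whole.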

# -----------------------------------------------------------------------------
# 4. Processing Functions
# -----------------------------------------------------------------------------

def process_images_qa(files, prompt):
    if not files:
        return "Please upload at least one image.", None

    # Load images
    pil_images = []
    try:
        for file_path in files:
            pil_images.append(Image.open(file_path).convert("RGB"))
    except Exception as e:
        return f"Error loading images: {e}", None

    # Construct the message
    content = [dict(type="text", text=prompt)]
    for img in pil_images:
        content.append(dict(type="image", image=img))

    messages = [{"role": "user", "content": content}]

    # Process
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    # Move only tensors to the model device; leave any non-tensor entries alone
    inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}

    # Generate
    with torch.inference_mode():
        generated_ids = model.generate(**inputs, max_new_tokens=512)

    generated_tokens = generated_ids[0, inputs["input_ids"].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    # Check for points
    points = extract_multi_image_points(
        generated_text,
        [img.width for img in pil_images],
        [img.height for img in pil_images],
    )

    # Visualization: draw markers on copies of any images that received points
    output_vis = pil_images[0]

    if points:
        # Create copies to draw on
        vis_images = [img.copy() for img in pil_images]
        colors = ["red", "blue", "green", "yellow", "cyan", "magenta"]

        for p in points:
            # Format: (frame_id, x, y)
            fid, x, y = p
            # Adjust the 1-based index from the output to 0-based
            img_idx = int(fid) - 1

            if 0 <= img_idx < len(vis_images):
                draw = ImageDraw.Draw(vis_images[img_idx])
                # Draw a circle with a small label
                r = 10
                color = colors[img_idx % len(colors)]
                draw.ellipse((x - r, y - r, x + r, y + r), outline=color, width=3)
                draw.text((x + r, y - r), "P", fill=color)

        # Stitch multiple images horizontally into a single Gradio output
        if len(vis_images) > 1:
            total_width = sum(img.width for img in vis_images)
            max_height = max(img.height for img in vis_images)
            combined = Image.new("RGB", (total_width, max_height))
            x_offset = 0
            for img in vis_images:
                combined.paste(img, (x_offset, 0))
                x_offset += img.width
            output_vis = combined
        else:
            output_vis = vis_images[0]

    return generated_text, output_vis

def process_video_qa(video_path, prompt):
    if not video_path:
        return "Please upload a video.", "No points detected."

    # Construct the message
    messages = [
        {
            "role": "user",
            "content": [
                dict(type="text", text=prompt),
                dict(type="video", video=video_path),
            ],
        }
    ]

    # Load and sample the video frames
    _, videos, video_kwargs = process_vision_info_custom(messages)

    # Check that the video loaded
    if not videos:
        return "Error processing video file.", ""

    videos_list, video_metadatas = zip(*videos)
    videos_list, video_metadatas = list(videos_list), list(video_metadatas)

    # Apply the chat template
    text_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Build inputs
    inputs = processor(
        videos=videos_list,
        video_metadata=video_metadatas,
        text=text_prompt,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )

    # Move only tensors to the model device; metadata entries stay as-is
    inputs = {k: v.to(model.device) if torch.is_tensor(v) else v for k, v in inputs.items()}

    # Generate
    with torch.inference_mode():
        generated_ids = model.generate(**inputs, max_new_tokens=1024)

    generated_tokens = generated_ids[0, inputs["input_ids"].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    # Point extraction
    points = extract_video_points(
        generated_text,
        image_w=video_metadatas[0]["width"],
        image_h=video_metadatas[0]["height"],
    )

    if points:
        points_str = "Detected Coordinates (Time/Frame, X, Y):\n" + "\n".join(str(p) for p in points)
    else:
        points_str = "No coordinates detected in output."

    return generated_text, points_str
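
# Shape of the points_str output for two detected points (format only — the
# numbers are illustrative, not real model output):
#
#   Detected Coordinates (Time/Frame, X, Y):
#   (0.5, 512.0, 384.0)
#   (1.5, 530.0, 390.0)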

# -----------------------------------------------------------------------------
# 5. Gradio Interface
# -----------------------------------------------------------------------------

with gr.Blocks() as demo:
    gr.Markdown("# **Molmo2-4B Multimodal Demo**")

    with gr.Tabs():

        # --- TAB 1: IMAGE QA ---
        with gr.TabItem("🖼️ Image QA & Pointing"):
            with gr.Row():
                with gr.Column():
                    img_input = gr.File(
                        label="Upload Image(s)",
                        file_count="multiple",
                        type="filepath",
                        file_types=["image"],
                    )
                    img_prompt = gr.Textbox(
                        label="Prompt",
                        placeholder="Describe this image. OR Point to the...",
                        value="Describe this image.",
                    )
                    img_btn = gr.Button("Generate", variant="primary")

                with gr.Column():
                    img_output_text = gr.Textbox(label="Response")
                    img_output_vis = gr.Image(label="Visualization (if pointing detected)")

            img_btn.click(
                fn=process_images_qa,
                inputs=[img_input, img_prompt],
                outputs=[img_output_text, img_output_vis],
            )

        # --- TAB 2: VIDEO QA ---
        with gr.TabItem("🎥 Video QA & Tracking"):
            gr.Markdown("Supports general QA, pointing, and tracking.")
            with gr.Row():
                with gr.Column():
                    vid_input = gr.Video(label="Upload Video")
                    vid_prompt = gr.Textbox(
                        label="Prompt",
                        placeholder="What happens in this video? OR Track the...",
                        value="Which animal appears in the video?",
                    )
                    vid_btn = gr.Button("Analyze Video", variant="primary")

                with gr.Column():
                    vid_output_text = gr.Textbox(label="Response")
                    vid_output_points = gr.Textbox(
                        label="Extracted Coordinates",
                        info="Format: (Frame Index, X, Y). Visualization is not supported in the web UI yet.",
                        lines=10,
                    )

            vid_btn.click(
                fn=process_video_qa,
                inputs=[vid_input, vid_prompt],
                outputs=[vid_output_text, vid_output_points],
            )

    gr.Markdown("""
    **Notes:**
    - **Image Tab:** Supports multi-image inputs. If the model points to objects, the output image shows markers. If multiple images are uploaded, they are stitched horizontally for visualization.
    - **Video Tab:** Supports general QA and temporal pointing/tracking. Coordinates are output as text.
    """)

if __name__ == "__main__":
    demo.queue().launch()
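
# To run locally (assumes the dependencies used above are installed):
#   pip install torch transformers gradio decord pillow numpy
#   python app.py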