Molmo2-HF-Demo

Runtime error

App Files Files Community

prithivMLmods committed on Dec 17, 2025

Commit

3b22d33

verified ·

1 Parent(s): 8a54b80

Update app.py

Browse files

Files changed (1) hide show

app.py +180 -265

app.py CHANGED Viewed

@@ -2,88 +2,14 @@ import gradio as gr
 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from PIL import Image, ImageDraw
-import requests
-import re
 import numpy as np
 import cv2
 import os
-import tempfile
 from molmo_utils import process_vision_info
-from typing import Iterable
-from gradio.themes import Soft
-from gradio.themes.utils import colors, fonts, sizes
-colors.orange_red = colors.Color(
-    name="orange_red",
-    c50="#FFF0E5",
-    c100="#FFE0CC",
-    c200="#FFC299",
-    c300="#FFA366",
-    c400="#FF8533",
-    c500="#FF4500",
-    c600="#E63E00",
-    c700="#CC3700",
-    c800="#B33000",
-    c900="#992900",
-    c950="#802200",
-)
-class OrangeRedTheme(Soft):
-    def __init__(
-        self,
-        *,
-        primary_hue: colors.Color | str = colors.gray,
-        secondary_hue: colors.Color | str = colors.orange_red, # Use the new color
-        neutral_hue: colors.Color | str = colors.slate,
-        text_size: sizes.Size | str = sizes.text_lg,
-        font: fonts.Font | str | Iterable[fonts.Font | str] = (
-            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
-        ),
-        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
-            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
-        ),
-    ):
-        super().__init__(
-            primary_hue=primary_hue,
-            secondary_hue=secondary_hue,
-            neutral_hue=neutral_hue,
-            text_size=text_size,
-            font=font,
-            font_mono=font_mono,
-        )
-        super().set(
-            background_fill_primary="*primary_50",
-            background_fill_primary_dark="*primary_900",
-            body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
-            body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
-            button_primary_text_color="white",
-            button_primary_text_color_hover="white",
-            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
-            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
-            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
-            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
-            button_secondary_text_color="black",
-            button_secondary_text_color_hover="white",
-            button_secondary_background_fill="linear-gradient(90deg, *primary_300, *primary_300)",
-            button_secondary_background_fill_hover="linear-gradient(90deg, *primary_400, *primary_400)",
-            button_secondary_background_fill_dark="linear-gradient(90deg, *primary_500, *primary_600)",
-            button_secondary_background_fill_hover_dark="linear-gradient(90deg, *primary_500, *primary_500)",
-            slider_color="*secondary_500",
-            slider_color_dark="*secondary_600",
-            block_title_text_weight="600",
-            block_border_width="3px",
-            block_shadow="*shadow_drop_lg",
-            button_primary_shadow="*shadow_drop_lg",
-            button_large_padding="11px",
-            color_accent_soft="*primary_100",
-            block_label_background_fill="*primary_200",
-        )
-orange_red_theme = OrangeRedTheme()
 # -----------------------------------------------------------------------------
-# 1. Model Setup
 # -----------------------------------------------------------------------------
 MODEL_ID = "allenai/Molmo2-4B"
@@ -104,158 +30,164 @@ model = AutoModelForImageTextToText.from_pretrained(
 print("Model loaded successfully.")
 # -----------------------------------------------------------------------------
-# 2. Parsing & Visualization Utilities
 # -----------------------------------------------------------------------------
 COORD_REGEX = re.compile(rf"<(?:points|tracks).*? coords=\"([0-9\t:;, .]+)\"/?>")
 FRAME_REGEX = re.compile(rf"(?:^|\t|:|,|;)([0-9\.]+) ([0-9\. ]+)")
 POINTS_REGEX = re.compile(r"([0-9]+) ([0-9]{3,4}) ([0-9]{3,4})")
 def _points_from_num_str(text, image_w, image_h):
-    """Yields (index, x, y) from the coordinate string."""
     for points in POINTS_REGEX.finditer(text):
         ix, x, y = points.group(1), points.group(2), points.group(3)
-        # Coordinates are scaled by 1000 in Molmo output
         x, y = float(x)/1000*image_w, float(y)/1000*image_h
         if 0 <= x <= image_w and 0 <= y <= image_h:
             yield ix, x, y
-def extract_multi_image_points(text, image_sizes):
-    """
-    Extracts points for multiple images.
-    image_sizes: list of (width, height) tuples corresponding to the images.
-    Returns: list of (image_index, x, y)
-    """
     all_points = []
-    # Check if we have multiple resolutions or single
-    diff_res = True # Molmo usually treats multi-image inputs as distinct frames/indices
     for coord in COORD_REGEX.finditer(text):
         for point_grp in FRAME_REGEX.finditer(coord.group(1)):
-            # frame_id is 1-based index for images in multi-image context
-            frame_id_raw = float(point_grp.group(1))
-            frame_idx = int(frame_id_raw) - 1
-            if 0 <= frame_idx < len(image_sizes):
-                w, h = image_sizes[frame_idx]
-                for _, x, y in _points_from_num_str(point_grp.group(2), w, h):
-                    all_points.append((frame_idx, x, y))
     return all_points
-def extract_video_points(text, image_w, image_h):
-    """
-    Extracts video points.
-    Returns: list of (time_or_frame_float, x, y)
-    """
     all_points = []
     for coord in COORD_REGEX.finditer(text):
         for point_grp in FRAME_REGEX.finditer(coord.group(1)):
-            frame_id = float(point_grp.group(1))
-            for _, x, y in _points_from_num_str(point_grp.group(2), image_w, image_h):
-                all_points.append((frame_id, x, y))
     return all_points
 def draw_points_on_images(images, points):
     """Draws points on a list of PIL Images."""
     annotated_images = [img.copy() for img in images]
-    draws = [ImageDraw.Draw(img) for img in annotated_images]
-    # Colors for visualization
-    color = "red"
-    radius = 5
-    for (img_idx, x, y) in points:
-        if 0 <= img_idx < len(draws):
-            draws[img_idx].ellipse((x - radius, y - radius, x + radius, y + radius), fill=color, outline="white")
     return annotated_images
-def draw_points_on_video(video_path, points, original_w, original_h):
     """
-    Overlay points on video.
-    Note: Molmo outputs time/frame info. Mapping exact frames can be tricky depending on how Molmo sampled them.
-    This is a best-effort visualization assuming frame_id loosely maps to seconds or sequence.
     """
     cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    # Create temp output file
-    temp_out = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False).name
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    out = cv2.VideoWriter(temp_out, fourcc, fps, (width, height))
-    # Group points by frame/time for faster lookup
-    # Molmo frame_id is often the index of the sampled frame.
-    # For robust visualization, we'd need to know exactly which frames Molmo sampled.
-    # Here, we will try to match based on the assumption that points come with a timestamp or frame index.
-    # If points are sparse, we might want to "hold" the point for a few frames.
-    frame_idx = 0
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
             break
-        # Current time in seconds
-        current_time = frame_idx / fps
-        # Simple Logic: Check if any point exists closely to this frame/time
-        # Molmo video output usually uses frame indices relative to the *sampled* frames,
-        # but sometimes outputs timestamps. For this demo, we'll draw purely if we find a match
-        # in the raw output or if it's a tracking task.
-        for (p_time, px, py) in points:
-            # Map coordinates from model resolution (original_w) to video resolution (width)
-            scale_x = width / original_w
-            scale_y = height / original_h
-            final_x = int(px * scale_x)
-            final_y = int(py * scale_y)
-            # Heuristic: if p_time is close to current_time (assuming p_time is seconds)
-            # OR if p_time is an integer close to the frame index (if it's frame count).
-            # Molmo utils usually samples roughly 1fps or specific clips.
-            # Let's assume p_time refers to the sampled frame index.
-            # To simplify for the demo: We will draw all points found for 'approximately' this moment.
-            # In a real production app, you need the `video_kwargs` mapping from process_vision_info.
-            # Draw a circle
-            cv2.circle(frame, (final_x, final_y), 10, (0, 0, 255), -1)
-            cv2.circle(frame, (final_x, final_y), 10, (255, 255, 255), 2)
         out.write(frame)
-        frame_idx += 1
     cap.release()
     out.release()
-    return temp_out
 # -----------------------------------------------------------------------------
-# 3. Inference Functions
 # -----------------------------------------------------------------------------
-def process_images(image_files, prompt):
-    if not image_files:
-        return "Please upload an image.", []
-    # Load images
-    images = [Image.open(f).convert("RGB") for f in image_files]
-    # Construct Message
-    content = [{"type": "text", "text": prompt}]
-    for img in images:
-        content.append({"type": "image", "image": img})
     messages = [{"role": "user", "content": content}]
-    # Inputs
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
@@ -264,48 +196,52 @@ def process_images(image_files, prompt):
         return_dict=True,
     )
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     # Generate
     with torch.inference_mode():
         generated_ids = model.generate(**inputs, max_new_tokens=1024)
     generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
     generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
     # Check for points
-    image_sizes = [(img.width, img.height) for img in images]
-    points = extract_multi_image_points(generated_text, image_sizes)
-    annotated_images = []
     if points:
-        annotated_images = draw_points_on_images(images, points)
-        return generated_text, annotated_images
-    else:
-        return generated_text, images
-def process_video(video_file, prompt, task_type):
-    if not video_file:
         return "Please upload a video.", None
-    # Construct Message
     messages = [
         {
             "role": "user",
             "content": [
-                dict(type="text", text=prompt),
-                dict(type="video", video=video_file), # helper handles file path or url
             ],
         }
     ]
-    # Process Vision Info (Crucial for Video)
     _, videos, video_kwargs = process_vision_info(messages)
     videos, video_metadatas = zip(*videos)
     videos, video_metadatas = list(videos), list(video_metadatas)
-    # Apply Template
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         videos=videos,
         video_metadata=video_metadatas,
@@ -315,99 +251,78 @@ def process_video(video_file, prompt, task_type):
         **video_kwargs,
     )
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     # Generate
     with torch.inference_mode():
         generated_ids = model.generate(**inputs, max_new_tokens=2048)
     generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
     generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
-    # Visualization logic
-    output_video_path = None
-    # If the text contains coordinates, we try to extract and visualize
-    if "coords=" in generated_text:
-        try:
-            # Extract points
-            w = video_metadatas[0]["width"]
-            h = video_metadatas[0]["height"]
-            points = extract_video_points(generated_text, w, h)
-            if points:
-                # We attempt to draw on the original video
-                # Note: This is a basic overlay. Molmo's temporal alignment is complex.
-                # In a full app, you might only draw on specific frames or returned keyframes.
-                # Here we return the original video if we can't process, or the processed one.
-                # For demonstration, we just return the original video to avoid long processing times
-                # unless you implement the full CV2 write loop efficiently.
-                # Uncomment to enable full video processing (might be slow)
-                # output_video_path = draw_points_on_video(video_file, points, w, h)
-                output_video_path = video_file # Placeholder: return original
-        except Exception as e:
-            print(f"Error visualizing video: {e}")
-            output_video_path = video_file
-    else:
-        output_video_path = video_file
-    return generated_text, output_video_path
 # -----------------------------------------------------------------------------
-# 4. Gradio Interface
 # -----------------------------------------------------------------------------
-with gr.Blocks() as demo:
-    gr.Markdown("# **Molmo2-8B Multimodal Demo**")
-    gr.Markdown("Supports Single/Multi-Image QA, Pointing, and Video QA (General, Pointing, Tracking).")
     with gr.Tabs():
-        # --- Tab 1: Image QA ---
-        with gr.Tab("📷 Image QA & Pointing"):
             with gr.Row():
-                with gr.Column(scale=1):
-                    img_input = gr.Gallery(label="Upload Image(s)", type="filepath", columns=2)
-                    img_prompt = gr.Textbox(label="Prompt", placeholder="Describe this image... or Point to the cat.", value="Describe this image.")
-                    img_btn = gr.Button("Generate", variant="primary")
-                with gr.Column(scale=1):
-                    img_output_text = gr.Markdown(label="Response")
-                    img_output_vis = gr.Gallery(label="Visualization (if applicable)")
             img_btn.click(
-                process_images,
-                inputs=[img_input, img_prompt],
-                outputs=[img_output_text, img_output_vis]
             )
-        # --- Tab 2: Video QA ---
-        with gr.Tab("🎥 Video QA & Tracking"):
             with gr.Row():
-                with gr.Column(scale=1):
-                    vid_input = gr.Video(label="Upload Video", sources=["upload"])
-                    vid_prompt = gr.Textbox(label="Prompt", placeholder="What is happening? or Track the ball.", value="What is happening in this video?")
-                    # We treat all video tasks via the same prompt mechanism,
-                    # but visualizer behavior might change based on detection of coordinates.
-                    vid_task = gr.Radio(["General QA", "Pointing/Tracking"], label="Task Type", value="General QA", visible=False)
-                    vid_btn = gr.Button("Generate", variant="primary")
-                with gr.Column(scale=1):
-                    vid_output_text = gr.Markdown(label="Response")
-                    # Note: Full video visualization in real-time is heavy.
-                    # The code returns the video path.
-                    vid_output_vis = gr.Video(label="Output Video")
             vid_btn.click(
-                process_video,
-                inputs=[vid_input, vid_prompt, vid_task],
-                outputs=[vid_output_text, vid_output_vis]
             )
-    gr.Markdown("""
-    **Note:**
-    - For Pointing/Tracking, include keywords like "Point to..." or "Track..." in your prompt.
-    - Video processing for visualization is computationally expensive; this demo may return the text response quickly but the video visualization might require custom implementation logic for perfect frame alignment.
-    """)
 if __name__ == "__main__":
-    demo.queue().launch(theme=orange_red_theme, mcp_server=True, ssr_mode=False, show_error=True)

 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from PIL import Image, ImageDraw
 import numpy as np
 import cv2
+import re
 import os
 from molmo_utils import process_vision_info
 # -----------------------------------------------------------------------------
+# 1. Model & Processor Setup
 # -----------------------------------------------------------------------------
 MODEL_ID = "allenai/Molmo2-4B"
 print("Model loaded successfully.")
 # -----------------------------------------------------------------------------
+# 2. Parsing Utilities (From provided snippets)
 # -----------------------------------------------------------------------------
 COORD_REGEX = re.compile(rf"<(?:points|tracks).*? coords=\"([0-9\t:;, .]+)\"/?>")
 FRAME_REGEX = re.compile(rf"(?:^|\t|:|,|;)([0-9\.]+) ([0-9\. ]+)")
 POINTS_REGEX = re.compile(r"([0-9]+) ([0-9]{3,4}) ([0-9]{3,4})")
 def _points_from_num_str(text, image_w, image_h):
     for points in POINTS_REGEX.finditer(text):
         ix, x, y = points.group(1), points.group(2), points.group(3)
+        # our points format assume coordinates are scaled by 1000
         x, y = float(x)/1000*image_w, float(y)/1000*image_h
         if 0 <= x <= image_w and 0 <= y <= image_h:
             yield ix, x, y
+def extract_multi_image_points(text, image_w, image_h, extract_ids=False):
+    """Extract pointing coordinates for images."""
     all_points = []
+    # Handle list of dimensions for multi-image
+    if isinstance(image_w, (list, tuple)) and isinstance(image_h, (list, tuple)):
+        assert len(image_w) == len(image_h)
+        diff_res = True
+    else:
+        diff_res = False
     for coord in COORD_REGEX.finditer(text):
         for point_grp in FRAME_REGEX.finditer(coord.group(1)):
+            # For images, frame_id corresponds to the image index (1-based in text usually, but we need to check)
+            frame_id = int(point_grp.group(1)) if diff_res else float(point_grp.group(1))
+            if diff_res:
+                # bounds check
+                idx = int(frame_id) - 1
+                if 0 <= idx < len(image_w):
+                    w, h = (image_w[idx], image_h[idx])
+                else:
+                    continue
+            else:
+                w, h = (image_w, image_h)
+            for idx, x, y in _points_from_num_str(point_grp.group(2), w, h):
+                if extract_ids:
+                    all_points.append((frame_id, idx, x, y))
+                else:
+                    all_points.append((frame_id, x, y))
     return all_points
+def extract_video_points(text, image_w, image_h, extract_ids=False):
+    """Extract video pointing coordinates (t, x, y)."""
     all_points = []
     for coord in COORD_REGEX.finditer(text):
         for point_grp in FRAME_REGEX.finditer(coord.group(1)):
+            frame_id = float(point_grp.group(1)) # This is usually timestamp in seconds or frame index
+            w, h = (image_w, image_h)
+            for idx, x, y in _points_from_num_str(point_grp.group(2), w, h):
+                if extract_ids:
+                    all_points.append((frame_id, idx, x, y))
+                else:
+                    all_points.append((frame_id, x, y))
     return all_points
# -----------------------------------------------------------------------------
# 3. Visualization Utilities
# -----------------------------------------------------------------------------
def draw_points_on_images(images, points):
    """Return copies of *images* with every point marked by a red ring and label.

    Each entry of *points* is read as ``(image_number, x, y)``, where
    ``image_number`` is 1-based. Entries that refer to an image outside the
    list are ignored. The input images are not modified.
    """
    canvases = [source.copy() for source in images]
    radius = 10
    for point in points:
        slot = int(point[0]) - 1  # 1-based image number -> 0-based list position
        cx, cy = point[1], point[2]
        if slot < 0 or slot >= len(canvases):
            continue
        pen = ImageDraw.Draw(canvases[slot])
        # Red ring around the point, then a text label to its right.
        pen.ellipse((cx - radius, cy - radius, cx + radius, cy + radius), outline="red", width=3)
        pen.text((cx + radius, cy), "target", fill="red")
    return canvases
def draw_points_on_video(video_path, points, original_width, original_height):
    """Write a copy of the video with the given points drawn on matching frames.

    Args:
        video_path: Path of the input video, opened with ``cv2.VideoCapture``.
        points: Iterable of ``(t, x, y)``. ``t`` is multiplied by the video's
            fps and rounded to select the frame the point is drawn on.
        original_width: Width the x values refer to; used to rescale them to
            the video file's frame width.
        original_height: Height the y values refer to; used likewise.

    Returns:
        Path of the annotated ``.mp4`` file, written to a unique temporary
        file. If the input cannot be read, reports a non-positive fps or
        size, the reference size is non-positive, or the writer cannot be
        opened, ``video_path`` is returned unchanged.
    """
    # Local imports: these modules are not imported at file level.
    import contextlib
    import tempfile
    from collections import defaultdict

    cap = cv2.VideoCapture(video_path)
    out = None
    output_path = None
    finished = False
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        vid_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        vid_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        usable = (
            cap.isOpened()
            and fps > 0
            and vid_w > 0
            and vid_h > 0
            and original_width > 0
            and original_height > 0
        )
        if not usable:
            # Nothing sensible can be written; also avoids dividing by zero below.
            return video_path

        # Rescale from the size the coordinates refer to, to the file's frame size.
        scale_x = vid_w / original_width
        scale_y = vid_h / original_height

        # Group points by the frame index they land on for O(1) lookup per frame.
        points_by_frame = defaultdict(list)
        for t, x, y in points:
            points_by_frame[int(round(t * fps))].append((x * scale_x, y * scale_y))

        # A unique file per call, so simultaneous requests cannot overwrite
        # each other's output.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
            output_path = tmp.name
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (vid_w, vid_h))
        if not out.isOpened():
            return video_path

        current_frame = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Simple approach: draw only on the exact matching frame.
            for px, py in points_by_frame.get(current_frame, ()):
                center = (int(px), int(py))
                cv2.circle(frame, center, 10, (0, 0, 255), -1)
                cv2.circle(frame, center, 12, (255, 255, 255), 2)
            out.write(frame)
            current_frame += 1
        finished = True
        return output_path
    finally:
        # Always release native handles, including on early return or error.
        cap.release()
        if out is not None:
            out.release()
        if output_path is not None and not finished:
            # Do not leave a partial temp file behind.
            with contextlib.suppress(OSError):
                os.remove(output_path)
 # -----------------------------------------------------------------------------
+# 4. Logic Handlers
 # -----------------------------------------------------------------------------
+def process_images(user_text, input_images):
+    if not input_images:
+        return "Please upload at least one image.", None
+    # input_images from Gradio Gallery is a list of (path, caption) tuples
+    # OR a list of paths depending on type. We requested 'filepath' type in Gradio.
+    pil_images = []
+    for img_path in input_images:
+        # If type='filepath' in Gallery, img_path is just the string path
+        # If using old gradio versions it might be a tuple.
+        if isinstance(img_path, tuple):
+            img_path = img_path[0]
+        pil_images.append(Image.open(img_path).convert("RGB"))
+    # Construct messages
+    content = [dict(type="text", text=user_text)]
+    for img in pil_images:
+        content.append(dict(type="image", image=img))
     messages = [{"role": "user", "content": content}]
+    # Process inputs
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
         return_dict=True,
     )
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     # Generate
     with torch.inference_mode():
         generated_ids = model.generate(**inputs, max_new_tokens=1024)
     generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
     generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
     # Check for points
+    widths = [img.width for img in pil_images]
+    heights = [img.height for img in pil_images]
+    points = extract_multi_image_points(generated_text, widths, heights)
+    output_gallery = pil_images
     if points:
+        output_gallery = draw_points_on_images(pil_images, points)
+    return generated_text, output_gallery
+def process_video(user_text, video_path):
+    if not video_path:
         return "Please upload a video.", None
+    # Construct messages
+    # Note: Molmo expects a URL or a path it can read.
     messages = [
         {
             "role": "user",
             "content": [
+                dict(type="text", text=user_text),
+                dict(type="video", video=video_path),
             ],
         }
     ]
+    # Process Vision Info (Molmo Utils)
+    # This samples the video and prepares tensors
     _, videos, video_kwargs = process_vision_info(messages)
     videos, video_metadatas = zip(*videos)
     videos, video_metadatas = list(videos), list(video_metadatas)
+    # Chat Template
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Inputs
     inputs = processor(
         videos=videos,
         video_metadata=video_metadatas,
         **video_kwargs,
     )
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     # Generate
     with torch.inference_mode():
         generated_ids = model.generate(**inputs, max_new_tokens=2048)
     generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
     generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+    # Point/Track processing
+    vid_meta = video_metadatas[0] # Assuming single video
+    points = extract_video_points(generated_text, image_w=vid_meta["width"], image_h=vid_meta["height"])
+    annotated_video_path = None
+    if points:
+        print(f"Found {len(points)} points/track-coords. Annotating video...")
+        annotated_video_path = draw_points_on_video(
+            video_path,
+            points,
+            original_width=vid_meta["width"],
+            original_height=vid_meta["height"]
+        )
+    # Return original video if no points found, otherwise annotated
+    out_vid = annotated_video_path if annotated_video_path else video_path
+    return generated_text, out_vid
 # -----------------------------------------------------------------------------
+# 5. Gradio UI Layout
 # -----------------------------------------------------------------------------
# Short checkpoint name (the text after the last "/" in MODEL_ID), used for the
# UI titles so they always name the checkpoint that is actually loaded.
MODEL_NAME = MODEL_ID.split("/")[-1]

with gr.Blocks(title=f"{MODEL_NAME} Demo") as demo:
    gr.Markdown(f"# {MODEL_NAME}: Multimodal Open Source Model")
    gr.Markdown("Supports Multi-image QA, Pointing, General Video QA, and Tracking.")
    with gr.Tabs():
        # --- TAB 1: IMAGES ---
        with gr.Tab("Images (QA & Pointing)"):
            with gr.Row():
                with gr.Column():
                    img_input = gr.Gallery(label="Input Images", type="filepath")
                    img_prompt = gr.Textbox(label="Prompt", placeholder="e.g. 'Describe this' or 'Point to the boats'")
                    img_btn = gr.Button("Run Image Analysis", variant="primary")
                with gr.Column():
                    img_text_out = gr.Textbox(label="Generated Text")
                    img_out = gr.Gallery(label="Annotated Images")
            # Input order matches process_images(user_text, input_images).
            img_btn.click(
                fn=process_images,
                inputs=[img_prompt, img_input],
                outputs=[img_text_out, img_out]
            )
        # --- TAB 2: VIDEO ---
        with gr.Tab("Video (QA, Pointing & Tracking)"):
            gr.Markdown("**Note:** Video processing takes longer as frames are sampled.")
            with gr.Row():
                with gr.Column():
                    vid_input = gr.Video(label="Input Video", format="mp4")
                    vid_prompt = gr.Textbox(label="Prompt", placeholder="e.g. 'What is happening?' or 'Track the player'")
                    vid_btn = gr.Button("Run Video Analysis", variant="primary")
                with gr.Column():
                    vid_text_out = gr.Textbox(label="Generated Text")
                    vid_out = gr.Video(label="Output Video (Annotated if applicable)")
            # Input order matches process_video(user_text, video_path).
            vid_btn.click(
                fn=process_video,
                inputs=[vid_prompt, vid_input],
                outputs=[vid_text_out, vid_out]
            )

if __name__ == "__main__":
    # NOTE(review): share=True requests a public share link; confirm this is
    # intended for the hosting environment.
    demo.launch(server_name="0.0.0.0", share=True)