Spaces:

aliangdw
/

trace_visualizer

Sleeping

App Files Files Community

Anthony Liang committed on Feb 3

Commit

7c21061

1 Parent(s): 4e80be3

add prediction app and script for running inference on trained model

Browse files

Files changed (6) hide show

.gitignore +10 -0
README.md +46 -1
app.py +338 -0
predict_trace.py +99 -0
requirements.txt +7 -0
trajectory_viz.py +135 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+*.png
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.pyw
+*.pyz
+*.pywz
+*.pyzw
+*.pyzwz

README.md CHANGED Viewed

@@ -9,4 +9,49 @@ app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 ---
+# Trace Model Visualizer
+Gradio app for visualizing trace/trajectory predictions from [mihirgrao/trace-model](https://huggingface.co/mihirgrao/trace-model).
+## Features
+- **Image input**: Upload an image
+- **Trace prediction**: Model predicts trajectory points from the image
+- **Visual overlay**: Trace is overlaid on the image with gradient coloring (green start → red end)
+- **Coordinate output**: Predicted trace points are printed below
+## Installation
+```bash
+pip install -r requirements.txt
+```
+## Usage
+### Gradio app
+```bash
+python app.py
+```
+Then open the URL (default: http://localhost:7860).
+1. Click **Load Model** to load the trace model (first run downloads from Hugging Face)
+2. Upload an image and optionally enter a task instruction (e.g. "Pick up the red block")
+3. Click **Run Inference**
+4. View the overlay image and predicted trace points
+### CLI script
+```bash
+python predict_trace.py image.png
+python predict_trace.py image.png -i "Pick up the red block"
+python predict_trace.py image.png -o output_trace.png -i "Stack the cube on the block"
+python predict_trace.py image.png -o output.png -m mihirgrao/trace-model
+```
+- `image` – Path to input image
+- `-i, --instruction` – Task / language instruction (e.g. "Pick up the red block")
+- `-o, --output` – Where to save the overlay (default: `<image>_trace.png`)
+- `-m, --model-id` – Model ID (default: mihirgrao/trace-model)
+- `-p, --prompt` – Full prompt override (if set, ignores `-i`)

app.py ADDED Viewed

	@@ -0,0 +1,338 @@

+#!/usr/bin/env python3
+"""
+Gradio app for Trace Model inference visualization.
+Takes an image, runs the trace model to predict trajectory points,
+overlays the trace on the image, and displays the predicted coordinates.
+Model: https://huggingface.co/mihirgrao/trace-model
+"""
+import os
+import tempfile
+import logging
+from typing import Optional, Tuple
+import gradio as gr
+import numpy as np
+import torch
+from PIL import Image
+from transformers import AutoModelForImageTextToText, AutoProcessor
+from trajectory_viz import extract_trajectory_from_text, visualize_trajectory_on_image
+try:
+    from qwen_vl_utils import process_vision_info
+except ImportError:
+    process_vision_info = None
+# Module-level logger; load_model() and run_inference() report failures through it.
+logger = logging.getLogger(__name__)
+# Default model path (Hugging Face Hub). Used as the default argument of
+# load_model() and as the initial value of the "Model ID" textbox.
+DEFAULT_MODEL_ID = "mihirgrao/trace-model"
+# Trace format instruction (always part of the prompt): build_prompt() returns
+# it on its own when no task instruction is given and appends it after the
+# "Task: ..." line otherwise.
+TRACE_FORMAT = (
+    "Predict the trajectory or trace in this image. "
+    "Output the coordinates as a list of [x, y] pairs, e.g. [[0.1, 0.2], [0.3, 0.4], ...]. "
+    "Use normalized coordinates between 0 and 1."
+)
+def build_prompt(instruction: str = "") -> str:
+    """Build the full prompt from task instruction + trace format."""
+    if instruction.strip():
+        return f"Task: {instruction.strip()}\n\n{TRACE_FORMAT}"
+    return TRACE_FORMAT
+# Global model state (lazy loading): a single cache slot that load_model()
+# fills and run_inference() reads. Every entry starts as None and is only
+# populated after a successful load.
+_model_state = {
+    "model": None,  # object returned by AutoModelForImageTextToText.from_pretrained
+    "processor": None,  # object returned by AutoProcessor.from_pretrained
+    "model_id": None,  # model ID the two objects above were loaded from
+}
+def load_model(model_id: str = DEFAULT_MODEL_ID) -> Tuple[bool, str]:
+    """Load the trace model and processor. Returns (success, message)."""
+    global _model_state
+    if _model_state["model"] is not None and _model_state["model_id"] == model_id:
+        return True, f"Model already loaded: {model_id}"
+    try:
+        # Clear previous model
+        if _model_state["model"] is not None:
+            del _model_state["model"]
+            del _model_state["processor"]
+            _model_state["model"] = None
+            _model_state["processor"] = None
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        # Load model with optional flash attention
+        load_kwargs = {
+            "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+            "device_map": "auto" if torch.cuda.is_available() else None,
+        }
+        try:
+            if torch.cuda.is_available():
+                load_kwargs["attn_implementation"] = "flash_attention_2"
+            model = AutoModelForImageTextToText.from_pretrained(model_id, **load_kwargs)
+        except (ValueError, ImportError):
+            load_kwargs.pop("attn_implementation", None)
+            model = AutoModelForImageTextToText.from_pretrained(model_id, **load_kwargs)
+        processor = AutoProcessor.from_pretrained(model_id)
+        _model_state["model"] = model
+        _model_state["processor"] = processor
+        _model_state["model_id"] = model_id
+        return True, f"Model loaded: {model_id}"
+    except Exception as e:
+        logger.exception("Failed to load model")
+        return False, f"Error loading model: {str(e)}"
+def run_inference(image_path: str, prompt: str, model_id: str) -> Tuple[str, Optional[str], str]:
+    """
+    Run trace model inference on an image.
+    Returns:
+        (prediction_text, overlay_image_path, trace_points_text)
+    """
+    success, msg = load_model(model_id)
+    if not success:
+        return msg, None, ""
+    model = _model_state["model"]
+    processor = _model_state["processor"]
+    if image_path is None or not os.path.exists(image_path):
+        return "Please provide a valid image.", None, ""
+    try:
+        # Ensure file:// format for qwen_vl_utils
+        if not image_path.startswith("file://") and not image_path.startswith("http"):
+            image_uri = f"file://{os.path.abspath(image_path)}"
+        else:
+            image_uri = image_path
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image_uri},
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ]
+        # Apply chat template
+        text = processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        # Process vision info
+        if process_vision_info is not None:
+            process_kwargs = {"return_video_kwargs": True, "return_video_metadata": True}
+            if hasattr(processor, "image_processor") and hasattr(
+                processor.image_processor, "patch_size"
+            ):
+                process_kwargs["image_patch_size"] = processor.image_processor.patch_size
+            image_inputs, video_inputs, video_kwargs = process_vision_info(
+                messages, **process_kwargs
+            )
+        else:
+            # Fallback: load image directly and pass to processor
+            pil_image = Image.open(image_path).convert("RGB")
+            image_inputs = [pil_image]
+            video_inputs = None
+            video_kwargs = {}
+        # Prepare inputs
+        processor_kwargs = {
+            "text": [text],
+            "images": image_inputs,
+            "padding": True,
+            "return_tensors": "pt",
+            "do_resize": False,
+        }
+        if video_inputs is not None and len(video_inputs) > 0:
+            if isinstance(video_inputs[0], tuple):
+                videos, video_metadatas = zip(*video_inputs)
+                processor_kwargs["videos"] = list(videos)
+                processor_kwargs["video_metadata"] = list(video_metadatas)
+            else:
+                processor_kwargs["videos"] = video_inputs
+        if video_kwargs:
+            processor_kwargs.update(video_kwargs)
+        inputs = processor(**processor_kwargs)
+        inputs = {k: v.to(model.device) for k, v in inputs.items() if hasattr(v, "to")}
+        # Generate
+        with torch.no_grad():
+            generated_ids = model.generate(
+                **inputs,
+                max_new_tokens=1024,
+                do_sample=False,
+            )
+        # Decode output
+        input_ids = inputs["input_ids"]
+        generated_ids_trimmed = [
+            out[len(inp) :] for inp, out in zip(input_ids, generated_ids)
+        ]
+        prediction = processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )[0]
+        # Extract trajectory and visualize
+        trajectories = extract_trajectory_from_text(prediction)
+        trace_points_text = format_trace_points(trajectories)
+        overlay_path = None
+        if trajectories and len(trajectories) >= 2:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
+                overlay_path = f.name
+            # Try normalized first (common for VLMs)
+            img_arr = visualize_trajectory_on_image(
+                trajectory=trajectories,
+                image_path=image_path,
+                output_path=overlay_path,
+                normalized=True,
+            )
+            if img_arr is None:
+                # Fallback: pixel coordinates
+                visualize_trajectory_on_image(
+                    trajectory=trajectories,
+                    image_path=image_path,
+                    output_path=overlay_path,
+                    normalized=False,
+                )
+        return prediction, overlay_path, trace_points_text
+    except Exception as e:
+        logger.exception("Inference failed")
+        return f"Error: {str(e)}", None, ""
+def format_trace_points(trajectories) -> str:
+    """Format trajectory points for display. trajectories is List[List[float]]."""
+    if not trajectories:
+        return "No trajectory points extracted."
+    lines = ["## Predicted Trace Points\n"]
+    for i, pt in enumerate(trajectories):
+        if isinstance(pt, (list, tuple)) and len(pt) >= 2:
+            x, y = pt[0], pt[1]
+            lines.append(f"- Point {i + 1}: `[{x:.4f}, {y:.4f}]`")
+        else:
+            lines.append(f"- Point {i + 1}: `{pt}`")
+    return "\n".join(lines)
+# --- Gradio UI ---
+# Build the Blocks container with the Soft theme; if the constructor rejects
+# the arguments with TypeError, build it without a theme
+# (presumably for Gradio versions whose Blocks() has no `theme` parameter -- confirm).
+try:
+    demo = gr.Blocks(title="Trace Model Visualizer", theme=gr.themes.Soft())
+except TypeError:
+    demo = gr.Blocks(title="Trace Model Visualizer")
+# Everything created inside this context is attached to `demo`.
+with demo:
+    gr.Markdown(
+        """
+        # Trace Model Visualizer
+        Upload an image to predict the trajectory/trace using [mihirgrao/trace-model](https://huggingface.co/mihirgrao/trace-model).
+        The model predicts coordinate points; they are overlaid on the image (green → red gradient) and listed below.
+        """
+    )
+    with gr.Row():
+        # First column: inputs and action buttons.
+        with gr.Column(scale=1):
+            # type="filepath": the inference callback receives a path, which is
+            # what run_inference() expects.
+            image_input = gr.Image(
+                label="Upload Image",
+                type="filepath",
+                height=400,
+            )
+            instruction_input = gr.Textbox(
+                label="Task / Language instruction",
+                placeholder="e.g. Pick up the red block and place it on the table",
+                value="",
+                lines=2,
+                info="Describe the task. The model will predict the trace for this instruction.",
+            )
+            model_id_input = gr.Textbox(
+                label="Model ID",
+                value=DEFAULT_MODEL_ID,
+                info="Hugging Face model ID",
+            )
+            load_model_btn = gr.Button("Load Model", variant="secondary")
+            run_btn = gr.Button("Run Inference", variant="primary")
+        # Second column: outputs.
+        with gr.Column(scale=1):
+            overlay_output = gr.Image(
+                label="Image with Trace Overlay",
+                height=400,
+            )
+            prediction_output = gr.Textbox(
+                label="Model Prediction (raw)",
+                lines=6,
+            )
+            trace_points_output = gr.Markdown(
+                label="Extracted Trace Points",
+            )
+    # Status line below the row; both callbacks write to it.
+    status_md = gr.Markdown("Click 'Load Model' to load the trace model, then 'Run Inference' on an image.")
+    def on_load_model(model_id: str):
+        """Load the requested model and return the status line to display."""
+        # Only the message is shown; the success flag is not needed here.
+        _, msg = load_model(model_id)
+        return f"**Status:** {msg}"
+    def on_run_inference(image_path, instruction, model_id):
+        """Run inference and return (raw prediction, overlay path, trace points, status)."""
+        if image_path is None:
+            return "Please upload an image first.", None, "", "**Status:** Please upload an image."
+        prompt = build_prompt(instruction)
+        pred, overlay_path, trace_text = run_inference(image_path, prompt, model_id)
+        # Without an overlay the first return value (an error message or the raw
+        # prediction) is shown as the status.
+        status = "**Status:** Inference complete." if overlay_path else f"**Status:** {pred}"
+        return pred, overlay_path, trace_text, status
+    load_model_btn.click(
+        fn=on_load_model,
+        inputs=[model_id_input],
+        outputs=[status_md],
+    )
+    # The outputs list follows the order of the tuple returned by on_run_inference.
+    run_btn.click(
+        fn=on_run_inference,
+        inputs=[image_input, instruction_input, model_id_input],
+        outputs=[
+            prediction_output,
+            overlay_output,
+            trace_points_output,
+            status_md,
+        ],
+        api_name="run_inference",
+    )
+def main():
+    """Launch the Gradio app."""
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+    )
+if __name__ == "__main__":
+    main()

predict_trace.py ADDED Viewed

	@@ -0,0 +1,99 @@

+#!/usr/bin/env python3
+"""
+CLI script to predict trace on an image using the trace model.
+Reuses load_model and run_inference from app.
+"""
+import argparse
+import os
+import shutil
+import sys
+from app import DEFAULT_MODEL_ID, build_prompt, load_model, run_inference
+def main():
+    parser = argparse.ArgumentParser(
+        description="Predict trace/trajectory on an image using mihirgrao/trace-model"
+    )
+    parser.add_argument("image", type=str, help="Path to input image")
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=str,
+        default=None,
+        help="Path to save overlay image (default: <image>_trace.png)",
+    )
+    parser.add_argument(
+        "-m",
+        "--model-id",
+        type=str,
+        default=DEFAULT_MODEL_ID,
+        help=f"Model ID (default: {DEFAULT_MODEL_ID})",
+    )
+    parser.add_argument(
+        "-i",
+        "--instruction",
+        type=str,
+        default="",
+        help="Task / language instruction (e.g. 'Pick up the red block')",
+    )
+    parser.add_argument(
+        "-p",
+        "--prompt",
+        type=str,
+        default=None,
+        help="Full prompt override (if set, ignores --instruction)",
+    )
+    args = parser.parse_args()
+    if not os.path.exists(args.image):
+        print(f"Error: Image not found: {args.image}", file=sys.stderr)
+        sys.exit(1)
+    # Load model
+    success, msg = load_model(args.model_id)
+    if not success:
+        print(f"Error: {msg}", file=sys.stderr)
+        sys.exit(1)
+    print(f"✓ {msg}")
+    # Build prompt from instruction
+    prompt = args.prompt if args.prompt is not None else build_prompt(args.instruction)
+    # Run inference
+    prediction, overlay_path, trace_text = run_inference(
+        args.image, prompt, args.model_id
+    )
+    # Handle errors
+    if prediction.startswith("Error:") or prediction.startswith("Please "):
+        print(f"Error: {prediction}", file=sys.stderr)
+        sys.exit(1)
+    if overlay_path is None:
+        print("\nModel prediction (raw):")
+        print(prediction)
+        print("\n" + trace_text)
+        print("\nNo trajectory points were extracted from the prediction.")
+        sys.exit(0)
+    # Save overlay to desired path if specified
+    output_path = args.output
+    if output_path is None:
+        base, ext = os.path.splitext(args.image)
+        output_path = f"{base}_trace{ext}"
+    shutil.copy(overlay_path, output_path)
+    os.unlink(overlay_path)  # Remove temp file
+    print(f"\n✓ Overlay saved to: {output_path}")
+    print("\nModel prediction (raw):")
+    print(prediction)
+    print("\n" + trace_text)
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio>=4.0.0
+torch>=2.0.0
+transformers>=4.45.0
+accelerate>=0.25.0
+Pillow>=9.0.0
+numpy>=1.20.0
+qwen-vl-utils>=0.0.8

trajectory_viz.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""
+Trajectory Visualization Utilities for Trace Model
+Extracts trajectory coordinates from model output text and overlays them on images.
+Supports both pixel coordinates and normalized (0-1) coordinates.
+"""
+import os
+import re
+from typing import List, Tuple, Optional, Union
+import numpy as np
+from PIL import Image, ImageDraw
+def extract_trajectory_from_text(text: str) -> List[List[float]]:
+    """
+    Extract trajectory coordinates from model output text.
+    Handles both pixel coordinates [[100, 200], [150, 250]] and
+    normalized coordinates [[0.5, 0.3], [0.7, 0.4]].
+    Args:
+        text: The text output from the model containing trajectory information
+    Returns:
+        List of [x, y] coordinate pairs as floats
+    """
+    # Look for coordinate pairs [x, y] - supports ints and floats
+    coord_pattern = r"\[\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\]"
+    coord_matches = re.findall(coord_pattern, text)
+    if not coord_matches:
+        return []
+    trajectory = []
+    for x_str, y_str in coord_matches:
+        try:
+            x = float(x_str.strip())
+            y = float(y_str.strip())
+            trajectory.append([x, y])
+        except (ValueError, IndexError):
+            continue
+    return trajectory
+def _to_pixel_coords(
+    trajectory: List[List[float]],
+    img_width: int,
+    img_height: int,
+    normalized: bool = True,
+) -> List[List[int]]:
+    """Convert trajectory to pixel coordinates."""
+    pixel_traj = []
+    for x, y in trajectory:
+        if normalized:
+            x = int(x * img_width)
+            y = int(y * img_height)
+        else:
+            x, y = int(x), int(y)
+        pixel_traj.append([x, y])
+    return pixel_traj
+def visualize_trajectory_on_image(
+    trajectory: List[List[float]],
+    image_path: Optional[str] = None,
+    output_path: Optional[str] = None,
+    pil_image: Optional[Image.Image] = None,
+    normalized: bool = True,
+    start_color: Tuple[int, int, int] = (0, 255, 0),
+    end_color: Tuple[int, int, int] = (255, 0, 0),
+    line_thickness: int = 4,
+) -> Optional[np.ndarray]:
+    """
+    Overlay trajectory on an image with gradient coloring (green start -> red end).
+    Args:
+        trajectory: List of [x, y] coordinate pairs (pixel or normalized)
+        image_path: Path to input image (used if pil_image is None)
+        output_path: Where to save the output image
+        pil_image: PIL Image to draw on (overrides image_path)
+        normalized: If True, coordinates are 0-1 and will be scaled to image size
+        start_color: RGB for trajectory start
+        end_color: RGB for trajectory end
+        line_thickness: Line width in pixels
+    Returns:
+        numpy array of the output image, or None if trajectory too short
+    """
+    if not trajectory or len(trajectory) < 2:
+        return None
+    if pil_image is not None:
+        img = pil_image.convert("RGB").copy()
+    elif image_path and os.path.exists(image_path):
+        img = Image.open(image_path).convert("RGB").copy()
+    else:
+        return None
+    w, h = img.size
+    pixel_traj = _to_pixel_coords(trajectory, w, h, normalized=normalized)
+    # Clamp to image bounds
+    pixel_traj = [
+        [max(0, min(w - 1, x)), max(0, min(h - 1, y))]
+        for x, y in pixel_traj
+    ]
+    draw = ImageDraw.Draw(img)
+    # Draw gradient line segments
+    num_segments = len(pixel_traj) - 1
+    for i in range(num_segments):
+        progress = i / max(1, num_segments - 1)
+        r = int(start_color[0] * (1 - progress) + end_color[0] * progress)
+        g = int(start_color[1] * (1 - progress) + end_color[1] * progress)
+        b = int(start_color[2] * (1 - progress) + end_color[2] * progress)
+        segment_color = (r, g, b)
+        start_pt = tuple(pixel_traj[i])
+        end_pt = tuple(pixel_traj[i + 1])
+        draw.line([start_pt, end_pt], fill=segment_color, width=line_thickness)
+    # Draw start marker
+    if pixel_traj:
+        sx, sy = pixel_traj[0]
+        r = max(3, line_thickness)
+        bbox = [sx - r, sy - r, sx + r, sy + r]
+        draw.ellipse(bbox, fill=start_color, outline=(255, 255, 255), width=2)
+    if output_path:
+        img.save(output_path)
+    return np.array(img)