Anthony Liang committed on
Commit
8c5e6cc
·
1 Parent(s): 5e40307
Files changed (4) hide show
  1. app.py +13 -256
  2. eval_server.py +24 -8
  3. predict_trace.py +2 -2
  4. trace_inference.py +247 -0
app.py CHANGED
@@ -16,30 +16,20 @@ from typing import List, Optional, Tuple
16
 
17
  import gradio as gr
18
  import requests
19
- import numpy as np
20
- import torch
21
- from PIL import Image
22
- from transformers import AutoModelForImageTextToText, AutoProcessor
23
 
24
- from trajectory_viz import extract_trajectory_from_text, visualize_trajectory_on_image
25
-
26
- try:
27
- from qwen_vl_utils import process_vision_info
28
- except ImportError:
29
- process_vision_info = None
 
 
 
 
30
 
31
  logger = logging.getLogger(__name__)
32
 
33
- # Default model path (Hugging Face Hub)
34
- DEFAULT_MODEL_ID = "mihirgrao/trace-model"
35
-
36
- # Trace format instruction (always appended)
37
- TRACE_FORMAT = (
38
- "Predict the trajectory or trace in this image. "
39
- "Output the coordinates as a list of [x, y] pairs, e.g. [[0.1, 0.2], [0.3, 0.4], ...]. "
40
- "Use normalized coordinates between 0 and 1."
41
- )
42
-
43
  # Global server state (eval server mode)
44
  _server_state = {"server_url": None, "base_url": "http://localhost"}
45
 
@@ -53,16 +43,19 @@ def discover_available_models(
53
  start_port, end_port = port_range
54
  for port in range(start_port, end_port + 1):
55
  server_url = f"{base_url.rstrip('/')}:{port}"
 
56
  try:
57
  r = requests.get(f"{server_url}/health", timeout=2.0)
58
  if r.status_code == 200:
59
  try:
60
  info = requests.get(f"{server_url}/model_info", timeout=2.0).json()
 
61
  name = info.get("model_id", f"Trace @ port {port}")
62
  except Exception:
63
  name = f"Trace @ port {port}"
64
  available.append((server_url, name))
65
  except requests.exceptions.RequestException:
 
66
  continue
67
  return available
68
 
@@ -108,9 +101,6 @@ def check_server_health(server_url: str) -> Tuple[str, Optional[dict], Optional[
108
  return f"Error connecting to server: {str(e)}", None, None
109
 
110
 
111
- PREPROCESS_SIZE = (128, 128)
112
-
113
-
114
  def run_inference_via_server(
115
  image_path: str,
116
  instruction: str,
@@ -157,239 +147,6 @@ def run_inference_via_server(
157
  os.unlink(preprocessed_path)
158
  except Exception:
159
  pass
160
- return prediction, overlay_path, trace_points_text
161
-
162
-
163
- def center_crop_resize(
164
- image: "Image.Image",
165
- size: Tuple[int, int] = PREPROCESS_SIZE,
166
- ) -> "Image.Image":
167
- """Center crop to square then resize to size (default 128x128)."""
168
- w, h = image.size
169
- min_dim = min(w, h)
170
- left = (w - min_dim) // 2
171
- top = (h - min_dim) // 2
172
- cropped = image.crop((left, top, left + min_dim, top + min_dim))
173
- return cropped.resize(size, Image.Resampling.LANCZOS)
174
-
175
-
176
- def preprocess_image_for_trace(image_path: str) -> Tuple["Image.Image", Optional[str]]:
177
- """
178
- Load image, center crop and resize to 128x128.
179
- Returns (preprocessed PIL Image, path to temp file for downstream use).
180
- """
181
- img = Image.open(image_path).convert("RGB")
182
- img = center_crop_resize(img, PREPROCESS_SIZE)
183
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
184
- img.save(tmp.name)
185
- return img, tmp.name
186
-
187
-
188
- def build_prompt(instruction: str = "") -> str:
189
- """Build the full prompt from task instruction + trace format."""
190
- if instruction.strip():
191
- return f"Task: {instruction.strip()}\n\n{TRACE_FORMAT}"
192
- return TRACE_FORMAT
193
-
194
- # Global model state (lazy loading)
195
- _model_state = {
196
- "model": None,
197
- "processor": None,
198
- "model_id": None,
199
- }
200
-
201
-
202
- def load_model(model_id: str = DEFAULT_MODEL_ID) -> Tuple[bool, str]:
203
- """Load the trace model and processor. Returns (success, message)."""
204
- global _model_state
205
-
206
- if _model_state["model"] is not None and _model_state["model_id"] == model_id:
207
- return True, f"Model already loaded: {model_id}"
208
-
209
- try:
210
- # Clear previous model
211
- if _model_state["model"] is not None:
212
- del _model_state["model"]
213
- del _model_state["processor"]
214
- _model_state["model"] = None
215
- _model_state["processor"] = None
216
- if torch.cuda.is_available():
217
- torch.cuda.empty_cache()
218
-
219
- # Load model with optional flash attention
220
- load_kwargs = {
221
- "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32,
222
- "device_map": "auto" if torch.cuda.is_available() else None,
223
- }
224
- try:
225
- if torch.cuda.is_available():
226
- load_kwargs["attn_implementation"] = "flash_attention_2"
227
- model = AutoModelForImageTextToText.from_pretrained(model_id, **load_kwargs)
228
- except (ValueError, ImportError):
229
- load_kwargs.pop("attn_implementation", None)
230
- model = AutoModelForImageTextToText.from_pretrained(model_id, **load_kwargs)
231
- processor = AutoProcessor.from_pretrained(model_id)
232
-
233
- _model_state["model"] = model
234
- _model_state["processor"] = processor
235
- _model_state["model_id"] = model_id
236
-
237
- return True, f"Model loaded: {model_id}"
238
- except Exception as e:
239
- logger.exception("Failed to load model")
240
- return False, f"Error loading model: {str(e)}"
241
-
242
-
243
- def run_inference(image_path: str, prompt: str, model_id: str) -> Tuple[str, Optional[str], str]:
244
- """
245
- Run trace model inference on an image.
246
-
247
- Returns:
248
- (prediction_text, overlay_image_path, trace_points_text)
249
- """
250
- success, msg = load_model(model_id)
251
- if not success:
252
- return msg, None, ""
253
-
254
- model = _model_state["model"]
255
- processor = _model_state["processor"]
256
-
257
- if image_path is None or not os.path.exists(image_path):
258
- return "Please provide a valid image.", None, ""
259
-
260
- preprocessed_path = None
261
- try:
262
- # Preprocess: center crop and resize to 128x128
263
- _, preprocessed_path = preprocess_image_for_trace(image_path)
264
- image_uri = f"file://{os.path.abspath(preprocessed_path)}"
265
-
266
- messages = [
267
- {
268
- "role": "user",
269
- "content": [
270
- {"type": "image", "image": image_uri},
271
- {"type": "text", "text": prompt},
272
- ],
273
- }
274
- ]
275
-
276
- # Apply chat template
277
- text = processor.apply_chat_template(
278
- messages,
279
- tokenize=False,
280
- add_generation_prompt=True,
281
- )
282
-
283
- # Process vision info
284
- if process_vision_info is not None:
285
- process_kwargs = {"return_video_kwargs": True, "return_video_metadata": True}
286
- if hasattr(processor, "image_processor") and hasattr(
287
- processor.image_processor, "patch_size"
288
- ):
289
- process_kwargs["image_patch_size"] = processor.image_processor.patch_size
290
-
291
- image_inputs, video_inputs, video_kwargs = process_vision_info(
292
- messages, **process_kwargs
293
- )
294
- else:
295
- # Fallback: load image directly and pass to processor
296
- pil_image = Image.open(image_path).convert("RGB")
297
- image_inputs = [pil_image]
298
- video_inputs = None
299
- video_kwargs = {}
300
-
301
- # Prepare inputs
302
- processor_kwargs = {
303
- "text": [text],
304
- "images": image_inputs,
305
- "padding": True,
306
- "return_tensors": "pt",
307
- "do_resize": False,
308
- }
309
- if video_inputs is not None and len(video_inputs) > 0:
310
- if isinstance(video_inputs[0], tuple):
311
- videos, video_metadatas = zip(*video_inputs)
312
- processor_kwargs["videos"] = list(videos)
313
- processor_kwargs["video_metadata"] = list(video_metadatas)
314
- else:
315
- processor_kwargs["videos"] = video_inputs
316
- if video_kwargs:
317
- processor_kwargs.update(video_kwargs)
318
-
319
- inputs = processor(**processor_kwargs)
320
- inputs = {k: v.to(model.device) for k, v in inputs.items() if hasattr(v, "to")}
321
-
322
- # Generate
323
- with torch.no_grad():
324
- generated_ids = model.generate(
325
- **inputs,
326
- max_new_tokens=1024,
327
- do_sample=False,
328
- )
329
-
330
- # Decode output
331
- input_ids = inputs["input_ids"]
332
- generated_ids_trimmed = [
333
- out[len(inp) :] for inp, out in zip(input_ids, generated_ids)
334
- ]
335
- prediction = processor.batch_decode(
336
- generated_ids_trimmed,
337
- skip_special_tokens=True,
338
- clean_up_tokenization_spaces=False,
339
- )[0]
340
-
341
- # Extract trajectory and visualize
342
- trajectories = extract_trajectory_from_text(prediction)
343
- trace_points_text = format_trace_points(trajectories)
344
-
345
- overlay_path = None
346
- if trajectories and len(trajectories) >= 2:
347
- with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
348
- overlay_path = f.name
349
- # Overlay on preprocessed (128x128) image
350
- img_arr = visualize_trajectory_on_image(
351
- trajectory=trajectories,
352
- image_path=preprocessed_path,
353
- output_path=overlay_path,
354
- normalized=True,
355
- )
356
- if img_arr is None:
357
- visualize_trajectory_on_image(
358
- trajectory=trajectories,
359
- image_path=preprocessed_path,
360
- output_path=overlay_path,
361
- normalized=False,
362
- )
363
-
364
- return prediction, overlay_path, trace_points_text
365
-
366
- except Exception as e:
367
- logger.exception("Inference failed")
368
- return f"Error: {str(e)}", None, ""
369
- finally:
370
- if preprocessed_path and os.path.exists(preprocessed_path):
371
- try:
372
- os.unlink(preprocessed_path)
373
- except Exception:
374
- pass
375
-
376
-
377
- def format_trace_points(trajectories) -> str:
378
- """Format trajectory points for display. trajectories is List[List[float]]."""
379
- if not trajectories:
380
- return "No trajectory points extracted."
381
-
382
- lines = ["## Predicted Trace Points\n"]
383
- for i, pt in enumerate(trajectories):
384
- if isinstance(pt, (list, tuple)) and len(pt) >= 2:
385
- x, y = pt[0], pt[1]
386
- lines.append(f"- Point {i + 1}: `[{x:.4f}, {y:.4f}]`")
387
- else:
388
- lines.append(f"- Point {i + 1}: `{pt}`")
389
-
390
- return "\n".join(lines)
391
-
392
-
393
  # --- Gradio UI ---
394
  try:
395
  demo = gr.Blocks(title="Trace Model Visualizer", theme=gr.themes.Soft())
 
16
 
17
  import gradio as gr
18
  import requests
 
 
 
 
19
 
20
+ from trace_inference import (
21
+ DEFAULT_MODEL_ID,
22
+ TRACE_FORMAT,
23
+ build_prompt,
24
+ format_trace_points,
25
+ load_model,
26
+ preprocess_image_for_trace,
27
+ run_inference,
28
+ )
29
+ from trajectory_viz import visualize_trajectory_on_image
30
 
31
  logger = logging.getLogger(__name__)
32
 
 
 
 
 
 
 
 
 
 
 
33
  # Global server state (eval server mode)
34
  _server_state = {"server_url": None, "base_url": "http://localhost"}
35
 
 
43
  start_port, end_port = port_range
44
  for port in range(start_port, end_port + 1):
45
  server_url = f"{base_url.rstrip('/')}:{port}"
46
+ print(f"Checking {server_url}/health")
47
  try:
48
  r = requests.get(f"{server_url}/health", timeout=2.0)
49
  if r.status_code == 200:
50
  try:
51
  info = requests.get(f"{server_url}/model_info", timeout=2.0).json()
52
+ print(info)
53
  name = info.get("model_id", f"Trace @ port {port}")
54
  except Exception:
55
  name = f"Trace @ port {port}"
56
  available.append((server_url, name))
57
  except requests.exceptions.RequestException:
58
+ print(f"Error checking {server_url}/health")
59
  continue
60
  return available
61
 
 
101
  return f"Error connecting to server: {str(e)}", None, None
102
 
103
 
 
 
 
104
  def run_inference_via_server(
105
  image_path: str,
106
  instruction: str,
 
147
  os.unlink(preprocessed_path)
148
  except Exception:
149
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  # --- Gradio UI ---
151
  try:
152
  demo = gr.Blocks(title="Trace Model Visualizer", theme=gr.themes.Soft())
eval_server.py CHANGED
@@ -3,7 +3,7 @@
3
  FastAPI server for Trace Model inference.
4
 
5
  Usage:
6
- python eval_server.py --model-id mihirgrao/trace-model --port 8001
7
 
8
  Endpoints:
9
  POST /predict - Single image + instruction
@@ -14,8 +14,10 @@ Endpoints:
14
 
15
  import argparse
16
  import base64
 
17
  import logging
18
  import os
 
19
  import tempfile
20
  import time
21
  from concurrent.futures import ThreadPoolExecutor
@@ -26,7 +28,13 @@ import uvicorn
26
  from fastapi import FastAPI, Request
27
  from fastapi.middleware.cors import CORSMiddleware
28
 
29
- from app import DEFAULT_MODEL_ID, build_prompt, load_model, run_inference
 
 
 
 
 
 
30
  from trajectory_viz import extract_trajectory_from_text
31
 
32
  logger = logging.getLogger(__name__)
@@ -72,13 +80,23 @@ class TraceEvalServer:
72
  temp_file_path = None
73
  if image_path is None:
74
  try:
75
- image_bytes = base64.b64decode(image_base64)
 
 
 
 
 
 
 
 
 
 
76
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
77
- f.write(image_bytes)
78
  image_path = f.name
79
  temp_file_path = image_path
80
  except Exception as e:
81
- return {"error": f"Invalid base64 image: {e}"}
82
 
83
  try:
84
  prompt = build_prompt(instruction)
@@ -138,9 +156,7 @@ class TraceEvalServer:
138
  def get_model_info(self) -> Dict[str, Any]:
139
  """Get model information."""
140
  try:
141
- from app import _model_state
142
-
143
- model = _model_state.get("model")
144
  if model is None:
145
  return {"model_id": self.model_id, "status": "not_loaded"}
146
 
 
3
  FastAPI server for Trace Model inference.
4
 
5
  Usage:
6
+ python eval_server.py --model-id mihirgrao/trace-model --port 8000
7
 
8
  Endpoints:
9
  POST /predict - Single image + instruction
 
14
 
15
  import argparse
16
  import base64
17
+ import io
18
  import logging
19
  import os
20
+ import re
21
  import tempfile
22
  import time
23
  from concurrent.futures import ThreadPoolExecutor
 
28
  from fastapi import FastAPI, Request
29
  from fastapi.middleware.cors import CORSMiddleware
30
 
31
+ from trace_inference import (
32
+ DEFAULT_MODEL_ID,
33
+ build_prompt,
34
+ load_model,
35
+ run_inference,
36
+ )
37
+ from trace_inference import _model_state as _trace_model_state
38
  from trajectory_viz import extract_trajectory_from_text
39
 
40
  logger = logging.getLogger(__name__)
 
80
  temp_file_path = None
81
  if image_path is None:
82
  try:
83
+ # Strip data URL prefix if present (e.g. "data:image/png;base64,")
84
+ b64_str = image_base64.strip()
85
+ if b64_str.startswith("data:"):
86
+ match = re.match(r"data:image/[^;]+;base64,(.+)", b64_str, re.DOTALL)
87
+ if match:
88
+ b64_str = match.group(1)
89
+ image_bytes = base64.b64decode(b64_str, validate=False)
90
+ # Load via BytesIO to validate and get proper format, then save
91
+ from PIL import Image
92
+
93
+ img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
94
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
95
+ img.save(f.name, format="PNG")
96
  image_path = f.name
97
  temp_file_path = image_path
98
  except Exception as e:
99
+ return {"error": f"Invalid image data: {e}"}
100
 
101
  try:
102
  prompt = build_prompt(instruction)
 
156
  def get_model_info(self) -> Dict[str, Any]:
157
  """Get model information."""
158
  try:
159
+ model = _trace_model_state.get("model")
 
 
160
  if model is None:
161
  return {"model_id": self.model_id, "status": "not_loaded"}
162
 
predict_trace.py CHANGED
@@ -2,7 +2,7 @@
2
  """
3
  CLI script to predict trace on an image using the trace model.
4
 
5
- Reuses load_model and run_inference from app.
6
  """
7
 
8
  import argparse
@@ -10,7 +10,7 @@ import os
10
  import shutil
11
  import sys
12
 
13
- from app import DEFAULT_MODEL_ID, build_prompt, load_model, run_inference
14
 
15
 
16
  def main():
 
2
  """
3
  CLI script to predict trace on an image using the trace model.
4
 
5
+ Reuses load_model and run_inference from trace_inference.
6
  """
7
 
8
  import argparse
 
10
  import shutil
11
  import sys
12
 
13
+ from trace_inference import DEFAULT_MODEL_ID, build_prompt, load_model, run_inference
14
 
15
 
16
  def main():
trace_inference.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Shared trace model inference logic.

This module has minimal top-level imports so eval_server can import
DEFAULT_MODEL_ID and build_prompt without pulling in torch/transformers.
Heavy imports are done lazily inside load_model and run_inference.
"""

import logging
import os
import tempfile
from typing import List, Optional, Tuple

logger = logging.getLogger(__name__)

# Constants (no heavy deps)
# Default Hugging Face Hub model id used when callers do not supply one.
DEFAULT_MODEL_ID = "mihirgrao/trace-model"
# Instruction suffix appended to every prompt; see build_prompt().
TRACE_FORMAT = (
    "Predict the trajectory or trace in this image. "
    "Output the coordinates as a list of [x, y] pairs, e.g. [[0.1, 0.2], [0.3, 0.4], ...]. "
    "Use normalized coordinates between 0 and 1."
)
# Target (width, height) used by center_crop_resize / preprocess_image_for_trace.
PREPROCESS_SIZE = (128, 128)

# Global model state — populated lazily by load_model, read by run_inference.
_model_state = {
    "model": None,      # loaded transformers model, or None when not loaded
    "processor": None,  # matching AutoProcessor, or None when not loaded
    "model_id": None,   # id of the currently loaded model
}
31
+
32
+
33
def build_prompt(instruction: str = "") -> str:
    """Combine an optional task instruction with the trace-format suffix.

    With a non-empty instruction the prompt is "Task: <instruction>" followed
    by TRACE_FORMAT; otherwise TRACE_FORMAT alone is returned.
    """
    task = instruction.strip()
    if not task:
        return TRACE_FORMAT
    return f"Task: {task}\n\n{TRACE_FORMAT}"
38
+
39
+
40
def format_trace_points(trajectories: List) -> str:
    """Render trajectory points as a markdown bullet list.

    Each [x, y] pair is shown with 4 decimal places; anything that is not a
    2+-element list/tuple is shown verbatim. Returns a fixed message when
    the input is empty or None.
    """
    if not trajectories:
        return "No trajectory points extracted."
    out = ["## Predicted Trace Points\n"]
    for idx, point in enumerate(trajectories, start=1):
        if isinstance(point, (list, tuple)) and len(point) >= 2:
            out.append(f"- Point {idx}: `[{point[0]:.4f}, {point[1]:.4f}]`")
        else:
            out.append(f"- Point {idx}: `{point}`")
    return "\n".join(out)
52
+
53
+
54
def center_crop_resize(image, size: Tuple[int, int] = PREPROCESS_SIZE):
    """Center-crop *image* to a square, then resize it to *size*.

    Expects a PIL Image; PIL is imported lazily so importing this module
    stays free of heavy dependencies.
    """
    from PIL import Image

    width, height = image.size
    side = min(width, height)
    x0 = (width - side) // 2
    y0 = (height - side) // 2
    square = image.crop((x0, y0, x0 + side, y0 + side))
    return square.resize(size, Image.Resampling.LANCZOS)
64
+
65
+
66
def preprocess_image_for_trace(image_path: str) -> Tuple:
    """Load an image, center crop, and resize to 128x128.

    Returns (PIL Image, temp_path) where temp_path is a PNG on disk that
    the caller is responsible for deleting when done.
    """
    from PIL import Image

    img = Image.open(image_path).convert("RGB")
    img = center_crop_resize(img, PREPROCESS_SIZE)
    # Use mkstemp + close so the file handle is released before PIL writes
    # to the path. The previous NamedTemporaryFile(delete=False) left its
    # descriptor open forever (fd leak), and an open handle also breaks
    # re-opening/saving the same path on Windows.
    fd, tmp_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    img.save(tmp_path)
    return img, tmp_path
75
+
76
+
77
def load_model(model_id: str = DEFAULT_MODEL_ID) -> Tuple[bool, str]:
    """Load the trace model and processor. Returns (success, message).

    Caches the loaded objects in the module-level ``_model_state``; calling
    again with the same ``model_id`` is a cheap no-op. torch/transformers
    are imported lazily so merely importing this module stays light.
    """
    global _model_state

    # Fast path: requested model is already resident.
    if _model_state["model"] is not None and _model_state["model_id"] == model_id:
        return True, f"Model already loaded: {model_id}"

    try:
        import torch
        from transformers import AutoModelForImageTextToText, AutoProcessor

        # Evict any previously loaded model and release its GPU memory
        # before allocating the new one.
        if _model_state["model"] is not None:
            del _model_state["model"]
            del _model_state["processor"]
            _model_state["model"] = None
            _model_state["processor"] = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        # bf16 + device_map="auto" on GPU; fp32 on CPU.
        load_kwargs = {
            "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            "device_map": "auto" if torch.cuda.is_available() else None,
        }
        # Prefer flash-attention on GPU, falling back to the default
        # attention implementation when it is unavailable (ValueError /
        # ImportError raised by from_pretrained).
        try:
            if torch.cuda.is_available():
                load_kwargs["attn_implementation"] = "flash_attention_2"
            model = AutoModelForImageTextToText.from_pretrained(model_id, **load_kwargs)
        except (ValueError, ImportError):
            load_kwargs.pop("attn_implementation", None)
            model = AutoModelForImageTextToText.from_pretrained(model_id, **load_kwargs)
        processor = AutoProcessor.from_pretrained(model_id)

        _model_state["model"] = model
        _model_state["processor"] = processor
        _model_state["model_id"] = model_id

        return True, f"Model loaded: {model_id}"
    except Exception as e:
        # Broad catch is deliberate: callers get a (False, message) result
        # instead of an exception; full traceback goes to the log.
        logger.exception("Failed to load model")
        return False, f"Error loading model: {str(e)}"
117
+
118
+
119
def run_inference(image_path: str, prompt: str, model_id: str) -> Tuple[str, Optional[str], str]:
    """
    Run trace model inference on an image.

    The image is center-cropped/resized to 128x128 before being fed to the
    model; any trajectory parsed from the generated text is overlaid on the
    preprocessed image.

    Returns: (prediction_text, overlay_image_path, trace_points_text)
    """
    success, msg = load_model(model_id)
    if not success:
        return msg, None, ""

    model = _model_state["model"]
    processor = _model_state["processor"]

    if image_path is None or not os.path.exists(image_path):
        return "Please provide a valid image.", None, ""

    try:
        from trajectory_viz import extract_trajectory_from_text, visualize_trajectory_on_image

        # qwen_vl_utils is optional; without it we fall back to handing the
        # PIL image straight to the processor.
        try:
            from qwen_vl_utils import process_vision_info
        except ImportError:
            process_vision_info = None

        preprocessed_path = None
        try:
            # Preprocess: center crop and resize to 128x128.
            _, preprocessed_path = preprocess_image_for_trace(image_path)
            image_uri = f"file://{os.path.abspath(preprocessed_path)}"

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image_uri},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            if process_vision_info is not None:
                process_kwargs = {"return_video_kwargs": True, "return_video_metadata": True}
                if hasattr(processor, "image_processor") and hasattr(
                    processor.image_processor, "patch_size"
                ):
                    process_kwargs["image_patch_size"] = processor.image_processor.patch_size
                image_inputs, video_inputs, video_kwargs = process_vision_info(
                    messages, **process_kwargs
                )
            else:
                from PIL import Image

                # FIX: use the preprocessed (128x128) image here, matching
                # the image_uri branch above. Previously this branch fed the
                # original, un-preprocessed image to the processor, so the
                # two branches saw different pixels (do_resize=False below
                # means the processor would not normalize the size either).
                pil_image = Image.open(preprocessed_path).convert("RGB")
                image_inputs = [pil_image]
                video_inputs = None
                video_kwargs = {}

            processor_kwargs = {
                "text": [text],
                "images": image_inputs,
                "padding": True,
                "return_tensors": "pt",
                # Image is already preprocessed; don't let the processor resize.
                "do_resize": False,
            }
            if video_inputs is not None and len(video_inputs) > 0:
                if isinstance(video_inputs[0], tuple):
                    videos, video_metadatas = zip(*video_inputs)
                    processor_kwargs["videos"] = list(videos)
                    processor_kwargs["video_metadata"] = list(video_metadatas)
                else:
                    processor_kwargs["videos"] = video_inputs
                if video_kwargs:
                    processor_kwargs.update(video_kwargs)

            import torch

            inputs = processor(**processor_kwargs)
            inputs = {k: v.to(model.device) for k, v in inputs.items() if hasattr(v, "to")}

            # Deterministic (greedy) decoding for reproducible traces.
            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs, max_new_tokens=1024, do_sample=False
                )

            # Strip the prompt tokens so only the generated continuation is decoded.
            input_ids = inputs["input_ids"]
            generated_ids_trimmed = [
                out[len(inp):] for inp, out in zip(input_ids, generated_ids)
            ]
            prediction = processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0]

            trajectories = extract_trajectory_from_text(prediction)
            trace_points_text = format_trace_points(trajectories)

            overlay_path = None
            if trajectories and len(trajectories) >= 2:
                with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
                    overlay_path = f.name
                # Try normalized coordinates first; retry as pixel
                # coordinates if the visualizer rejects them.
                img_arr = visualize_trajectory_on_image(
                    trajectory=trajectories,
                    image_path=preprocessed_path,
                    output_path=overlay_path,
                    normalized=True,
                )
                if img_arr is None:
                    visualize_trajectory_on_image(
                        trajectory=trajectories,
                        image_path=preprocessed_path,
                        output_path=overlay_path,
                        normalized=False,
                    )

            return prediction, overlay_path, trace_points_text

        finally:
            # Always remove the temporary preprocessed image.
            if preprocessed_path and os.path.exists(preprocessed_path):
                try:
                    os.unlink(preprocessed_path)
                except Exception:
                    pass

    except Exception as e:
        # Best-effort API: return an error string instead of raising; the
        # full traceback is preserved in the log.
        logger.exception("Inference failed")
        return f"Error: {str(e)}", None, ""