Anthony Liang committed on
Commit
be80524
·
1 Parent(s): 130aa46
Files changed (3) hide show
  1. app.py +22 -37
  2. eval_server.py +6 -17
  3. trace_inference.py +115 -316
app.py CHANGED
@@ -19,12 +19,9 @@ import requests
19
 
20
  from trace_inference import (
21
  DEFAULT_MODEL_ID,
22
- TRACE_FORMAT,
23
- build_franka_prompt,
24
  build_prompt,
25
  preprocess_image_for_trace,
26
  run_inference,
27
- run_inference_qwenvl,
28
  )
29
  from trajectory_viz import visualize_trajectory_on_image
30
 
@@ -124,7 +121,7 @@ def run_inference_via_server(
124
  image_path: str,
125
  instruction: str,
126
  server_url: str,
127
- use_qwenvl: bool = False,
128
  ) -> Tuple[str, Optional[str]]:
129
  """Run inference via trace eval server. Returns (prediction, overlay_path)."""
130
  with open(image_path, "rb") as f:
@@ -135,7 +132,7 @@ def run_inference_via_server(
135
  json={
136
  "image_base64": image_b64,
137
  "instruction": instruction,
138
- "use_qwenvl": use_qwenvl,
139
  },
140
  timeout=120.0,
141
  headers=headers,
@@ -177,7 +174,7 @@ def run_inference_via_server(
177
 
178
  # --- Gradio UI ---
179
  try:
180
- demo = gr.Blocks(title="Trace Model Visualizer", theme=gr.themes.Soft())
181
  except TypeError:
182
  demo = gr.Blocks(title="Trace Model Visualizer")
183
 
@@ -303,17 +300,18 @@ with demo:
303
  lines=4,
304
  info="Enter a task description in natural language. The model predicts the trace for this instruction.",
305
  )
 
 
 
 
 
 
306
  gr.Markdown("### Local model (if no eval server selected)")
307
  model_id_input = gr.Textbox(
308
  label="Model ID",
309
  value=DEFAULT_MODEL_ID,
310
  info="Hugging Face model ID (auto-loads on first inference if no eval server selected)",
311
  )
312
- use_qwenvl_checkbox = gr.Checkbox(
313
- label="Use Franka / qwenvl inference",
314
- value=False,
315
- info="Uses preprocess_qwen_visual (qwenvl). Requires qwen-vl-finetune on PYTHONPATH.",
316
- )
317
  run_btn = gr.Button("Run Inference", variant="primary")
318
 
319
  with gr.Column(scale=1):
@@ -334,7 +332,7 @@ with demo:
334
  "Select an eval server from the sidebar (auto-connects), or run inference with local model."
335
  )
336
 
337
- def on_run_inference(image_path, instruction, model_id, server_url, use_qwenvl):
338
  if image_path is None:
339
  return (
340
  "",
@@ -343,48 +341,34 @@ with demo:
343
  "**Status:** Please upload an image.",
344
  )
345
 
 
346
  if server_url:
347
- prompt = (
348
- build_franka_prompt(instruction or "predict the trace")
349
- if use_qwenvl
350
- else build_prompt(instruction)
351
- )
352
  prompt_md = f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
353
  pred, overlay_path = run_inference_via_server(
354
- image_path, instruction, server_url, use_qwenvl
355
  )
356
- elif use_qwenvl:
357
- prompt = build_franka_prompt(instruction or "predict the trace")
358
- prompt_md = f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
359
- output_dict, pred, overlay_path, trace_text = run_inference_qwenvl(
360
- image_path, instruction or "predict the trace", model_id
361
- )
362
- if not output_dict and trace_text and "qwenvl package not found" in trace_text:
363
- pred = trace_text
364
- overlay_path = None
365
  else:
366
- prompt = build_prompt(instruction)
367
  prompt_md = f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
368
  pred, overlay_path, _ = run_inference(image_path, prompt, model_id)
369
 
370
  status = "**Status:** Inference complete." if overlay_path else f"**Status:** {pred}"
371
  return prompt_md, pred, overlay_path, status
372
 
373
- def update_prompt_display(instruction: str, use_qwenvl: bool):
374
- if use_qwenvl:
375
- prompt = build_franka_prompt(instruction or "predict the trace")
376
- else:
377
- prompt = build_prompt(instruction)
378
  return f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
379
 
380
  instruction_input.change(
381
  fn=update_prompt_display,
382
- inputs=[instruction_input, use_qwenvl_checkbox],
383
  outputs=[prompt_display],
384
  )
385
- use_qwenvl_checkbox.change(
386
  fn=update_prompt_display,
387
- inputs=[instruction_input, use_qwenvl_checkbox],
388
  outputs=[prompt_display],
389
  )
390
 
@@ -395,7 +379,7 @@ with demo:
395
  instruction_input,
396
  model_id_input,
397
  server_url_state,
398
- use_qwenvl_checkbox,
399
  ],
400
  outputs=[
401
  prompt_display,
@@ -413,6 +397,7 @@ def main():
413
  server_name="0.0.0.0",
414
  server_port=7860,
415
  share=False,
 
416
  )
417
 
418
 
 
19
 
20
  from trace_inference import (
21
  DEFAULT_MODEL_ID,
 
 
22
  build_prompt,
23
  preprocess_image_for_trace,
24
  run_inference,
 
25
  )
26
  from trajectory_viz import visualize_trajectory_on_image
27
 
 
121
  image_path: str,
122
  instruction: str,
123
  server_url: str,
124
+ is_oxe: bool = False,
125
  ) -> Tuple[str, Optional[str]]:
126
  """Run inference via trace eval server. Returns (prediction, overlay_path)."""
127
  with open(image_path, "rb") as f:
 
132
  json={
133
  "image_base64": image_b64,
134
  "instruction": instruction,
135
+ "is_oxe": is_oxe,
136
  },
137
  timeout=120.0,
138
  headers=headers,
 
174
 
175
  # --- Gradio UI ---
176
  try:
177
+ demo = gr.Blocks(title="Trace Model Visualizer")
178
  except TypeError:
179
  demo = gr.Blocks(title="Trace Model Visualizer")
180
 
 
300
  lines=4,
301
  info="Enter a task description in natural language. The model predicts the trace for this instruction.",
302
  )
303
+ prompt_format = gr.Radio(
304
+ choices=["LIBERO", "OXE"],
305
+ value="LIBERO",
306
+ label="Prompt Format",
307
+ info="Switch between LIBERO and OXE training formats.",
308
+ )
309
  gr.Markdown("### Local model (if no eval server selected)")
310
  model_id_input = gr.Textbox(
311
  label="Model ID",
312
  value=DEFAULT_MODEL_ID,
313
  info="Hugging Face model ID (auto-loads on first inference if no eval server selected)",
314
  )
 
 
 
 
 
315
  run_btn = gr.Button("Run Inference", variant="primary")
316
 
317
  with gr.Column(scale=1):
 
332
  "Select an eval server from the sidebar (auto-connects), or run inference with local model."
333
  )
334
 
335
+ def on_run_inference(image_path, instruction, model_id, server_url, prompt_mode):
336
  if image_path is None:
337
  return (
338
  "",
 
341
  "**Status:** Please upload an image.",
342
  )
343
 
344
+ is_oxe = (prompt_mode == "OXE")
345
  if server_url:
346
+ prompt = build_prompt(instruction, is_oxe=is_oxe)
 
 
 
 
347
  prompt_md = f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
348
  pred, overlay_path = run_inference_via_server(
349
+ image_path, instruction, server_url, is_oxe=is_oxe
350
  )
 
 
 
 
 
 
 
 
 
351
  else:
352
+ prompt = build_prompt(instruction, is_oxe=is_oxe)
353
  prompt_md = f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
354
  pred, overlay_path, _ = run_inference(image_path, prompt, model_id)
355
 
356
  status = "**Status:** Inference complete." if overlay_path else f"**Status:** {pred}"
357
  return prompt_md, pred, overlay_path, status
358
 
359
+ def update_prompt_display(instruction: str, prompt_mode: str):
360
+ is_oxe = (prompt_mode == "OXE")
361
+ prompt = build_prompt(instruction, is_oxe=is_oxe)
 
 
362
  return f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
363
 
364
  instruction_input.change(
365
  fn=update_prompt_display,
366
+ inputs=[instruction_input, prompt_format],
367
  outputs=[prompt_display],
368
  )
369
+ prompt_format.change(
370
  fn=update_prompt_display,
371
+ inputs=[instruction_input, prompt_format],
372
  outputs=[prompt_display],
373
  )
374
 
 
379
  instruction_input,
380
  model_id_input,
381
  server_url_state,
382
+ prompt_format,
383
  ],
384
  outputs=[
385
  prompt_display,
 
397
  server_name="0.0.0.0",
398
  server_port=7860,
399
  share=False,
400
+ theme=gr.themes.Soft(),
401
  )
402
 
403
 
eval_server.py CHANGED
@@ -33,7 +33,6 @@ from trace_inference import (
33
  build_prompt,
34
  load_model,
35
  run_inference,
36
- run_inference_qwenvl,
37
  )
38
  from trace_inference import _model_state as _trace_model_state
39
  from trajectory_viz import extract_trajectory_from_text
@@ -69,13 +68,12 @@ class TraceEvalServer:
69
  image_path: Optional[str] = None,
70
  image_base64: Optional[str] = None,
71
  instruction: str = "",
72
- use_qwenvl: bool = False,
73
  ) -> Dict[str, Any]:
74
  """
75
  Run inference on a single image.
76
 
77
  Provide either image_path (file path) or image_base64 (base64-encoded image).
78
- If use_qwenvl=True, uses run_inference_qwenvl (Franka-style, requires qwenvl).
79
  """
80
  if image_path is None and image_base64 is None:
81
  return {"error": "Provide image_path or image_base64"}
@@ -102,15 +100,8 @@ class TraceEvalServer:
102
  return {"error": f"Invalid image data: {e}"}
103
 
104
  try:
105
- if use_qwenvl:
106
- output_dict, prediction, _, trace_text = run_inference_qwenvl(
107
- image_path, instruction or "predict the trace", self.model_id
108
- )
109
- if not output_dict and trace_text and "qwenvl package not found" in trace_text:
110
- return {"error": trace_text}
111
- else:
112
- prompt = build_prompt(instruction)
113
- prediction, _, _ = run_inference(image_path, prompt, self.model_id)
114
  finally:
115
  if temp_file_path and os.path.exists(temp_file_path):
116
  try:
@@ -126,8 +117,6 @@ class TraceEvalServer:
126
  "prediction": prediction,
127
  "trajectory": trajectory,
128
  }
129
- if use_qwenvl and output_dict:
130
- result["output_dict"] = output_dict
131
  return result
132
 
133
  def predict_batch(
@@ -146,7 +135,7 @@ class TraceEvalServer:
146
  image_path=sample.get("image_path"),
147
  image_base64=sample.get("image_base64"),
148
  instruction=sample.get("instruction", ""),
149
- use_qwenvl=sample.get("use_qwenvl", False),
150
  )
151
  elapsed = time.time() - start
152
 
@@ -214,14 +203,14 @@ def create_app(
214
  - image_path: (optional) path to image file
215
  - image_base64: (optional) base64-encoded image
216
  - instruction: natural language task description
217
- - use_qwenvl: (optional) if true, use Franka/qwenvl inference (requires qwenvl)
218
  """
219
  body = await request.json()
220
  return trace_server.predict_one(
221
  image_path=body.get("image_path"),
222
  image_base64=body.get("image_base64"),
223
  instruction=body.get("instruction", ""),
224
- use_qwenvl=body.get("use_qwenvl", False),
225
  )
226
 
227
  @app.post("/predict_batch")
 
33
  build_prompt,
34
  load_model,
35
  run_inference,
 
36
  )
37
  from trace_inference import _model_state as _trace_model_state
38
  from trajectory_viz import extract_trajectory_from_text
 
68
  image_path: Optional[str] = None,
69
  image_base64: Optional[str] = None,
70
  instruction: str = "",
71
+ is_oxe: bool = False,
72
  ) -> Dict[str, Any]:
73
  """
74
  Run inference on a single image.
75
 
76
  Provide either image_path (file path) or image_base64 (base64-encoded image).
 
77
  """
78
  if image_path is None and image_base64 is None:
79
  return {"error": "Provide image_path or image_base64"}
 
100
  return {"error": f"Invalid image data: {e}"}
101
 
102
  try:
103
+ prompt = build_prompt(instruction, is_oxe=is_oxe)
104
+ prediction, _, _ = run_inference(image_path, prompt, self.model_id)
 
 
 
 
 
 
 
105
  finally:
106
  if temp_file_path and os.path.exists(temp_file_path):
107
  try:
 
117
  "prediction": prediction,
118
  "trajectory": trajectory,
119
  }
 
 
120
  return result
121
 
122
  def predict_batch(
 
135
  image_path=sample.get("image_path"),
136
  image_base64=sample.get("image_base64"),
137
  instruction=sample.get("instruction", ""),
138
+ is_oxe=sample.get("is_oxe", False),
139
  )
140
  elapsed = time.time() - start
141
 
 
203
  - image_path: (optional) path to image file
204
  - image_base64: (optional) base64-encoded image
205
  - instruction: natural language task description
206
+ - is_oxe: (optional) if true, use OXE prompt format
207
  """
208
  body = await request.json()
209
  return trace_server.predict_one(
210
  image_path=body.get("image_path"),
211
  image_base64=body.get("image_base64"),
212
  instruction=body.get("instruction", ""),
213
+ is_oxe=body.get("is_oxe", False),
214
  )
215
 
216
  @app.post("/predict_batch")
trace_inference.py CHANGED
@@ -9,22 +9,16 @@ Heavy imports are done lazily inside load_model and run_inference.
9
  import logging
10
  import os
11
  import tempfile
12
- from typing import List, Optional, Tuple
13
  import re
 
14
  from pathlib import Path
15
- import torch
16
- from typing import Dict, Any
17
 
18
  logger = logging.getLogger(__name__)
19
 
20
- # Constants (no heavy deps)
21
  DEFAULT_MODEL_ID = "mihirgrao/trace-model"
22
- TRACE_FORMAT = (
23
- "Predict the trajectory or trace in this image. "
24
- "Output the coordinates as a list of [x, y] pairs, e.g. [[0.1, 0.2], [0.3, 0.4], ...]. "
25
- "Use normalized coordinates between 0 and 1."
26
- )
27
- PREPROCESS_SIZE = (128, 128)
28
 
29
  # Global model state
30
  _model_state = {
@@ -34,11 +28,12 @@ _model_state = {
34
  }
35
 
36
 
37
- def build_prompt(instruction: str = "") -> str:
38
- """Build the full prompt from task instruction + trace format."""
39
- if instruction.strip():
40
- return f"Task: {instruction.strip()}\n\n{TRACE_FORMAT}"
41
- return TRACE_FORMAT
 
42
 
43
 
44
  def format_trace_points(trajectories: List) -> str:
@@ -55,7 +50,7 @@ def format_trace_points(trajectories: List) -> str:
55
  return "\n".join(lines)
56
 
57
 
58
- def center_crop_resize(image, size: Tuple[int, int] = PREPROCESS_SIZE):
59
  """Center crop to square then resize. Requires PIL Image."""
60
  from PIL import Image
61
 
@@ -64,7 +59,8 @@ def center_crop_resize(image, size: Tuple[int, int] = PREPROCESS_SIZE):
64
  left = (w - min_dim) // 2
65
  top = (h - min_dim) // 2
66
  cropped = image.crop((left, top, left + min_dim, top + min_dim))
67
- return cropped.resize(size, Image.Resampling.LANCZOS)
 
68
 
69
 
70
  def preprocess_image_for_trace(image_path: str) -> Tuple:
@@ -72,195 +68,16 @@ def preprocess_image_for_trace(image_path: str) -> Tuple:
72
  from PIL import Image
73
 
74
  img = Image.open(image_path).convert("RGB")
75
- img = center_crop_resize(img, PREPROCESS_SIZE)
76
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
77
  img.save(tmp.name)
78
  return img, tmp.name
79
 
80
 
81
- def load_model(model_id: str = DEFAULT_MODEL_ID) -> Tuple[bool, str]:
82
- """Load the trace model and processor. Returns (success, message)."""
83
- global _model_state
84
-
85
- if _model_state["model"] is not None and _model_state["model_id"] == model_id:
86
- return True, f"Model already loaded: {model_id}"
87
-
88
- try:
89
- import torch
90
- from transformers import AutoModelForImageTextToText, AutoProcessor
91
-
92
- if _model_state["model"] is not None:
93
- del _model_state["model"]
94
- del _model_state["processor"]
95
- _model_state["model"] = None
96
- _model_state["processor"] = None
97
- if torch.cuda.is_available():
98
- torch.cuda.empty_cache()
99
-
100
- load_kwargs = {
101
- "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32,
102
- "device_map": "auto" if torch.cuda.is_available() else None,
103
- }
104
- try:
105
- if torch.cuda.is_available():
106
- load_kwargs["attn_implementation"] = "flash_attention_2"
107
- model = AutoModelForImageTextToText.from_pretrained(model_id, **load_kwargs)
108
- except (ValueError, ImportError):
109
- load_kwargs.pop("attn_implementation", None)
110
- model = AutoModelForImageTextToText.from_pretrained(model_id, **load_kwargs)
111
- processor = AutoProcessor.from_pretrained(model_id)
112
-
113
- _model_state["model"] = model
114
- _model_state["processor"] = processor
115
- _model_state["model_id"] = model_id
116
-
117
- return True, f"Model loaded: {model_id}"
118
- except Exception as e:
119
- logger.exception("Failed to load model")
120
- return False, f"Error loading model: {str(e)}"
121
-
122
-
123
- def run_inference(image_path: str, prompt: str, model_id: str) -> Tuple[str, Optional[str], str]:
124
- """
125
- Run trace model inference on an image.
126
- Returns: (prediction_text, overlay_image_path, trace_points_text)
127
- """
128
- success, msg = load_model(model_id)
129
- if not success:
130
- return msg, None, ""
131
-
132
- model = _model_state["model"]
133
- processor = _model_state["processor"]
134
-
135
- if image_path is None or not os.path.exists(image_path):
136
- return "Please provide a valid image.", None, ""
137
-
138
- try:
139
- from trajectory_viz import extract_trajectory_from_text, visualize_trajectory_on_image
140
-
141
- try:
142
- from qwen_vl_utils import process_vision_info
143
- except ImportError:
144
- process_vision_info = None
145
-
146
- preprocessed_path = None
147
- try:
148
- _, preprocessed_path = preprocess_image_for_trace(image_path)
149
- image_uri = f"file://{os.path.abspath(preprocessed_path)}"
150
-
151
- messages = [
152
- {
153
- "role": "user",
154
- "content": [
155
- {"type": "image", "image": image_uri},
156
- {"type": "text", "text": prompt},
157
- ],
158
- }
159
- ]
160
-
161
- text = processor.apply_chat_template(
162
- messages, tokenize=False, add_generation_prompt=True
163
- )
164
-
165
- if process_vision_info is not None:
166
- process_kwargs = {"return_video_kwargs": True, "return_video_metadata": True}
167
- if hasattr(processor, "image_processor") and hasattr(
168
- processor.image_processor, "patch_size"
169
- ):
170
- process_kwargs["image_patch_size"] = processor.image_processor.patch_size
171
- image_inputs, video_inputs, video_kwargs = process_vision_info(
172
- messages, **process_kwargs
173
- )
174
- else:
175
- from PIL import Image
176
-
177
- pil_image = Image.open(image_path).convert("RGB")
178
- image_inputs = [pil_image]
179
- video_inputs = None
180
- video_kwargs = {}
181
-
182
- processor_kwargs = {
183
- "text": [text],
184
- "images": image_inputs,
185
- "padding": True,
186
- "return_tensors": "pt",
187
- "do_resize": False,
188
- }
189
- if video_inputs is not None and len(video_inputs) > 0:
190
- if isinstance(video_inputs[0], tuple):
191
- videos, video_metadatas = zip(*video_inputs)
192
- processor_kwargs["videos"] = list(videos)
193
- processor_kwargs["video_metadata"] = list(video_metadatas)
194
- else:
195
- processor_kwargs["videos"] = video_inputs
196
- if video_kwargs:
197
- processor_kwargs.update(video_kwargs)
198
-
199
- import torch
200
-
201
- inputs = processor(**processor_kwargs)
202
- inputs = {k: v.to(model.device) for k, v in inputs.items() if hasattr(v, "to")}
203
-
204
- with torch.no_grad():
205
- generated_ids = model.generate(
206
- **inputs, max_new_tokens=1024, do_sample=False
207
- )
208
-
209
- input_ids = inputs["input_ids"]
210
- generated_ids_trimmed = [
211
- out[len(inp) :] for inp, out in zip(input_ids, generated_ids)
212
- ]
213
- prediction = processor.batch_decode(
214
- generated_ids_trimmed,
215
- skip_special_tokens=True,
216
- clean_up_tokenization_spaces=False,
217
- )[0]
218
-
219
- trajectories = extract_trajectory_from_text(prediction)
220
- trace_points_text = format_trace_points(trajectories)
221
-
222
- overlay_path = None
223
- if trajectories and len(trajectories) >= 2:
224
- with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
225
- overlay_path = f.name
226
- img_arr = visualize_trajectory_on_image(
227
- trajectory=trajectories,
228
- image_path=preprocessed_path,
229
- output_path=overlay_path,
230
- normalized=True,
231
- )
232
- if img_arr is None:
233
- visualize_trajectory_on_image(
234
- trajectory=trajectories,
235
- image_path=preprocessed_path,
236
- output_path=overlay_path,
237
- normalized=False,
238
- )
239
-
240
- return prediction, overlay_path, trace_points_text
241
-
242
- finally:
243
- if preprocessed_path and os.path.exists(preprocessed_path):
244
- try:
245
- os.unlink(preprocessed_path)
246
- except Exception:
247
- pass
248
-
249
- except Exception as e:
250
- logger.exception("Inference failed")
251
- return f"Error: {str(e)}", None, ""
252
-
253
-
254
- def build_franka_prompt(task: str) -> str:
255
- """Build the Franka-style prompt for trace prediction."""
256
- return (
257
- '<image>\nYou are a Franka robot using the joint control. '
258
- f'The task is "{task}". Can you predict the trace of the end effector?'
259
- )
260
-
261
  def _make_abs_paths(base: Path, files: str) -> str:
262
  return f"{(base / files).resolve()}"
263
 
 
264
  def _build_messages(item: Dict[str, Any], base_path: Path) -> List[Dict[str, Any]]:
265
  # Extract and normalize images and videos
266
  images = item.get("image") or []
@@ -322,7 +139,6 @@ def _build_messages(item: Dict[str, Any], base_path: Path) -> List[Dict[str, Any
322
 
323
  return messages
324
 
325
- IGNORE_INDEX = -100
326
 
327
  def preprocess_qwen_visual(
328
  sources,
@@ -382,143 +198,126 @@ def preprocess_qwen_visual(
382
 
383
  return full_result
384
 
385
- def run_inference_qwenvl(
386
- image_path: str,
387
- task: str,
388
- model_id: str = DEFAULT_MODEL_ID,
389
- data_path: Optional[str] = None,
390
- ) -> Tuple[dict, str, Optional[str], str]:
391
- """
392
- Run trace inference using preprocess_qwen_visual (qwenvl data processor).
393
- Uses Franka-style prompt and output format.
394
 
395
- Args:
396
- image_path: Full path to the image file.
397
- task: Task description (e.g. "open pot, then pick bread and place inside pot").
398
- model_id: Model to load.
399
- data_path: Base path for image resolution. Defaults to dirname of image_path.
400
-
401
- Returns:
402
- (output_dict, prediction_text, overlay_path, trace_points_text)
403
- output_dict has format: {"id", "image", "conversations": [human_msg, gpt_msg]}
404
- """
405
- success, msg = load_model(model_id)
406
- if not success:
407
- return {}, msg, None, ""
408
 
409
- model = _model_state["model"]
410
- processor = _model_state["processor"]
411
 
412
- if not image_path or not os.path.exists(image_path):
413
- return {}, "Please provide a valid image.", None, ""
414
 
415
- data_path = data_path or os.path.dirname(os.path.abspath(image_path))
416
- image_rel = os.path.basename(image_path)
417
- sample_id = hash(image_path) % 100000 # deterministic id for display
 
 
 
 
418
 
419
- prompt = build_franka_prompt(task)
420
- inference_sample = {
421
- "id": sample_id,
422
- "image": [image_rel],
423
- "conversations": [{"from": "human", "value": prompt}],
424
- "data_path": data_path,
425
- }
426
 
427
- print("prompt")
428
- print(prompt)
429
- print("image_path")
430
- print(image_rel)
431
- print("data_path")
432
- print(data_path)
433
 
434
- try:
435
- import torch
436
- from trajectory_viz import extract_trajectory_from_text, visualize_trajectory_on_image
437
 
438
- processed_data = preprocess_qwen_visual(
439
- [inference_sample], processor, add_gen_prompt=True
440
- )
 
441
 
442
- print("inference_sample")
443
- print(inference_sample)
444
- print("processed_data")
445
- print(processed_data)
446
 
447
- input_ids = processed_data["input_ids"].to(model.device)
448
- pixel_values = (
449
- processed_data["pixel_values"].to(model.device)
450
- if "pixel_values" in processed_data
451
- else None
452
- )
453
- image_grid_thw = (
454
- processed_data["image_grid_thw"].to(model.device)
455
- if "image_grid_thw" in processed_data
456
- else None
457
- )
458
 
459
- inputs = {"input_ids": input_ids}
460
- if pixel_values is not None:
461
- inputs["pixel_values"] = pixel_values
462
- if image_grid_thw is not None:
463
- inputs["image_grid_thw"] = image_grid_thw
464
 
465
- with torch.no_grad():
466
- generated_ids = model.generate(**inputs, max_new_tokens=512)
467
- generated_ids_trimmed = [
468
- out_ids[len(in_ids) :] for in_ids, out_ids in zip(input_ids, generated_ids)
469
- ]
470
- prediction = processor.tokenizer.decode(
471
- generated_ids_trimmed[0], skip_special_tokens=True
472
- )
473
 
474
- print("prediction")
475
- print(prediction)
476
 
477
- # Format output like the example: "Trace: [[x,y], [x,y], ...]"
478
- trajectories = extract_trajectory_from_text(prediction)
479
- trace_value = f"Trace: {trajectories}" if trajectories else f"Trace: {prediction}"
480
- output_dict = {
481
- "id": sample_id,
482
- "image": [image_rel],
483
  "conversations": [
484
- {"from": "human", "value": prompt},
485
- {"from": "gpt", "value": trace_value},
 
 
486
  ],
 
487
  }
488
 
489
- trace_points_text = format_trace_points(trajectories)
 
490
 
491
- print("trace_points_text")
492
- print(trace_points_text)
 
 
 
 
493
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
  overlay_path = None
495
- if trajectories and len(trajectories) >= 2:
496
- _, preprocessed_path = preprocess_image_for_trace(image_path)
497
- try:
498
- with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
499
- overlay_path = f.name
500
- img_arr = visualize_trajectory_on_image(
501
- trajectory=trajectories,
502
- image_path=preprocessed_path,
503
- output_path=overlay_path,
504
- normalized=True,
505
- )
506
- if img_arr is None:
507
- visualize_trajectory_on_image(
508
- trajectory=trajectories,
509
- image_path=preprocessed_path,
510
- output_path=overlay_path,
511
- normalized=False,
512
- )
513
- finally:
514
- if preprocessed_path and os.path.exists(preprocessed_path):
515
- try:
516
- os.unlink(preprocessed_path)
517
- except Exception:
518
- pass
519
-
520
- return output_dict, prediction, overlay_path, trace_points_text
521
 
522
  except Exception as e:
523
- logger.exception("Inference failed (qwenvl)")
524
- return {}, f"Error: {str(e)}", None, ""
 
9
  import logging
10
  import os
11
  import tempfile
12
+ import torch
13
  import re
14
+ from typing import List, Optional, Tuple, Dict, Any
15
  from pathlib import Path
 
 
16
 
17
  logger = logging.getLogger(__name__)
18
 
19
+ # Constants
20
  DEFAULT_MODEL_ID = "mihirgrao/trace-model"
21
+ IGNORE_INDEX = -100
 
 
 
 
 
22
 
23
  # Global model state
24
  _model_state = {
 
28
  }
29
 
30
 
31
+ def build_prompt(instruction: str = "", is_oxe: bool = False) -> str:
32
+ """Build the full prompt from task instruction."""
33
+ task = instruction.strip() or "predict the trace"
34
+ if is_oxe:
35
+ return f"<image>\nYou are a Franka robot using the joint control. The task is \"{task}\". Can you predict the trace of the end effector?"
36
+ return f"You are a robot. Your task is: \"{task}\". <image> Can you predict the trace of the end effector in this image to complete the task?"
37
 
38
 
39
  def format_trace_points(trajectories: List) -> str:
 
50
  return "\n".join(lines)
51
 
52
 
53
+ def center_crop_resize(image, size: Tuple[int, int] = (128, 128)):
54
  """Center crop to square then resize. Requires PIL Image."""
55
  from PIL import Image
56
 
 
59
  left = (w - min_dim) // 2
60
  top = (h - min_dim) // 2
61
  cropped = image.crop((left, top, left + min_dim, top + min_dim))
62
+ # return cropped.resize(size, Image.Resampling.LANCZOS)
63
+ return cropped
64
 
65
 
66
  def preprocess_image_for_trace(image_path: str) -> Tuple:
 
68
  from PIL import Image
69
 
70
  img = Image.open(image_path).convert("RGB")
71
+ img = center_crop_resize(img, (128, 128))
72
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
73
  img.save(tmp.name)
74
  return img, tmp.name
75
 
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def _make_abs_paths(base: Path, files: str) -> str:
78
  return f"{(base / files).resolve()}"
79
 
80
+
81
  def _build_messages(item: Dict[str, Any], base_path: Path) -> List[Dict[str, Any]]:
82
  # Extract and normalize images and videos
83
  images = item.get("image") or []
 
139
 
140
  return messages
141
 
 
142
 
143
  def preprocess_qwen_visual(
144
  sources,
 
198
 
199
  return full_result
200
 
 
 
 
 
 
 
 
 
 
201
 
202
+ def load_model(model_id: str = DEFAULT_MODEL_ID) -> Tuple[bool, str]:
203
+ """Load the trace model and processor. Returns (success, message)."""
204
+ global _model_state
 
 
 
 
 
 
 
 
 
 
205
 
206
+ if _model_state["model"] is not None and _model_state["model_id"] == model_id:
207
+ return True, f"Model already loaded: {model_id}"
208
 
209
+ try:
210
+ from transformers import AutoModelForImageTextToText, AutoProcessor
211
 
212
+ if _model_state["model"] is not None:
213
+ del _model_state["model"]
214
+ del _model_state["processor"]
215
+ _model_state["model"] = None
216
+ _model_state["processor"] = None
217
+ if torch.cuda.is_available():
218
+ torch.cuda.empty_cache()
219
 
220
+ logger.info(f"Loading model from {model_id}...")
221
+ load_kwargs = {
222
+ "dtype": torch.bfloat16,
223
+ "device_map": "auto",
224
+ }
 
 
225
 
226
+ model = AutoModelForImageTextToText.from_pretrained(
227
+ model_id,
228
+ **load_kwargs,
229
+ )
230
+ processor = AutoProcessor.from_pretrained(model_id)
 
231
 
232
+ _model_state["model"] = model
233
+ _model_state["processor"] = processor
234
+ _model_state["model_id"] = model_id
235
 
236
+ return True, f"Model loaded: {model_id}"
237
+ except Exception as e:
238
+ logger.exception("Failed to load model")
239
+ return False, f"Error loading model: {str(e)}"
240
 
 
 
 
 
241
 
242
+ def run_inference(image_path: str, prompt: str, model_id: str) -> Tuple[str, Optional[str], str]:
243
+ """
244
+ Run trace model inference on an image.
245
+ Returns: (prediction_text, overlay_image_path, trace_points_text)
246
+ """
247
+ success, msg = load_model(model_id)
248
+ if not success:
249
+ return msg, None, ""
 
 
 
250
 
251
+ model = _model_state["model"]
252
+ processor = _model_state["processor"]
 
 
 
253
 
254
+ if image_path is None or not os.path.exists(image_path):
255
+ return "Please provide a valid image.", None, ""
 
 
 
 
 
 
256
 
257
+ try:
258
+ from trajectory_viz import extract_trajectory_from_text, visualize_trajectory_on_image
259
 
260
+ abs_image_path = os.path.abspath(image_path)
261
+ raw_item = {
262
+ "id": "single_inference",
263
+ "image": [abs_image_path],
 
 
264
  "conversations": [
265
+ {
266
+ "from": "human",
267
+ "value": prompt
268
+ }
269
  ],
270
+ "data_path": ""
271
  }
272
 
273
+ # Preprocessing using internal method
274
+ processed = preprocess_qwen_visual([raw_item], processor, add_gen_prompt=True)
275
 
276
+ # Prepare inputs - passing only what's necessary as per the new method
277
+ inputs = {"input_ids": processed["input_ids"].to(model.device)}
278
+ if "pixel_values" in processed:
279
+ inputs["pixel_values"] = processed["pixel_values"].to(model.device)
280
+ if "image_grid_thw" in processed:
281
+ inputs["image_grid_thw"] = processed["image_grid_thw"].to(model.device)
282
 
283
+ # Generate prediction
284
+ with torch.no_grad():
285
+ generated_ids = model.generate(
286
+ **inputs,
287
+ max_new_tokens=512,
288
+ do_sample=False,
289
+ )
290
+
291
+ # Trim prompt tokens
292
+ trimmed = generated_ids[:, inputs["input_ids"].shape[1]:]
293
+
294
+ # Decode
295
+ prediction = processor.tokenizer.batch_decode(
296
+ trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
297
+ )[0]
298
+
299
+ trajectory = extract_trajectory_from_text(prediction)
300
+
301
+ trace_points_text = ""
302
  overlay_path = None
303
+
304
+ if trajectory:
305
+ trace_points_text = format_trace_points(trajectory)
306
+
307
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
308
+ overlay_path = f.name
309
+
310
+ visualize_trajectory_on_image(
311
+ trajectory=trajectory,
312
+ image_path=abs_image_path,
313
+ output_path=overlay_path,
314
+ normalized=True
315
+ )
316
+ else:
317
+ trace_points_text = "No trajectory points extracted."
318
+
319
+ return prediction, overlay_path, trace_points_text
 
 
 
 
 
 
 
 
 
320
 
321
  except Exception as e:
322
+ logger.exception("Inference failed")
323
+ return f"Error: {str(e)}", None, ""