Spaces:

aliangdw
/

trace_visualizer

Sleeping

App Files Files Community

Anthony Liang committed 20 days ago

Commit

521dbac

1 Parent(s): cde89ae

update

Browse files

Files changed (2) hide show

app.py +43 -35
trace_inference.py +142 -0

app.py CHANGED Viewed

@@ -20,11 +20,11 @@ import requests
 from trace_inference import (
     DEFAULT_MODEL_ID,
     TRACE_FORMAT,
     build_prompt,
-    format_trace_points,
-    load_model,
     preprocess_image_for_trace,
     run_inference,
 )
 from trajectory_viz import visualize_trajectory_on_image
@@ -124,8 +124,8 @@ def run_inference_via_server(
     image_path: str,
     instruction: str,
     server_url: str,
-) -> Tuple[str, Optional[str], str]:
-    """Run inference via trace eval server. Returns (prediction, overlay_path, trace_points_text)."""
     with open(image_path, "rb") as f:
         image_b64 = base64.b64encode(f.read()).decode("utf-8")
     headers = {"ngrok-skip-browser-warning": "true"} if "ngrok" in server_url else {}
@@ -138,10 +138,9 @@ def run_inference_via_server(
     r.raise_for_status()
     data = r.json()
     if "error" in data:
-        return data["error"], None, ""
     prediction = data.get("prediction", "")
     trajectory = data.get("trajectory", [])
-    trace_points_text = format_trace_points(trajectory)
     overlay_path = None
     if trajectory and len(trajectory) >= 2:
@@ -168,7 +167,7 @@ def run_inference_via_server(
                     os.unlink(preprocessed_path)
                 except Exception:
                     pass
-    return prediction, overlay_path, trace_points_text
 # --- Gradio UI ---
@@ -263,7 +262,7 @@ with demo:
             interactive=True,
             info="Discover trace eval servers on ports 8000-8010",
         )
-        server_status = gr.Markdown("Click 'Discover Eval Servers' or use local model below")
         gr.Markdown("---")
         gr.Markdown("### 📋 Model Information")
         model_info_display = gr.Markdown("")
@@ -303,9 +302,13 @@ with demo:
             model_id_input = gr.Textbox(
                 label="Model ID",
                 value=DEFAULT_MODEL_ID,
-                info="Hugging Face model ID (used when no eval server is selected)",
             )
-            load_model_btn = gr.Button("Load Model", variant="secondary")
             run_btn = gr.Button("Run Inference", variant="primary")
         with gr.Column(scale=1):
@@ -321,53 +324,58 @@ with demo:
                 label="Model Prediction (raw)",
                 lines=6,
             )
-            trace_points_output = gr.Markdown(
-                label="Extracted Trace Points",
-            )
     status_md = gr.Markdown(
-        "Select an eval server from the sidebar, or load a local model and run inference."
     )
-    def on_load_model(model_id: str):
-        _, msg = load_model(model_id)
-        return f"**Status:** {msg}"
-    def on_run_inference(image_path, instruction, model_id, server_url):
         if image_path is None:
             return (
                 "",
                 "Please upload an image first.",
                 None,
-                "",
                 "**Status:** Please upload an image.",
             )
-        prompt = build_prompt(instruction)
-        prompt_md = f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
         if server_url:
-            pred, overlay_path, trace_text = run_inference_via_server(
                 image_path, instruction, server_url
             )
         else:
-            pred, overlay_path, trace_text = run_inference(image_path, prompt, model_id)
         status = "**Status:** Inference complete." if overlay_path else f"**Status:** {pred}"
-        return prompt_md, pred, overlay_path, trace_text, status
-    load_model_btn.click(
-        fn=on_load_model,
-        inputs=[model_id_input],
-        outputs=[status_md],
-    )
-    def update_prompt_display(instruction: str):
-        prompt = build_prompt(instruction)
         return f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
     instruction_input.change(
         fn=update_prompt_display,
-        inputs=[instruction_input],
         outputs=[prompt_display],
     )
@@ -378,12 +386,12 @@ with demo:
             instruction_input,
             model_id_input,
             server_url_state,
         ],
         outputs=[
             prompt_display,
             prediction_output,
             overlay_output,
-            trace_points_output,
             status_md,
         ],
         api_name="run_inference",

 from trace_inference import (
     DEFAULT_MODEL_ID,
     TRACE_FORMAT,
+    build_franka_prompt,
     build_prompt,
     preprocess_image_for_trace,
     run_inference,
+    run_inference_qwenvl,
 )
 from trajectory_viz import visualize_trajectory_on_image
     image_path: str,
     instruction: str,
     server_url: str,
+) -> Tuple[str, Optional[str]]:
+    """Run inference via trace eval server. Returns (prediction, overlay_path)."""
     with open(image_path, "rb") as f:
         image_b64 = base64.b64encode(f.read()).decode("utf-8")
     headers = {"ngrok-skip-browser-warning": "true"} if "ngrok" in server_url else {}
     r.raise_for_status()
     data = r.json()
     if "error" in data:
+        return data["error"], None
     prediction = data.get("prediction", "")
     trajectory = data.get("trajectory", [])
     overlay_path = None
     if trajectory and len(trajectory) >= 2:
                     os.unlink(preprocessed_path)
                 except Exception:
                     pass
+    return prediction, overlay_path
 # --- Gradio UI ---
             interactive=True,
             info="Discover trace eval servers on ports 8000-8010",
         )
+        server_status = gr.Markdown("Select an eval server below (auto-connects on selection)")
         gr.Markdown("---")
         gr.Markdown("### 📋 Model Information")
         model_info_display = gr.Markdown("")
             model_id_input = gr.Textbox(
                 label="Model ID",
                 value=DEFAULT_MODEL_ID,
+                info="Hugging Face model ID (auto-loads on first inference if no eval server selected)",
+            )
+            use_qwenvl_checkbox = gr.Checkbox(
+                label="Use Franka / qwenvl inference",
+                value=False,
+                info="Uses preprocess_qwen_visual (qwenvl). Requires qwen-vl-finetune on PYTHONPATH.",
             )
             run_btn = gr.Button("Run Inference", variant="primary")
         with gr.Column(scale=1):
                 label="Model Prediction (raw)",
                 lines=6,
             )
     status_md = gr.Markdown(
+        "Select an eval server from the sidebar (auto-connects), or run inference with local model."
     )
+    def on_run_inference(image_path, instruction, model_id, server_url, use_qwenvl):
         if image_path is None:
             return (
                 "",
                 "Please upload an image first.",
                 None,
                 "**Status:** Please upload an image.",
             )
         if server_url:
+            prompt = build_prompt(instruction)
+            prompt_md = f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
+            pred, overlay_path = run_inference_via_server(
                 image_path, instruction, server_url
             )
+        elif use_qwenvl:
+            prompt = build_franka_prompt(instruction or "predict the trace")
+            prompt_md = f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
+            output_dict, pred, overlay_path, trace_text = run_inference_qwenvl(
+                image_path, instruction or "predict the trace", model_id
+            )
+            if not output_dict and trace_text and "qwenvl package not found" in trace_text:
+                pred = trace_text
+                overlay_path = None
         else:
+            prompt = build_prompt(instruction)
+            prompt_md = f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
+            pred, overlay_path, _ = run_inference(image_path, prompt, model_id)
         status = "**Status:** Inference complete." if overlay_path else f"**Status:** {pred}"
+        return prompt_md, pred, overlay_path, status
+    def update_prompt_display(instruction: str, use_qwenvl: bool):
+        if use_qwenvl:
+            prompt = build_franka_prompt(instruction or "predict the trace")
+        else:
+            prompt = build_prompt(instruction)
         return f"**Prompt sent to model:**\n\n```\n{prompt}\n```"
     instruction_input.change(
         fn=update_prompt_display,
+        inputs=[instruction_input, use_qwenvl_checkbox],
+        outputs=[prompt_display],
+    )
+    use_qwenvl_checkbox.change(
+        fn=update_prompt_display,
+        inputs=[instruction_input, use_qwenvl_checkbox],
         outputs=[prompt_display],
     )
             instruction_input,
             model_id_input,
             server_url_state,
+            use_qwenvl_checkbox,
         ],
         outputs=[
             prompt_display,
             prediction_output,
             overlay_output,
             status_md,
         ],
         api_name="run_inference",

trace_inference.py CHANGED Viewed

@@ -245,3 +245,145 @@ def run_inference(image_path: str, prompt: str, model_id: str) -> Tuple[str, Opt
     except Exception as e:
         logger.exception("Inference failed")
         return f"Error: {str(e)}", None, ""

     except Exception as e:
         logger.exception("Inference failed")
         return f"Error: {str(e)}", None, ""
+def build_franka_prompt(task: str) -> str:
+    """Return the Franka-style trace-prediction prompt for ``task``.
+    The prompt opens with the ``<image>`` placeholder line and embeds the task
+    text verbatim between double quotes.
+    """
+    preamble = '<image>\nYou are a Franka robot using the joint control. '
+    question = 'The task is "{}". Can you predict the trace of the end effector?'
+    return preamble + question.format(task)
+def _remove_file_quietly(path: Optional[str]) -> None:
+    """Best-effort delete of a temporary file; a missing or undeletable file is ignored."""
+    if not path:
+        return
+    try:
+        os.unlink(path)
+    except OSError:
+        # Cleanup only: failing to delete a temp file must not mask the real result.
+        pass
+def run_inference_qwenvl(
+    image_path: str,
+    task: str,
+    model_id: str = DEFAULT_MODEL_ID,
+    data_path: Optional[str] = None,
+) -> Tuple[dict, str, Optional[str], str]:
+    """
+    Run trace inference using preprocess_qwen_visual (qwenvl data processor).
+    Uses Franka-style prompt and output format. When at least two trajectory
+    points are extracted from the prediction, an overlay PNG is rendered to a
+    temporary file and its path is returned.
+    Args:
+        image_path: Full path to the image file.
+        task: Task description (e.g. "open pot, then pick bread and place inside pot").
+        model_id: Model to load.
+        data_path: Base path for image resolution. Defaults to dirname of image_path.
+    Returns:
+        (output_dict, prediction_text, overlay_path, trace_points_text)
+        output_dict has format: {"id", "image", "conversations": [human_msg, gpt_msg]}
+        On failure output_dict is empty and overlay_path is None. The failure
+        message is carried in prediction_text, except when the qwenvl package
+        cannot be imported, in which case it is carried in trace_points_text.
+    """
+    import zlib
+    try:
+        from qwenvl.data.data_processor import preprocess_qwen_visual
+    except ImportError as e:
+        return (
+            {},
+            "",
+            None,
+            f"qwenvl package not found: {e}. Install qwen-vl-finetune or add to PYTHONPATH.",
+        )
+    # Check the cheap precondition before paying for a model load.
+    if not image_path or not os.path.exists(image_path):
+        return {}, "Please provide a valid image.", None, ""
+    success, msg = load_model(model_id)
+    if not success:
+        return {}, msg, None, ""
+    model = _model_state["model"]
+    processor = _model_state["processor"]
+    data_path = data_path or os.path.dirname(os.path.abspath(image_path))
+    image_rel = os.path.basename(image_path)
+    # Built-in hash() of a str is salted per process (PYTHONHASHSEED), so it would
+    # give a different id after every restart; CRC32 of the path is stable.
+    sample_id = zlib.crc32(os.fsencode(image_path)) % 100000
+    prompt = build_franka_prompt(task)
+    inference_sample = {
+        "id": sample_id,
+        "image": [image_rel],
+        "conversations": [{"from": "human", "value": prompt}],
+        "data_path": data_path,
+    }
+    try:
+        import torch
+        from trajectory_viz import extract_trajectory_from_text, visualize_trajectory_on_image
+        processed_data = preprocess_qwen_visual(
+            [inference_sample], processor, add_gen_prompt=True
+        )
+        input_ids = processed_data["input_ids"].to(model.device)
+        inputs = {"input_ids": input_ids}
+        # Vision tensors are optional in the processor output; forward only those present.
+        for key in ("pixel_values", "image_grid_thw"):
+            if key in processed_data:
+                inputs[key] = processed_data[key].to(model.device)
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=1024)
+        # generate() returns prompt + continuation; drop the prompt tokens before decoding.
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(input_ids, generated_ids)
+        ]
+        prediction = processor.tokenizer.decode(
+            generated_ids_trimmed[0], skip_special_tokens=True
+        )
+        # Format output like the example: "Trace: [[x,y], [x,y], ...]"
+        trajectories = extract_trajectory_from_text(prediction)
+        trace_value = f"Trace: {trajectories}" if trajectories else f"Trace: {prediction}"
+        output_dict = {
+            "id": sample_id,
+            "image": [image_rel],
+            "conversations": [
+                {"from": "human", "value": prompt},
+                {"from": "gpt", "value": trace_value},
+            ],
+        }
+        trace_points_text = format_trace_points(trajectories)
+        overlay_path = None
+        if trajectories and len(trajectories) >= 2:
+            _, preprocessed_path = preprocess_image_for_trace(image_path)
+            try:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
+                    overlay_path = f.name
+                img_arr = visualize_trajectory_on_image(
+                    trajectory=trajectories,
+                    image_path=preprocessed_path,
+                    output_path=overlay_path,
+                    normalized=True,
+                )
+                if img_arr is None:
+                    # Retry treating the points as non-normalized coordinates.
+                    # NOTE(review): the result of this second call is not checked;
+                    # confirm what visualize_trajectory_on_image returns on failure.
+                    visualize_trajectory_on_image(
+                        trajectory=trajectories,
+                        image_path=preprocessed_path,
+                        output_path=overlay_path,
+                        normalized=False,
+                    )
+            except Exception:
+                # The overlay file was created with delete=False; remove it so a
+                # failed render does not leak it, then let the outer handler report.
+                _remove_file_quietly(overlay_path)
+                raise
+            finally:
+                _remove_file_quietly(preprocessed_path)
+        return output_dict, prediction, overlay_path, trace_points_text
+    except Exception as e:
+        logger.exception("Inference failed (qwenvl)")
+        return {}, f"Error: {str(e)}", None, ""