Anthony Liang committed
Commit 28efb30 · 1 Parent(s): fad52c2

added the functions for now

trace_inference.py CHANGED (+113, -10)
@@ -10,6 +10,10 @@ import logging
 import os
 import tempfile
 from typing import List, Optional, Tuple
+import re
+from pathlib import Path
+import torch
+from typing import Dict, Any
 
 logger = logging.getLogger(__name__)
 
@@ -254,6 +258,112 @@ def build_franka_prompt(task: str) -> str:
         f'The task is "{task}". Can you predict the trace of the end effector?'
     )
 
+def _make_abs_paths(base: Path, files: str) -> str:
+    return f"{(base / files).resolve()}"
+
+def _build_messages(item: Dict[str, Any], base_path: Path) -> List[Dict[str, Any]]:
+    # Extract and normalize images and videos
+    images = item.get("image") or []
+    if isinstance(images, str):
+        images = [images]
+
+    videos = item.get("video") or []
+    if isinstance(videos, str):
+        videos = [videos]
+
+    # Build media pools with absolute paths
+    image_pool = [
+        {"type": "image", "image": _make_abs_paths(base_path, img)} for img in images
+    ]
+    video_pool = [
+        {"type": "video", "video": _make_abs_paths(base_path, vid)} for vid in videos
+    ]
+
+    messages = []
+    for turn in item["conversations"]:
+        role = "user" if turn["from"] == "human" else "assistant"
+        text: str = turn["value"]
+
+        if role == "user":
+            content = []
+            # Split text by <image> or <video> placeholders while keeping delimiters
+            text_parts = re.split(r"(<image>|<video>)", text)
+
+            for seg in text_parts:
+                if seg == "<image>":
+                    if not image_pool:
+                        raise ValueError(
+                            "Number of <image> placeholders exceeds the number of provided images"
+                        )
+                    content.append(image_pool.pop(0))
+                elif seg == "<video>":
+                    if not video_pool:
+                        raise ValueError(
+                            "Number of <video> placeholders exceeds the number of provided videos"
+                        )
+                    content.append(video_pool.pop(0))
+                elif seg.strip():
+                    content.append({"type": "text", "text": seg.strip()})
+
+            messages.append({"role": role, "content": content})
+        else:
+            # Assistant messages contain only text
+            messages.append({"role": role, "content": [{"type": "text", "text": text}]})
+
+    # Check for unused media files
+    if image_pool:
+        raise ValueError(
+            f"{len(image_pool)} image(s) remain unused (not consumed by placeholders)"
+        )
+    if video_pool:
+        raise ValueError(
+            f"{len(video_pool)} video(s) remain unused (not consumed by placeholders)"
+        )
+
+    return messages
+
+IGNORE_INDEX = -100
+
+def preprocess_qwen_visual(
+    sources,
+    processor,
+) -> Dict:
+    if len(sources) != 1:
+        raise ValueError(f"Expected 1 source, got {len(sources)}")
+
+    source = sources[0]
+    base_path = Path(source.get("data_path", ""))
+    messages = _build_messages(source, base_path)
+
+    full_result = processor.apply_chat_template(
+        messages, tokenize=True, return_dict=True, return_tensors="pt"
+    )
+
+    input_ids = full_result["input_ids"]
+    if isinstance(input_ids, list):
+        input_ids = torch.tensor(input_ids).unsqueeze(0)
+
+    labels = torch.full_like(input_ids, IGNORE_INDEX)
+
+    input_ids_flat = input_ids[0].tolist()
+    L = len(input_ids_flat)
+    pos = 0
+    while pos < L:
+        if input_ids_flat[pos] == 77091:
+            ans_start = pos + 2
+            ans_end = ans_start
+            while ans_end < L and input_ids_flat[ans_end] != 151645:
+                ans_end += 1
+            if ans_end < L:
+                labels[0, ans_start : ans_end + 2] = input_ids[
+                    0, ans_start : ans_end + 2
+                ]
+            pos = ans_end
+        pos += 1
+
+    full_result["labels"] = labels
+    full_result["input_ids"] = input_ids
+    return full_result
 
 def run_inference_qwenvl(
     image_path: str,
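For context on how the vendored helpers above are meant to be driven: a minimal usage sketch, assuming a Qwen2-VL-style AutoProcessor from transformers. The checkpoint name, file paths, and sample record are hypothetical, shaped only to match the "data_path", "image", and "conversations" keys that preprocess_qwen_visual reads.

# Hedged sketch -- checkpoint, paths, and sample record are hypothetical.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

sample = {
    "data_path": "/data/franka",            # base dir resolved by _make_abs_paths
    "image": "episodes/ep0001/frame0.png",  # a bare string is normalized to a list
    "conversations": [
        {
            "from": "human",
            "value": '<image>\nThe task is "pick up the cube". '
                     "Can you predict the trace of the end effector?",
        },
        {"from": "gpt", "value": "[(0.52, 0.31), (0.48, 0.27), (0.45, 0.22)]"},
    ],
}

batch = preprocess_qwen_visual([sample], processor)
print(batch["input_ids"].shape, batch["labels"].shape)

For this record, _build_messages yields one user turn whose content list interleaves the image dict and the prompt text in placeholder order, plus one text-only assistant turn. The hard-coded ids in the masking loop look like Qwen2 tokenizer constants: 151645 is <|im_end|>, and 77091 appears to be the id of the literal token "assistant" in the <|im_start|>assistant\n header, which is why ans_start = pos + 2 skips the role word and the following newline.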
@@ -275,16 +385,6 @@ def run_inference_qwenvl(
         (output_dict, prediction_text, overlay_path, trace_points_text)
         output_dict has format: {"id", "image", "conversations": [human_msg, gpt_msg]}
     """
-    try:
-        from qwenvl.data.data_processor import preprocess_qwen_visual
-    except ImportError as e:
-        return (
-            {},
-            "",
-            None,
-            f"qwenvl package not found: {e}. Install qwen-vl-finetune or add to PYTHONPATH.",
-        )
-
     success, msg = load_model(model_id)
     if not success:
         return {}, msg, None, ""
@@ -315,6 +415,9 @@
         [inference_sample], processor, add_gen_prompt=True
     )
 
+    print("processed_data")
+    print(processed_data)
+
     input_ids = processed_data["input_ids"].to(model.device)
    pixel_values = (
         processed_data["pixel_values"].to(model.device)
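One way to sanity-check the masking in preprocess_qwen_visual is to decode only the positions left unmasked; a small sketch under the same assumptions as the earlier example (batch and processor come from it):

# Decode the supervised span; everything else is IGNORE_INDEX (-100).
supervised = batch["input_ids"][0][batch["labels"][0] != -100]
print(processor.tokenizer.decode(supervised))
# Expected: the gpt turn's trace text followed by <|im_end|>.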