HongzeFu commited on
Commit
06c11b0
·
0 Parent(s):

HF Space: code-only (no binary assets)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +217 -0
  2. .python-version +1 -0
  3. README.md +10 -0
  4. app.py +72 -0
  5. doc/env_format.md +117 -0
  6. doc/h5_data_format.md +71 -0
  7. doc/submission/model_example.md +69 -0
  8. gradio-web/AAAuser_generator.py +147 -0
  9. gradio-web/config.py +41 -0
  10. gradio-web/gradio_callbacks.py +997 -0
  11. gradio-web/image_utils.py +716 -0
  12. gradio-web/main.py +61 -0
  13. gradio-web/note_content.py +181 -0
  14. gradio-web/oracle_logic.py +975 -0
  15. gradio-web/process_session.py +448 -0
  16. gradio-web/scripts/run_background.sh +287 -0
  17. gradio-web/scripts/后台运行说明.md +288 -0
  18. gradio-web/state_manager.py +473 -0
  19. gradio-web/test/conftest.py +39 -0
  20. gradio-web/test/test_episode98_removed_behavior.py +107 -0
  21. gradio-web/test/test_execute_stream_frames.py +59 -0
  22. gradio-web/test/test_live_obs_refresh.py +70 -0
  23. gradio-web/test/test_option_label_format.py +196 -0
  24. gradio-web/test/test_oracle_builder_integration.py +184 -0
  25. gradio-web/test/test_oracle_imports.py +18 -0
  26. gradio-web/test/test_precheck_execute_inputs.py +53 -0
  27. gradio-web/test/test_process_session_sanitize.py +39 -0
  28. gradio-web/test/test_reference_action_callbacks.py +84 -0
  29. gradio-web/test/test_reference_action_oracle.py +117 -0
  30. gradio-web/test/test_ui_native_layout_contract.py +88 -0
  31. gradio-web/test/test_ui_phase_machine_runtime_e2e.py +782 -0
  32. gradio-web/test/test_user_manager_random_flow.py +96 -0
  33. gradio-web/ui_layout.py +547 -0
  34. gradio-web/user_manager.py +178 -0
  35. gradio-web/verify_video_names.py +128 -0
  36. pyproject.toml +34 -0
  37. readme.md +135 -0
  38. requirements.txt +11 -0
  39. scripts/dataset_replay.py +268 -0
  40. scripts/dev/compare_multi_choice_readers.py +334 -0
  41. scripts/dev/dataset_replay_printType.py +254 -0
  42. scripts/dev/deprecated/dataset_replay-FK-parallel.py +335 -0
  43. scripts/dev/deprecated/dataset_replay-FK.py +264 -0
  44. scripts/dev/deprecated/dataset_replay-ee-parallel.py +214 -0
  45. scripts/dev/deprecated/dataset_replay-ee.py +163 -0
  46. scripts/dev/eval-dataset-offline-rpy.py +195 -0
  47. scripts/dev/eval_dataset_replay.py +476 -0
  48. scripts/dev/evaluate_dataset_replay-parallelv3.py +669 -0
  49. scripts/dev/evaluate_dataset_replay-parallelv4-noresolver.py +676 -0
  50. scripts/dev/generate-dataset-control-seed-readJson-advanceV3.py +878 -0
.gitignore ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # Agent
210
+ .agent/
211
+ .cursor/
212
+
213
+ # Local temp demo files
214
+ temp_demos/
215
+
216
+ # Gradio user action logs
217
+ gradio/data/user_action_logs/
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.11
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: RoboMME Oracle Planner
3
+ sdk: gradio
4
+ app_file: gradio/main.py
5
+ python_version: "3.11"
6
+ ---
7
+
8
+ This Space runs the RoboMME Gradio interface in single-instance session mode.
9
+
10
+ Project docs are in `readme.md`.
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hugging Face Spaces entrypoint for RoboMME Gradio app."""
2
+
3
+ import os
4
+ import sys
5
+ import tempfile
6
+ from pathlib import Path
7
+
8
# Resolve key project directories relative to this file.
APP_DIR = Path(__file__).resolve().parent
GRADIO_WEB_DIR = APP_DIR / "gradio-web"
SRC_DIR = APP_DIR / "src"
VIDEOS_DIR = GRADIO_WEB_DIR / "videos"
TEMP_DEMOS_DIR = APP_DIR / "temp_demos"
CWD_TEMP_DEMOS_DIR = Path.cwd() / "temp_demos"

# Ensure local modules are importable when running from repository root (HF Spaces).
for _candidate in (GRADIO_WEB_DIR, SRC_DIR, APP_DIR):
    _abs_path = str(_candidate.resolve())
    if _abs_path not in sys.path:
        sys.path.insert(0, _abs_path)
20
+
21
+ from state_manager import start_timeout_monitor
22
+ from ui_layout import create_ui_blocks
23
+
24
+
25
def ensure_media_dirs() -> None:
    """Create the temp media directories (app-relative and CWD-relative) before first write."""
    for directory in (TEMP_DEMOS_DIR, CWD_TEMP_DEMOS_DIR):
        directory.mkdir(parents=True, exist_ok=True)
29
+
30
+
31
def build_allowed_paths() -> list[str]:
    """Build the Gradio file-access allowlist: absolute paths, first-occurrence order, deduplicated."""
    candidates = (
        Path.cwd(),
        APP_DIR,
        GRADIO_WEB_DIR,
        SRC_DIR,
        VIDEOS_DIR,
        TEMP_DEMOS_DIR,
        CWD_TEMP_DEMOS_DIR,
        Path(tempfile.gettempdir()),
    )
    # dict preserves insertion order, so this dedupes while keeping priority order.
    return list(dict.fromkeys(str(path.resolve()) for path in candidates))
52
+
53
+
54
def main() -> None:
    """Prepare media dirs, start session monitoring, and launch the Gradio app.

    Binds to 0.0.0.0 on $PORT (default 7860) as required by HF Spaces.
    """
    ensure_media_dirs()
    start_timeout_monitor()

    # Let downstream modules find the temp demo dir without importing this module.
    os.environ.setdefault("ROBOMME_TEMP_DEMOS_DIR", str(TEMP_DEMOS_DIR))

    demo = create_ui_blocks()
    demo.queue(default_concurrency_limit=2)
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
        allowed_paths=build_allowed_paths(),
    )
69
+
70
+
71
if __name__ == "__main__":
    # Script entrypoint (HF Spaces runs this file directly).
    main()
doc/env_format.md ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment Input/Output
2
+
3
+ On RoboMME, a key difference from traditional Gym-like envs is that every observation value is a **list** rather than a single item. This is because some RoboMME tasks use conditioning video input, and for discrete action types (e.g. waypoint or multi_choice) we also return intermediate observations for potential use with video-based policy models.
4
+
5
+
6
+ ## Env Input Format
7
+
8
+ We support four `ACTION_SPACE` types:
9
+
10
+ - `joint_angle`: 7 joint angles + gripper open/close
11
+ - `ee_pose`: 3 position (xyz) + 3 rotation (rpy) + gripper open/close
12
+ - `waypoint`: Same format as ee_pose, but executed in discrete keyframe steps
13
+ - `multi_choice`: Command dict, e.g. `{"choice": "A", "point": [y, x]}`; the total choices can be found in `info["available_multi_choices"]`, where the `point` is the pixel location on the front image. This action is designed for Video-QA research.
14
+
15
+ Note: Gripper closed is -1, gripper open is 1.
16
+
17
+
18
+ ## Env Output Format
19
+
20
+ When calling the `step` function:
21
+
22
+ ```python
23
+ obs, reward, terminated, truncated, info = env.step(action)
24
+ ```
25
+
26
+ | Return | Description | Typical type |
27
+ |--------|-------------|--------------|
28
+ | `obs` | Observation dict | `dict[str, list]` |
29
+ | `info` | Info dict | `dict[str, Any]` |
30
+ | `reward` | Reward value (not used) | scalar tensor |
31
+ | `terminated` | Termination flag | scalar boolean tensor |
32
+ | `truncated` | Truncation flag | scalar boolean tensor |
33
+
34
+ ### `obs` dict
35
+
36
+ | Key | Meaning | Typical content |
37
+ |-----|---------|-----------------|
38
+ | `maniskill_obs` | The original raw env observation from ManiSkill | Raw observation dict |
39
+ | `front_rgb_list` | Front camera RGB List | Image frames, e.g. `(H, W, 3)` |
40
+ | `wrist_rgb_list` | Wrist camera RGB List | Image frames, e.g. `(H, W, 3)` |
41
+ | `front_depth_list` | Front camera depth List | Depth map, e.g. `(H, W, 1)` |
42
+ | `wrist_depth_list` | Wrist camera depth List | Depth map, e.g. `(H, W, 1)` |
43
+ | `eef_state_list` | End-effector state List | `[x, y, z, roll, pitch, yaw]` |
44
+ | `joint_state_list` | Robot joint state List | Joint vector, often 7-D |
45
+ | `gripper_state_list` | Robot gripper state List | 2-D |
46
+ | `front_camera_extrinsic_list` | Front camera extrinsic List | Camera extrinsic matrix |
47
+ | `wrist_camera_extrinsic_list` | Wrist camera extrinsic List | Camera extrinsic matrix |
48
+
49
+
50
+ To use only the current (latest) observation, use `obs[key][-1]`.
51
+
52
+ ### Optional field switches (`include_*`)
53
+
54
+ `BenchmarkEnvBuilder.make_env_for_episode(...)` controls optional observation/info fields through `include_*` flags.
55
+
56
+ Default behavior:
57
+ - All `include_*` flags default to `False`.
58
+ - Without extra flags, env returns RGB + state related fields only.
59
+
60
+ Mapping:
61
+
62
+ | Flag | Added key |
63
+ |------|-----------|
64
+ | `include_maniskill_obs` | `obs["maniskill_obs"]` |
65
+ | `include_front_depth` | `obs["front_depth_list"]` |
66
+ | `include_wrist_depth` | `obs["wrist_depth_list"]` |
67
+ | `include_front_camera_extrinsic` | `obs["front_camera_extrinsic_list"]` |
68
+ | `include_wrist_camera_extrinsic` | `obs["wrist_camera_extrinsic_list"]` |
69
+ | `include_available_multi_choices` | `info["available_multi_choices"]` |
70
+ | `include_front_camera_intrinsic` | `info["front_camera_intrinsic"]` |
71
+ | `include_wrist_camera_intrinsic` | `info["wrist_camera_intrinsic"]` |
72
+
73
+ Special case:
74
+ - If `action_space="multi_choice"`, front camera parameters are forced on internally:
75
+ - `front_camera_extrinsic_list`
76
+ - `front_camera_intrinsic`
77
+ Even if the corresponding `include_front_camera_*` flags are `False`.
78
+
79
+ Example:
80
+
81
+ ```python
82
+ from robomme.env_record_wrapper import BenchmarkEnvBuilder
83
+
84
+ builder = BenchmarkEnvBuilder(
85
+ env_id="VideoUnmaskSwap",
86
+ dataset="test",
87
+ action_space="joint_angle",
88
+ gui_render=False,
89
+ )
90
+
91
+ env = builder.make_env_for_episode(
92
+ episode_idx=0,
93
+ max_steps=1000,
94
+ include_maniskill_obs=False,
95
+ include_front_depth=True,
96
+ include_wrist_depth=False,
97
+ include_front_camera_extrinsic=True,
98
+ include_wrist_camera_extrinsic=False,
99
+ include_available_multi_choices=False,
100
+ include_front_camera_intrinsic=True,
101
+ include_wrist_camera_intrinsic=False,
102
+ )
103
+
104
+ obs, info = env.reset()
105
+ ```
106
+
107
+ ### `info` dict
108
+
109
+ | Key | Meaning | Typical content |
110
+ |-----|---------|-----------------|
111
+ | `task_goal` | Task goal list | `list[str]` |
112
+ | `simple_subgoal_online` | Oracle online simple subgoal | Description of the current simple subgoal |
113
+ | `grounded_subgoal_online` | Oracle online grounded subgoal | Description of the current grounded subgoal |
114
+ | `available_multi_choices` | Current available options for multi-choice action | List of e.g. `{"label": "a/b/...", "action": str, "need_parameter": bool}`, where `need_parameter` means this action needs grounding info like `[y, x]` |
115
+ | `front_camera_intrinsic` | Front camera intrinsic | Camera intrinsic matrix |
116
+ | `wrist_camera_intrinsic` | Wrist camera intrinsic | Camera intrinsic matrix |
117
+ | `status` | Status flag | One of `success`, `fail`, `timeout`, `ongoing`, `error` |
doc/h5_data_format.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HDF5 Training Data Format
2
+
3
+ Structure inside each `record_dataset_<EnvID>.h5` file:
4
+
5
+ ```text
6
+ episode_1/
7
+ setup/
8
+ timestep_1/
9
+ obs/
10
+ action/
11
+ info/
12
+ timestep_2/
13
+ obs/
14
+ action/
15
+ info/
16
+ ...
17
+ ...
18
+ ```
19
+
20
+ Each episode contains:
21
+ - `setup/`: episode-level configuration.
22
+ - `timestep_<K>/`: per-timestep data.
23
+
24
+ ## `setup/` fields (episode configuration)
25
+
26
+ | Field | Type | Description |
27
+ |-------|------|-------------|
28
+ | `seed` | `int` | Environment seed (fixed for benchmarking) |
29
+ | `difficulty` | `str` | Difficulty level (fixed for benchmarking) |
30
+ | `task_goal` | `list[str]` | Possible language goals for the task |
31
+ | `front_camera_intrinsic` | `float32 (3, 3)` | Front camera intrinsic matrix |
32
+ | `wrist_camera_intrinsic` | `float32 (3, 3)` | Wrist camera intrinsic matrix |
33
+ | `available_multi_choices` | `str` | Available options for the multi-choice Video-QA problem |
34
+
35
+ ## `obs/` fields (observations)
36
+
37
+ | Field | Type / shape | Description |
38
+ |-------|---------------|-------------|
39
+ | `front_rgb` | `uint8 (512, 512, 3)` | Front camera RGB |
40
+ | `wrist_rgb` | `uint8 (256, 256, 3)` | Wrist camera RGB |
41
+ | `front_depth` | `int16 (512, 512, 1)` | Front camera depth (mm) |
42
+ | `wrist_depth` | `int16 (256, 256, 1)` | Wrist camera depth (mm) |
43
+ | `joint_state` | `float32 (7,)` | Joint positions (7 joints) |
44
+ | `eef_state` | `float32 (6,)` | End-effector pose `[x, y, z, roll, pitch, yaw]` |
45
+ | `gripper_state` | `float32 (2,)` | Gripper opening width in [0, 0.04] |
46
+ | `is_gripper_close` | `bool` | Whether gripper is closed |
47
+ | `front_camera_extrinsic` | `float32 (3, 4)` | Front camera extrinsic matrix |
48
+ | `wrist_camera_extrinsic` | `float32 (3, 4)` | Wrist camera extrinsic matrix |
49
+
50
+ ## `action/` fields
51
+
52
+ | Field | Type / shape | Description |
53
+ |-------|---------------|-------------|
54
+ | `joint_action` | `float32 (8,)` | Joint-space action: 7 joint angles + gripper |
55
+ | `eef_action` | `float32 (7,)` | End-effector action `[x, y, z, roll, pitch, yaw, gripper]` |
56
+ | `waypoint_action` | `float32 (7,)` | End-effector action at discrete time steps; a subtask may contain multiple waypoint actions. Used for data generation. |
57
+ | `choice_action` | `str` | JSON string for multi-choice selection with an optional grounded pixel location on the front image, e.g., `{"choice": "A", "point": [y, x]}` |
58
+
59
+ In RoboMME, a gripper action of -1 means close and 1 means open.
60
+
61
+ ## `info/` fields (metadata)
62
+
63
+ | Field | Type | Description |
64
+ |-------|------|-------------|
65
+ | `simple_subgoal` | `bytes (UTF-8)` | Simple subgoal text (built-in planner view) |
66
+ | `simple_subgoal_online` | `bytes (UTF-8)` | Simple subgoal text (online view; may advance to the next subgoal earlier than planner view) |
67
+ | `grounded_subgoal` | `bytes (UTF-8)` | Grounded subgoal text (built-in planner view) |
68
+ | `grounded_subgoal_online` | `bytes (UTF-8)` | Grounded subgoal text (online view; may advance to the next subgoal earlier than planner view) |
69
+ | `is_video_demo` | `bool` | Whether this frame is from the conditioning video shown before execution |
70
+ | `is_subgoal_boundary` | `bool` | Whether this is a keyframe (i.e., a boundary between subtasks) |
71
+ | `is_completed` | `bool` | Whether the task is finished |
doc/submission/model_example.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Your Cool Model Name
2
+
3
+ ### [Website]() | [Paper]() | [Code]()
4
+
5
+ ## Introduction
6
+ My cool model leverages a novel representation for history keyframes and maintains a memory cache to integrate with diffusion policy.
7
+
8
+ ## Results
9
+
10
+ > We ask for **at least three runs** with different model seeds to decrease the performance fluctuations.
11
+ > The benchmark seed is fixed internally.
12
+
13
+ ### Table
14
+
15
+ <table>
16
+ <tr>
17
+ <th rowspan="2">Suite</th>
18
+ <th rowspan="2">Task</th>
19
+ </tr>
20
+ <tr>
21
+ <th>Seed 7</th><th>Seed 42</th><th>Seed 0</th><th><b>Avg</b></th>
22
+ </tr>
23
+ <tr>
24
+ <td rowspan="4">Counting</td>
25
+ <td>BinFill</td><td></td><td></td><td></td><td></td>
26
+ </tr>
27
+ <tr><td>PickXtimes</td><td></td><td></td><td></td><td></td></tr>
28
+ <tr><td>SwingXtimes</td><td></td><td></td><td></td><td></td></tr>
29
+ <tr><td>StopCube</td><td></td><td></td><td></td><td></td></tr>
30
+ <tr>
31
+ <td rowspan="4">Permanence</td>
32
+ <td>VideoUnmask</td><td></td><td></td><td></td><td></td>
33
+ </tr>
34
+ <tr><td>VideoUnmaskSwap</td><td></td><td></td><td></td><td></td></tr>
35
+ <tr><td>ButtonUnmask</td><td></td><td></td><td></td><td></td></tr>
36
+ <tr><td>ButtonUnmaskSwap</td><td></td><td></td><td></td><td></td></tr>
37
+ <tr>
38
+ <td rowspan="4">Reference</td>
39
+ <td>PickHighlight</td><td></td><td></td><td></td><td></td>
40
+ </tr>
41
+ <tr><td>VideoRepick</td><td></td><td></td><td></td><td></td></tr>
42
+ <tr><td>VideoPlaceButton</td><td></td><td></td><td></td><td></td></tr>
43
+ <tr><td>VideoPlaceOrder</td><td></td><td></td><td></td><td></td></tr>
44
+ <tr>
45
+ <td rowspan="4">Imitation</td>
46
+ <td>MoveCube</td><td></td><td></td><td></td><td></td>
47
+ </tr>
48
+ <tr><td>InsertPeg</td><td></td><td></td><td></td><td></td></tr>
49
+ <tr><td>PatternLock</td><td></td><td></td><td></td><td></td></tr>
50
+ <tr><td>RouteStick</td><td></td><td></td><td></td><td></td></tr>
51
+ <tr>
52
+ <td colspan="2"><b>Overall</b></td><td></td><td></td><td></td><td></td>
53
+ </tr>
54
+ </table>
55
+
56
+
57
+ ### Training Details
58
+
59
+ Any hyperparameters you would like to share
60
+
61
+ ### Released Checkpoints
62
+
63
+ Any fine-tuned checkpoints you would like to release
64
+
65
+ > We highly encourage authors to fully release their training/eval code and checkpoints to help the community accelerate memory-augmented manipulation.
66
+
67
+ ### Citations
68
+ ```
69
+ ```
gradio-web/AAAuser_generator.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+
4
# Deprecated runtime path:
# This script is only for offline generation experiments and is not used by
# the current Gradio runtime task assignment flow.

# Task environments, grouped by benchmark suite.
ENVS = [
    # Counting
    "BinFill",
    "PickXtimes",
    "SwingXtimes",
    "StopCube",
    # Persistence
    "VideoUnmask",
    "ButtonUnmask",
    "VideoUnmaskSwap",
    "ButtonUnmaskSwap",
    # Reference
    "PickHighlight",
    "VideoRepick",
    "VideoPlaceButton",
    "VideoPlaceOrder",
    # Behavior
    "MoveCube",
    "InsertPeg",
    "PatternLock",
    "RouteStick",
]

# Named annotators; slots beyond this list get generic "userN" keys.
# NOTE: order matters — user keys are assigned by index.
REAL_USERS = [
    "Hongyu_Zhou",
    "Wanling_Cai",
    "Xinyi_Wang",
    "Yinpei_Dai",
    "Hongze_Fu",
    "Run_Peng",
    "Haoran_Zhang",
    "Yunqi_Zhao",
    "Yue_Hu",
    "Yiwei_Lyu",
    "Josue_Torres-Fonseca",
    "Jung-Chun_Liu",
    "Jacob_Sansom",
    "Long-Jing_Hsu",
]

NUM_USERS = 20         # total users to generate assignments for
EPISODES_PER_ENV = 50  # episodes available per environment
TEST_EPISODE_IDX = 98  # shared test/warm-up episode index
55
+
56
+
57
def generate_json(seed: int = 0):
    """Build a reproducible per-user task assignment.

    Phase 1 gives every user one not-yet-assigned episode from every env;
    phase 2 shuffles the leftover episodes and deals them out evenly, with
    any remainder going one-per-user to the first users. Each user's list
    is prefixed with the shared test episodes.

    Args:
        seed: RNG seed; the same seed always yields the same assignment.

    Returns:
        dict mapping user key -> list of {"env_id", "episode_idx"} tasks.
    """
    rng = random.Random(seed)

    # All candidate tasks, per environment.
    env_tasks = {
        env: [{"env_id": env, "episode_idx": ep} for ep in range(EPISODES_PER_ENV)]
        for env in ENVS
    }

    # User keys: real annotator names first, generic fillers after.
    user_keys = [
        REAL_USERS[i] if i < len(REAL_USERS) else f"user{i + 1}"
        for i in range(NUM_USERS)
    ]

    users = {key: [] for key in user_keys}

    # Phase 1: every user receives one unused episode from each environment.
    # NOTE: if EPISODES_PER_ENV < NUM_USERS, later users silently get none
    # for exhausted envs (the `if available` guard) — same as before.
    used_tasks = {env: set() for env in ENVS}
    for user_key in user_keys:
        for env in ENVS:
            available = [
                task for task in env_tasks[env]
                if task["episode_idx"] not in used_tasks[env]
            ]
            if available:
                picked = rng.choice(available)
                users[user_key].append(picked)
                used_tasks[env].add(picked["episode_idx"])

    # Phase 2: collect the untouched tasks, shuffle once, deal evenly.
    remaining_tasks = [
        task
        for env in ENVS
        for task in env_tasks[env]
        if task["episode_idx"] not in used_tasks[env]
    ]
    rng.shuffle(remaining_tasks)

    remaining_per_user = len(remaining_tasks) // NUM_USERS
    for i, user_key in enumerate(user_keys):
        users[user_key].extend(
            remaining_tasks[i * remaining_per_user:(i + 1) * remaining_per_user]
        )

    # Remainder: one extra task each for the first few users.
    remainder = len(remaining_tasks) % NUM_USERS
    start_idx = remaining_per_user * NUM_USERS
    for i in range(remainder):
        users[user_keys[i]].append(remaining_tasks[start_idx + i])

    # Shared test episodes go in front of each user's training tasks.
    test_template = [
        {"env_id": env, "episode_idx": TEST_EPISODE_IDX}
        for env in ENVS
    ]
    return {key: test_template + users[key] for key in user_keys}
136
+
137
+
138
+ if __name__ == "__main__":
139
+ data = generate_json(seed=42)
140
+
141
+ with open("user_tasks.json", "w", encoding="utf-8") as f:
142
+ json.dump(data, f, indent=2, ensure_ascii=False)
143
+
144
+ counts = {k: len(v) for k, v in data.items() if not k.endswith("_test")}
145
+ print("Train counts:", counts)
146
+ print("Min/Max:", min(counts.values()), max(counts.values()))
147
+ print("✅ 已生成并保存到 user_tasks.json")
gradio-web/config.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 配置常量模块
3
+ """
4
+ # --- Configuration ---
5
+ VIDEO_PLAYBACK_FPS = 30.0 # Frame rate for demonstration video playback
6
+ USE_SEGMENTED_VIEW = False # Set to True to use segmented view, False to use original image
7
+
8
+ # 主界面两列宽度比例 (Keypoint Selection : Right Panel)
9
+ KEYPOINT_SELECTION_SCALE = 1
10
+ CONTROL_PANEL_SCALE = 2
11
+
12
+ # 右侧顶部并排比例 (Action Selection : System Log)
13
+ RIGHT_TOP_ACTION_SCALE = 2
14
+ RIGHT_TOP_LOG_SCALE = 1
15
+
16
+ # Session超时配置
17
+ SESSION_TIMEOUT = 300 # Session超时时间(秒),如果30秒内没有execute_step操作,将自动回收session
18
+
19
+ # 兜底执行次数配置
20
+ EXECUTE_LIMIT_OFFSET = 4 # 兜底执行次数 = non_demonstration_task_length + EXECUTE_LIMIT_OFFSET
21
+
22
+
23
+ # 应该显示demonstration videos的环境ID列表
24
+ DEMO_VIDEO_ENV_IDS = [
25
+ "VideoPlaceOrder",
26
+ "VideoUnmaskSwap",
27
+ "VideoUnmask",
28
+ "VideoRepick",
29
+ "VideoPlaceButton",
30
+ "InsertPeg",
31
+ "MoveCube",
32
+ "PatternLock",
33
+ "RouteStick"
34
+ ]
35
+
36
def should_show_demo_video(env_id):
    """Return True when *env_id* should display demonstration videos.

    Only environments listed in DEMO_VIDEO_ENV_IDS show demo videos.
    """
    return env_id in DEMO_VIDEO_ENV_IDS
gradio-web/gradio_callbacks.py ADDED
@@ -0,0 +1,997 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio回调函数模块
3
+ 响应UI事件,调用业务逻辑,返回UI更新
4
+ """
5
+ import gradio as gr
6
+ import numpy as np
7
+ import time
8
+ import threading
9
+ import queue
10
+ import os
11
+ import re
12
+ from datetime import datetime
13
+ from PIL import Image
14
+ from state_manager import (
15
+ get_session,
16
+ create_session,
17
+ set_ui_phase,
18
+ reset_ui_phase,
19
+ get_execute_count,
20
+ increment_execute_count,
21
+ reset_execute_count,
22
+ set_task_start_time,
23
+ update_session_activity,
24
+ get_session_activity,
25
+ cleanup_session,
26
+ reset_play_button_clicked,
27
+ GLOBAL_SESSIONS,
28
+ SESSION_LAST_ACTIVITY,
29
+ _state_lock,
30
+ )
31
+ from image_utils import draw_marker, save_video, concatenate_frames_horizontally
32
+ from user_manager import user_manager
33
+ from config import USE_SEGMENTED_VIEW, should_show_demo_video, SESSION_TIMEOUT, EXECUTE_LIMIT_OFFSET
34
+ from process_session import ScrewPlanFailureError, ProcessSessionProxy
35
+ from note_content import get_task_hint
36
+
37
+
38
+ # --- live_obs refresh queue state ---
39
+ # Each uid keeps its own FIFO queue and sampling cursor.
40
+ _LIVE_OBS_REFRESH = {}
41
+ _LIVE_OBS_REFRESH_LOCK = threading.Lock()
42
+
43
+
44
def capitalize_first_letter(text: str) -> str:
    """Return *text* with its first character upper-cased, the rest untouched."""
    if not text:
        return text
    head, tail = text[0], text[1:]
    return head.upper() + tail
51
+
52
+
53
def get_videoplacebutton_goal(original_goal: str) -> str:
    """
    Build the display goal for the VideoPlaceButton task.

    Replaces the phrase "cube on the target" (case-insensitively) with
    "cube on the target that it was previously placed on", then capitalizes
    the first letter. When the phrase is absent, the goal is returned
    unchanged apart from capitalization.

    Args:
        original_goal: Raw language goal from the episode; may be empty/None.

    Returns:
        str: Adjusted, capitalized goal ("" for falsy input).
    """
    if not original_goal:
        return ""

    # re.sub returns the input unchanged when the pattern does not match,
    # so the previous lower()-copy + explicit containment pre-check was
    # redundant work; a single case-insensitive substitution suffices.
    pattern = re.compile(r"cube on the target", re.IGNORECASE)
    new_goal = pattern.sub(
        "cube on the target that it was previously placed on", original_goal
    )
    return capitalize_first_letter(new_goal)
72
+
73
+
74
+ def _ui_option_label(session, opt_label: str, opt_idx: int) -> str:
75
+ """
76
+ 仅在 Gradio UI 层对选项显示文案做覆盖(不改底层 env/options 生成逻辑)。
77
+ 目前只对 RouteStick 任务把 4 个长句 label 显示为短 label。
78
+ """
79
+ env_id = getattr(session, "env_id", None)
80
+ if env_id == "RouteStick":
81
+ routestick_map = {
82
+ 0: "move left clockwise",
83
+ 1: "move right clockwise",
84
+ 2: "move left counterclockwise",
85
+ 3: "move right counterclockwise",
86
+ }
87
+ return routestick_map.get(int(opt_idx), opt_label)
88
+ return opt_label
89
+
90
+
91
def format_log_markdown(log_message):
    """
    Normalize a log message into plain text for a Textbox.

    Args:
        log_message: Plain-text message (possibly multi-line); any type is
            stringified, None yields "".

    Returns:
        str: Message with Windows/Mac line endings converted to "\n".
    """
    if log_message is None:
        return ""
    text = str(log_message)
    text = text.replace("\r\n", "\n")
    return text.replace("\r", "\n")
104
+
105
+
106
def show_task_hint(uid, current_hint=""):
    """
    Toggle the task hint panel; content is loaded on demand when the user
    clicks the "Task Hint" button.

    When a hint is currently displayed, "" is returned to hide it.
    Otherwise the hint text for the active session's env_id is resolved
    via get_task_hint().

    Args:
        uid: Session identifier used to look up the active session.
        current_hint: Text currently shown; drives the show/hide toggle.

    Returns:
        str: Hint markdown, "" (hidden / no session), or a fallback
        message when no environment has been loaded yet.
    """
    # A non-empty hint is visible -> toggle it off.
    if current_hint and current_hint.strip():
        return ""

    session = get_session(uid)
    if not session:
        # No session: the frontend simply shows nothing.
        return ""

    env_id = getattr(session, "env_id", None)
    if not env_id:
        return "No environment loaded."
    return get_task_hint(env_id)
146
+
147
+
148
def show_loading_info():
    """
    Show the full-screen loading overlay.

    Behavior:
    - Called when the user clicks login / load-task style buttons.
    - Returns a gr.update that makes the overlay group visible; the overlay
      covers the page so the user cannot interact while loading runs.
    - The actual loading callback (chained via .then()) hides the overlay
      again by returning gr.update(visible=False) for the same component.

    Flow:
    1. User clicks a button (Login, Next Task, ...).
    2. The click event first calls this function to show the overlay.
    3. The real loader (e.g. login_and_load_task) runs via .then().
    4. On completion the loader hides the overlay.

    Returns:
        gr.update: Makes the loading overlay group visible.
    """
    return gr.update(visible=True)
168
+
169
+
170
def on_video_end(uid):
    """
    Called when the demonstration video finishes playing.
    Updates the system log to prompt for action selection.

    Args:
        uid: Session identifier (unused here; kept for event-signature parity).

    Returns:
        str: Normalized plain-text prompt for the System Log textbox.
    """
    return format_log_markdown("please select the action below 👇🏻,\nsome actions also need to select keypoint")
176
+
177
+
178
def switch_to_execute_phase(uid):
    """
    Enter execution playback: disable the control panel and keypoint
    clicking, and (re)initialize the per-uid live_obs refresh state.

    Returns a 6-tuple of gr.update(interactive=False) for, in order:
    options_radio, exec_btn, restart_episode_btn, next_task_btn,
    img_display, reference_action_btn.
    """
    if uid:
        session = get_session(uid)
        if session:
            cached_frames = getattr(session, "base_frames", []) or []
        else:
            cached_frames = []
        with _LIVE_OBS_REFRESH_LOCK:
            _LIVE_OBS_REFRESH[uid] = {
                "frame_queue": queue.Queue(),
                "last_base_count": len(cached_frames),
                "take_next": True,  # downsample x2 by enqueueing every other frame
            }
    # All six wired components are simply disabled during playback.
    return tuple(gr.update(interactive=False) for _ in range(6))
197
+
198
+
199
def switch_to_action_phase(uid=None):
    """Switch display to action phase and restore control panel interactions.

    Drops the uid's live_obs refresh state so stale frames are not replayed.
    The three buttons are returned as bare gr.update() so they keep whatever
    interactivity execute_step decided for them.
    """
    if uid:
        with _LIVE_OBS_REFRESH_LOCK:
            _LIVE_OBS_REFRESH.pop(uid, None)
    return (
        gr.update(interactive=True),  # options_radio
        gr.update(),  # exec_btn (keep execute_step result)
        gr.update(),  # restart_episode_btn (keep execute_step result)
        gr.update(),  # next_task_btn (keep execute_step result)
        gr.update(interactive=True),  # img_display
        gr.update(interactive=True),  # reference_action_btn
    )
212
+
213
+
214
def _get_live_obs_refresh_state(uid, base_count=0):
    """
    Return the per-uid live_obs refresh state, creating it under the lock
    on first access.

    The state dict holds the FIFO frame queue, the sampling cursor into
    base_frames ("last_base_count"), and the x2-downsampling toggle.
    """
    fresh_state = {
        "frame_queue": queue.Queue(),
        "last_base_count": int(base_count),
        "take_next": True,  # downsample x2 by enqueueing every other frame
    }
    with _LIVE_OBS_REFRESH_LOCK:
        return _LIVE_OBS_REFRESH.setdefault(uid, fresh_state)
223
+
224
+
225
def _enqueue_live_obs_frames(uid, base_frames):
    """
    Push newly appended base_frames into per-uid FIFO queue with x2 downsampling.

    Compares the current frame count against the cursor stored in the uid's
    refresh state; only frames appended since the last call are enqueued,
    and every other one is skipped ("take_next" toggles per frame so the
    toggle carries across calls).

    Args:
        uid: Session identifier; falsy uid is a no-op.
        base_frames: Full frame history for the session (list-like or None).

    Returns:
        int: Approximate number of frames currently queued for playback
        (0 after a detected reset).
    """
    if not uid:
        return 0
    frames = base_frames or []
    state = _get_live_obs_refresh_state(uid, base_count=len(frames))
    frame_queue = state["frame_queue"]
    current_count = len(frames)
    last_count = int(state.get("last_base_count", 0))

    # Session/task reset: history shrank.
    if current_count < last_count:
        with _LIVE_OBS_REFRESH_LOCK:
            state["frame_queue"] = queue.Queue()
            state["last_base_count"] = current_count
            state["take_next"] = True
        return 0

    # Nothing new since the last call.
    if current_count <= last_count:
        return frame_queue.qsize()

    new_frames = frames[last_count:current_count]
    take_next = bool(state.get("take_next", True))
    for frame in new_frames:
        # Enqueue every other non-None frame (x2 downsampling).
        if take_next and frame is not None:
            frame_queue.put(frame)
        take_next = not take_next

    # Persist the cursor and toggle under the lock.
    with _LIVE_OBS_REFRESH_LOCK:
        state["last_base_count"] = current_count
        state["take_next"] = take_next
    return frame_queue.qsize()
259
+
260
+
261
def _wait_for_live_obs_queue_drain(uid, max_wait_sec=None, empty_grace_sec=0.2, poll_sec=0.05):
    """
    Wait for timer-driven live_obs refresh to consume queued frames before phase switch.

    Polls the per-uid frame queue until it has stayed empty for
    ``empty_grace_sec`` seconds, the refresh state disappears, or
    ``max_wait_sec`` elapses.

    Args:
        uid: Session identifier; falsy uid is a no-op.
        max_wait_sec: Hard cap on waiting. When None it is derived from the
            initial queue size (~0.12s per queued frame + 1s buffer),
            clamped to [2, 30] seconds.
        empty_grace_sec: How long the queue must stay empty before returning.
        poll_sec: Sleep between polls.
    """
    if not uid:
        return
    with _LIVE_OBS_REFRESH_LOCK:
        state0 = _LIVE_OBS_REFRESH.get(uid)
        queue0 = state0.get("frame_queue") if state0 else None
    initial_qsize = int(queue0.qsize()) if queue0 is not None else 0
    if max_wait_sec is None:
        # 0.1s tick playback + small buffer, capped to keep UI responsive.
        max_wait_sec = min(30.0, max(2.0, initial_qsize * 0.12 + 1.0))

    start = time.time()
    empty_since = None
    while True:
        if (time.time() - start) >= max_wait_sec:
            break
        with _LIVE_OBS_REFRESH_LOCK:
            state = _LIVE_OBS_REFRESH.get(uid)
            frame_queue = state.get("frame_queue") if state else None
        if frame_queue is None:
            # State removed (e.g. phase switched elsewhere): nothing to drain.
            break
        if frame_queue.qsize() > 0:
            empty_since = None
        else:
            if empty_since is None:
                empty_since = time.time()
            elif (time.time() - empty_since) >= empty_grace_sec:
                break
        time.sleep(poll_sec)
293
+
294
+
295
+ def _prepare_refresh_frame(frame):
296
+ """Normalize cached frame to an RGB uint8 PIL image for gr.Image."""
297
+ if frame is None:
298
+ return None
299
+ frame_arr = np.asarray(frame)
300
+ if frame_arr.dtype != np.uint8:
301
+ max_val = float(np.max(frame_arr)) if frame_arr.size else 0.0
302
+ if max_val <= 1.0:
303
+ frame_arr = (frame_arr * 255.0).clip(0, 255).astype(np.uint8)
304
+ else:
305
+ frame_arr = frame_arr.clip(0, 255).astype(np.uint8)
306
+ if frame_arr.ndim == 2:
307
+ frame_arr = np.stack([frame_arr] * 3, axis=-1)
308
+ elif frame_arr.ndim == 3 and frame_arr.shape[2] == 4:
309
+ frame_arr = frame_arr[:, :, :3]
310
+ return Image.fromarray(frame_arr)
311
+
312
+
313
def refresh_live_obs(uid, ui_phase):
    """
    Poll latest cached frame during execute phase.
    Updates live_obs every 0.1s via gr.Timer.

    Args:
        uid: Session identifier.
        ui_phase: Current UI phase; anything other than
            "execution_playback" is a no-op.

    Returns:
        gr.update: A no-op update when there is nothing to show, otherwise
        the next queued frame (stitched per env layout, normalized to a
        PIL image, rendered non-interactive).
    """
    if ui_phase != "execution_playback":
        return gr.update()
    session = get_session(uid)
    if not session:
        return gr.update()

    base_frames = getattr(session, "base_frames", None) or []
    if not base_frames:
        return gr.update()

    # Pull any newly appended frames into the FIFO queue, then pop one.
    _enqueue_live_obs_frames(uid, base_frames)
    state = _get_live_obs_refresh_state(uid, base_count=len(base_frames))
    frame_queue = state["frame_queue"]

    if frame_queue.empty():
        return gr.update()

    latest = frame_queue.get()
    env_id = getattr(session, "env_id", None)
    # Stitch the single frame according to the env's camera layout.
    stitched = concatenate_frames_horizontally([latest], env_id=env_id)
    if stitched:
        latest = stitched[-1]

    img = _prepare_refresh_frame(latest)
    if img is None:
        return gr.update()
    return gr.update(value=img, interactive=False)
345
+
346
+
347
def on_video_end_transition(uid):
    """Called when demo video finishes. Transition from video to action phase.

    Args:
        uid: Session identifier (unused; kept for event-signature parity).

    Returns:
        Updates for (video_phase_group, action_phase_group,
        control_panel_group, log_output).
    """
    return (
        gr.update(visible=False),  # video_phase_group
        gr.update(visible=True),  # action_phase_group
        gr.update(visible=True),  # control_panel_group
        format_log_markdown("please select the action below 👇🏻,\nsome actions also need to select keypoint")
    )
355
+
356
+
357
def _task_load_failed_response(uid, message):
    """
    Build the standard "task failed to load" UI tuple.

    Shows *message* in the log, clears/hides the task widgets, and disables
    every action control. The element order mirrors the callback outputs
    wired in the UI layout — keep the two in sync.
    """
    return (
        uid,
        gr.update(visible=True),  # main_interface
        gr.update(value=None, interactive=False),  # img_display
        format_log_markdown(message),  # log_output
        gr.update(choices=[], value=None),  # options_radio
        "",  # goal_box
        "No need for coordinates",  # coords_box
        gr.update(value=None, visible=False),  # video_display
        "",  # task_info_box
        "",  # progress_info_box
        gr.update(interactive=False),  # restart_episode_btn
        gr.update(interactive=False),  # next_task_btn
        gr.update(interactive=False),  # exec_btn
        gr.update(visible=False),  # video_phase_group
        gr.update(visible=False),  # action_phase_group
        gr.update(visible=False),  # control_panel_group
        gr.update(value=""),  # task_hint_display
        gr.update(visible=False),  # loading_overlay
        gr.update(interactive=False),  # reference_action_btn
    )
379
+
380
+
381
def _load_status_task(uid, status):
    """Load status.current_task to session and build the standard UI update tuple.

    Three outcomes share the same 19-element output shape (wired to the UI
    layout): episode load failure, a task with a demonstration video, and a
    task without one.

    Args:
        uid: Session identifier (a worker session is created if missing).
        status: user_manager status dict carrying "current_task"
            ({"env_id", "episode_idx"}) and "completed_count".

    Returns:
        The standard UI update tuple (see _task_load_failed_response for
        the element order).
    """
    current_task = status.get("current_task") if isinstance(status, dict) else None
    if not current_task:
        return _task_load_failed_response(uid, "Error loading task: missing current_task")

    env_id = current_task.get("env_id")
    ep_num = current_task.get("episode_idx")
    if env_id is None or ep_num is None:
        return _task_load_failed_response(uid, "Error loading task: invalid task payload")

    try:
        completed_count = int(status.get("completed_count", 0))
    except (TypeError, ValueError):
        completed_count = 0
    progress_text = f"Completed: {completed_count}"

    session = get_session(uid)
    if session is None:
        print(f"Session {uid} not found, creating new session")
        session = ProcessSessionProxy()
        with _state_lock:
            GLOBAL_SESSIONS[uid] = session
            SESSION_LAST_ACTIVITY[uid] = time.time()
        print(f"New session created for {uid}")

    print(f"Loading {env_id} Ep {ep_num} for {uid}")

    # Reset per-episode playback/click/execute bookkeeping before loading.
    with _LIVE_OBS_REFRESH_LOCK:
        _LIVE_OBS_REFRESH.pop(uid, None)
    reset_play_button_clicked(uid)
    reset_execute_count(uid, env_id, int(ep_num))

    img, load_msg = session.load_episode(env_id, int(ep_num))
    actual_env_id = getattr(session, "env_id", None) or env_id

    # Record the task start time only when the episode actually loaded.
    if img is not None:
        start_time = datetime.now().isoformat()
        set_task_start_time(uid, env_id, int(ep_num), start_time)

    if img is None:
        # Load failed: surface load_msg but keep restart/next-task usable.
        set_ui_phase(uid, "executing_task")
        return (
            uid,
            gr.update(visible=True),  # main_interface
            gr.update(value=None, interactive=False),  # img_display
            format_log_markdown(f"Error: {load_msg}"),  # log_output
            gr.update(choices=[], value=None),  # options_radio
            "",  # goal_box
            "No need for coordinates",  # coords_box
            gr.update(value=None, visible=False),  # video_display
            f"{actual_env_id} (Episode {ep_num})",  # task_info_box
            progress_text,  # progress_info_box
            gr.update(interactive=True),  # restart_episode_btn
            gr.update(interactive=True),  # next_task_btn
            gr.update(interactive=False),  # exec_btn
            gr.update(visible=False),  # video_phase_group
            gr.update(visible=True),  # action_phase_group
            gr.update(visible=True),  # control_panel_group
            gr.update(value=get_task_hint(env_id) if env_id else ""),  # task_hint_display
            gr.update(visible=False),  # loading_overlay
            gr.update(interactive=False),  # reference_action_btn
        )

    # VideoPlaceButton gets a rewritten goal; other envs just capitalize.
    if session.env_id == "VideoPlaceButton" and session.language_goal:
        goal_text = get_videoplacebutton_goal(session.language_goal)
    else:
        goal_text = capitalize_first_letter(session.language_goal) if session.language_goal else ""

    # Build the radio choices; options that need a keypoint get a hint suffix.
    options = session.available_options
    radio_choices = []
    for opt_label, opt_idx in options:
        opt_label = _ui_option_label(session, opt_label, opt_idx)
        if 0 <= opt_idx < len(session.raw_solve_options):
            opt = session.raw_solve_options[opt_idx]
            if opt.get("available"):
                opt_label_with_hint = f"{opt_label} (click mouse 🖱️ to select 🎯)"
            else:
                opt_label_with_hint = opt_label
        else:
            opt_label_with_hint = opt_label
        radio_choices.append((opt_label_with_hint, opt_idx))

    demo_video_path = None
    has_demo_video = False
    should_show = should_show_demo_video(actual_env_id) if actual_env_id else False
    initial_log_msg = format_log_markdown("please select the action below 👇🏻,\nsome actions also need to select keypoint")

    if should_show:
        has_demo_video = True
        initial_log_msg = format_log_markdown('press "Watch Video Input🎬" to watch a video\nNote: you can only watch the video once')
        if session.demonstration_frames:
            try:
                demo_video_path = save_video(session.demonstration_frames, "demo")
                if demo_video_path:
                    # Guard against a zero-byte / missing encode result.
                    file_exists = os.path.exists(demo_video_path)
                    file_size = os.path.getsize(demo_video_path) if file_exists else 0
                    if not (file_exists and file_size > 0):
                        demo_video_path = None
            except Exception:
                demo_video_path = None

    img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)

    if has_demo_video:
        set_ui_phase(uid, "executing_task")

        # Demo-video flow: start in the video phase, hide the control panel.
        return (
            uid,
            gr.update(visible=True),  # main_interface
            gr.update(value=img, interactive=False),  # img_display
            initial_log_msg,  # log_output
            gr.update(choices=radio_choices, value=None),  # options_radio
            goal_text,  # goal_box
            "No need for coordinates",  # coords_box
            gr.update(value=demo_video_path, visible=True),  # video_display
            f"{actual_env_id} (Episode {ep_num})",  # task_info_box
            progress_text,  # progress_info_box
            gr.update(interactive=True),  # restart_episode_btn
            gr.update(interactive=True),  # next_task_btn
            gr.update(interactive=True),  # exec_btn
            gr.update(visible=True),  # video_phase_group
            gr.update(visible=False),  # action_phase_group
            gr.update(visible=False),  # control_panel_group
            gr.update(value=get_task_hint(actual_env_id)),  # task_hint_display
            gr.update(visible=False),  # loading_overlay
            gr.update(interactive=True),  # reference_action_btn
        )

    set_ui_phase(uid, "executing_task")

    # No demo video: go straight to the action phase.
    return (
        uid,
        gr.update(visible=True),  # main_interface
        gr.update(value=img, interactive=False),  # img_display
        initial_log_msg,  # log_output
        gr.update(choices=radio_choices, value=None),  # options_radio
        goal_text,  # goal_box
        "No need for coordinates",  # coords_box
        gr.update(value=None, visible=False),  # video_display (no video)
        f"{actual_env_id} (Episode {ep_num})",  # task_info_box
        progress_text,  # progress_info_box
        gr.update(interactive=True),  # restart_episode_btn
        gr.update(interactive=True),  # next_task_btn
        gr.update(interactive=True),  # exec_btn
        gr.update(visible=False),  # video_phase_group
        gr.update(visible=True),  # action_phase_group
        gr.update(visible=True),  # control_panel_group
        gr.update(value=get_task_hint(actual_env_id)),  # task_hint_display
        gr.update(visible=False),  # loading_overlay
        gr.update(interactive=True),  # reference_action_btn
    )
533
+
534
+
535
def init_session_and_load_task(uid):
    """
    Initialize the Gradio session (creating one when uid is falsy) and
    load the user's current task into the UI.

    Returns:
        The standard UI update tuple from _load_status_task, or the
        failure tuple when user_manager cannot initialize the session.
    """
    uid = uid or create_session()

    success, msg, status = user_manager.init_session(uid)

    if uid:
        update_session_activity(uid)

    if success:
        return _load_status_task(uid, status)
    return _task_load_failed_response(uid, msg)
548
+
549
+
550
def load_next_task_wrapper(uid):
    """
    Jump to a random episode within the same env and reload the task UI.

    Returns:
        The standard UI update tuple, or the failure tuple when
        user_manager cannot provide a next episode.
    """
    uid = uid or create_session()

    if uid:
        update_session_activity(uid)

    status = user_manager.next_episode_same_env(uid)
    if status:
        return _load_status_task(uid, status)
    return _task_load_failed_response(uid, "Failed to load next task")
563
+
564
+
565
def restart_episode_wrapper(uid):
    """
    Reload the current env + episode from the user's session status.

    Validates that the status carries a usable current_task (env_id and
    episode_idx both present) before delegating to _load_status_task.
    """
    uid = uid or create_session()

    if uid:
        update_session_activity(uid)

    status = user_manager.get_session_status(uid)
    task = status.get("current_task") if isinstance(status, dict) else None
    if not task:
        return _task_load_failed_response(uid, "Failed to restart episode: missing current task")

    if task.get("env_id") is None or task.get("episode_idx") is None:
        return _task_load_failed_response(uid, "Failed to restart episode: invalid task payload")

    return _load_status_task(uid, status)
584
+
585
+
586
def switch_env_wrapper(uid, selected_env):
    """
    Switch env via the Current Task dropdown; a random episode of the new
    env is assigned. An empty selection just reloads the current status.
    """
    uid = uid or create_session()

    if uid:
        update_session_activity(uid)

    if selected_env:
        status = user_manager.switch_env_and_random_episode(uid, selected_env)
    else:
        status = user_manager.get_session_status(uid)

    if status:
        return _load_status_task(uid, status)
    return _task_load_failed_response(uid, f"Failed to switch environment to '{selected_env}'")
603
+
604
+
605
def on_map_click(uid, option_value, evt: gr.SelectData):
    """
    Handle a click on the keypoint selection image.

    When the currently selected option requires coordinates, a marker is
    drawn at the clicked pixel and "x, y" is returned; otherwise the clean
    observation image plus the default message is re-asserted.

    Args:
        uid: Session identifier.
        option_value: Selected option — either (label, idx) or a bare idx.
        evt: Gradio SelectData carrying the click position in evt.index.

    Returns:
        (PIL image or None, coords/status string)
    """
    # Clicking the image counts as session activity.
    if uid:
        update_session_activity(uid)

    session = get_session(uid)
    if not session:
        return None, "Session Error"

    # Check if current option actually needs coordinates
    needs_coords = False
    if option_value is not None:
        # Parse option index similar to on_option_select
        option_idx = None
        if isinstance(option_value, tuple):
            _, option_idx = option_value
        else:
            option_idx = option_value

        if option_idx is not None and 0 <= option_idx < len(session.raw_solve_options):
            opt = session.raw_solve_options[option_idx]
            if opt.get("available"):
                needs_coords = True

    if not needs_coords:
        # Return current state without changes (or reset to default message if needed, but it should already be there)
        # We return the clean image and the "No need" message to enforce state
        base_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
        return base_img, "No need for coordinates"

    x, y = evt.index[0], evt.index[1]

    # Get clean image from session
    base_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)

    # Draw marker
    marked_img = draw_marker(base_img, x, y)

    coords_str = f"{x}, {y}"

    return marked_img, coords_str
649
+
650
+
651
+ def _is_valid_coords_text(coords_text: str) -> bool:
652
+ if not isinstance(coords_text, str):
653
+ return False
654
+ text = coords_text.strip()
655
+ if text in {"", "please click the keypoint selection image", "No need for coordinates"}:
656
+ return False
657
+ if "," not in text:
658
+ return False
659
+ try:
660
+ x_raw, y_raw = text.split(",", 1)
661
+ int(x_raw.strip())
662
+ int(y_raw.strip())
663
+ except Exception:
664
+ return False
665
+ return True
666
+
667
+
668
def on_option_select(uid, option_value, coords_str=None):
    """
    Handle action option selection.

    Decides the coords-box message and whether the execute button becomes
    interactive, based on whether the chosen option requires a keypoint.

    Returns:
        (coords_message, gr.update for the execute button)
    """
    default_msg = "No need for coordinates"

    if option_value is None:
        return default_msg, gr.update(interactive=False)

    # Selecting an option counts as session activity.
    if uid:
        update_session_activity(uid)

    session = get_session(uid)
    if not session:
        return default_msg, gr.update(interactive=False)

    # option_value is either a (label, idx) tuple or a bare index.
    option_idx = option_value[1] if isinstance(option_value, tuple) else option_value

    if 0 <= option_idx < len(session.raw_solve_options):
        if session.raw_solve_options[option_idx].get("available"):
            # Keypoint required: keep already-valid coords, otherwise
            # prompt the user to click the image.
            if _is_valid_coords_text(coords_str):
                return coords_str, gr.update(interactive=True)
            return "please click the keypoint selection image", gr.update(interactive=True)

    return default_msg, gr.update(interactive=False)
700
+
701
+
702
def on_reference_action(uid):
    """
    Fetch the oracle's reference action for the current step and pre-fill
    the UI with it (option selection + marked keypoint) without executing.

    Returns:
        (image, options_radio update, coords text / update, log text)
    """
    if uid:
        update_session_activity(uid)

    session = get_session(uid)
    if not session:
        return (
            None,
            gr.update(),
            "No need for coordinates",
            format_log_markdown("Session Error"),
        )

    current_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)

    try:
        reference = session.get_reference_action()
    except Exception as exc:
        # Resolution failed: keep the current UI, report in the log only.
        return (
            current_img,
            gr.update(),
            gr.update(),
            format_log_markdown(f"Ground Truth Action Error: {exc}"),
        )

    if not isinstance(reference, dict) or not reference.get("ok", False):
        message = "Failed to resolve ground truth action."
        if isinstance(reference, dict) and reference.get("message"):
            message = str(reference.get("message"))
        return (
            current_img,
            gr.update(),
            gr.update(),
            format_log_markdown(f"Ground Truth Action: {message}"),
        )

    option_idx = reference.get("option_idx")
    option_label = str(reference.get("option_label", "")).strip()
    option_action = str(reference.get("option_action", "")).strip()
    need_coords = bool(reference.get("need_coords", False))
    coords_xy = reference.get("coords_xy")

    updated_img = current_img
    coords_text = "No need for coordinates"
    log_text = f"Ground Truth Action: {option_label}. {option_action}".strip()

    # When the reference action carries a keypoint, draw it on the image.
    if need_coords and isinstance(coords_xy, (list, tuple)) and len(coords_xy) >= 2:
        x = int(coords_xy[0])
        y = int(coords_xy[1])
        updated_img = draw_marker(current_img, x, y)
        coords_text = f"{x}, {y}"
        log_text = f"Ground Truth Action: {option_label}. {option_action} | coords: {coords_text}"

    return (
        updated_img,
        gr.update(value=option_idx),
        coords_text,
        format_log_markdown(log_text),
    )
764
+
765
+
766
def init_app(request: gr.Request):
    """
    Handle the initial page load: create a fresh session and load the
    first task immediately.

    Args:
        request: Gradio Request object containing query parameters; they
            are intentionally ignored in session-based mode.

    Returns:
        The standard UI initialization tuple.
    """
    _ = request  # Query params are intentionally ignored in session-based mode.
    new_uid = create_session()
    return init_session_and_load_task(new_uid)
779
+
780
+
781
def precheck_execute_inputs(uid, option_idx, coords_str):
    """
    Native precheck for the execute action: server-side validation that
    replaces the old frontend JS interception, run before the phase switch.

    Raises:
        gr.Error: When the session is missing, no action is selected, or a
            keypoint-requiring option has no valid coordinates.
    """
    if uid:
        update_session_activity(uid)

    session = get_session(uid)
    if not session:
        raise gr.Error("Session Error")

    parsed_idx = option_idx[1] if isinstance(option_idx, tuple) else option_idx
    if parsed_idx is None:
        raise gr.Error("Error: No action selected")

    needs_coords = (
        isinstance(parsed_idx, int)
        and 0 <= parsed_idx < len(session.raw_solve_options)
        and bool(session.raw_solve_options[parsed_idx].get("available"))
    )

    if needs_coords and not _is_valid_coords_text(coords_str):
        raise gr.Error("please click the keypoint selection image before execute!")
810
+
811
+
812
def execute_step(uid, option_idx, coords_str):
    """
    Execute the selected action for the current session and build the UI
    update tuple.

    Flow: session-timeout check -> execute-count limit check -> coordinate
    validation -> action execution (or a simulated failure when the limit
    is reached) -> live_obs playback drain -> episode-completion
    bookkeeping.

    Returns:
        (image, log_text, task_info update, progress update,
         restart_btn update, next_task_btn update, exec_btn update)
    """
    # Check session timeout before refreshing the activity timestamp.
    last_activity = get_session_activity(uid)
    if last_activity is not None:
        elapsed = time.time() - last_activity
        if elapsed > SESSION_TIMEOUT:
            raise gr.Error(f"Session已超时:超过 {SESSION_TIMEOUT} 秒未活动。请刷新页面重新登录。")

    # Refresh the session's last-activity timestamp.
    update_session_activity(uid)

    session = get_session(uid)
    if not session:
        return None, format_log_markdown("Session Error"), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=False)

    # Check the execute-count limit up front; when reached, a failure is
    # simulated further below instead of executing.
    execute_limit_reached = False
    if uid and session.env_id is not None and session.episode_idx is not None:
        # Read non_demonstration_task_length from the session; when present
        # the limit is that length plus the configured offset, otherwise
        # no limit is applied.
        max_execute = None
        if hasattr(session, 'non_demonstration_task_length') and session.non_demonstration_task_length is not None:
            max_execute = session.non_demonstration_task_length + EXECUTE_LIMIT_OFFSET

        if max_execute is not None:
            current_count = get_execute_count(uid, session.env_id, session.episode_idx)
            if current_count >= max_execute:
                execute_limit_reached = True

    # Ensure at least one cached frame exists for timer-based refresh.
    if not session.base_frames:
        session.update_observation(use_segmentation=USE_SEGMENTED_VIEW)

    if option_idx is None:
        return session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW), format_log_markdown("Error: No action selected"), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)

    # Does the selected option require coordinates?
    needs_coords = False
    if option_idx is not None and 0 <= option_idx < len(session.raw_solve_options):
        opt = session.raw_solve_options[option_idx]
        if opt.get("available"):
            needs_coords = True

    # When coordinates are required, verify the user actually clicked the image.
    if needs_coords:
        # coords_str must be a real coordinate pair, not a placeholder message.
        is_valid_coords = False
        if coords_str and "," in coords_str:
            try:
                parts = coords_str.split(",")
                x = int(parts[0].strip())
                y = int(parts[1].strip())
                # Parsed as numbers and not a placeholder -> valid coordinates.
                if coords_str.strip() not in ["please click the keypoint selection image", "No need for coordinates"]:
                    is_valid_coords = True
            except:  # NOTE(review): bare except swallows everything; narrowing to ValueError would be safer.
                pass

        # Required but missing/invalid coordinates: report instead of executing.
        if not is_valid_coords:
            current_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
            error_msg = "please click the keypoint selection image before execute!"
            return current_img, format_log_markdown(error_msg), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)

    # Parse coords
    click_coords = None
    if coords_str and "," in coords_str:
        try:
            parts = coords_str.split(",")
            click_coords = (int(parts[0].strip()), int(parts[1].strip()))
        except:  # NOTE(review): bare except — invalid text silently yields click_coords=None.
            pass

    # Execute
    # When the execute-count limit is reached, simulate a failed episode
    # using the same mechanism as a genuine task failure.
    if execute_limit_reached:
        # Resolve the option label for the status message.
        option_label = None
        if session.available_options:
            for label, idx in session.available_options:
                if idx == option_idx:
                    option_label = _ui_option_label(session, label, idx)
                    break

        # Mimic the failure format produced by oracle_logic.py.
        status = f"Executing: {option_label or 'Action'}"
        status += " | FAILED"  # same format as a real task failure
        done = True  # mark as done so the episode-completion flow runs

        # Current observation image.
        img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)

        # Still counts as an attempt, so bump the execute counter.
        if uid and session.env_id is not None and session.episode_idx is not None:
            new_count = increment_execute_count(uid, session.env_id, session.episode_idx)
            print(f"Execute limit reached for {uid}:{session.env_id}:{session.episode_idx} (count: {new_count})")
    else:
        # Normal execution path.
        # All exceptions (ScrewPlanFailure and other execution errors)
        # surface as popup notifications.
        print(f"Executing step: Opt {option_idx}, Coords {click_coords}")
        try:
            img, status, done = session.execute_action(option_idx, click_coords)
        except ScrewPlanFailureError as e:
            # Screw-plan failure: notify via popup, keep the episode running.
            error_message = str(e)
            gr.Info(f"Robot cannot reach position, Refresh the page and try again.")
            # Keep the current observation and put the error in the status line.
            current_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
            status = f"Screw plan failed: {error_message}"
            done = False
            # Fall through to the normal return flow.
            img = current_img
        except RuntimeError as e:
            # Any other execution error: notify via popup, keep running.
            error_message = str(e)
            gr.Info(f"Cannot find suitable target, Refresh the page and try again.")
            # Keep the current observation and put the error in the status line.
            current_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
            status = f"Error: {error_message}"
            done = False
            # Fall through to the normal return flow.
            img = current_img

        # Count the attempt whether it succeeded or failed — the user has
        # performed an operation either way.
        if uid and session.env_id is not None and session.episode_idx is not None:
            new_count = increment_execute_count(uid, session.env_id, session.episode_idx)
            print(f"Execute count for {uid}:{session.env_id}:{session.episode_idx} = {new_count}")

    # Execute frames are produced in batch when execute_action returns from worker process.
    # Enqueue them now, then wait briefly for the 0.1s timer to drain FIFO playback.
    _enqueue_live_obs_frames(uid, getattr(session, "base_frames", None))
    _wait_for_live_obs_queue_drain(uid)

    # NOTE: during the execute phase the display is refreshed by the
    # 0.1s live_obs polling timer, not by this return value alone.

    progress_update = gr.update()  # leave progress unchanged by default
    task_update = gr.update()

    if done:
        # Determine the final status for bookkeeping.
        final_log_status = "failed"
        if "SUCCESS" in status:
            final_log_status = "success"

        # Episode finished: format the System Log banner.
        # Fixed-width template (32-character lines), no blank lines.
        if final_log_status == "success":
            status = "********************************\n**** episode success ****\n********************************\n ---please press change episode---- "
        else:
            status = "********************************\n**** episode failed ****\n********************************\n ---please press change episode---- "

        # Update the cumulative completion count (no fixed task index anymore).
        if uid:
            seed = getattr(session, 'seed', None)
            user_status = user_manager.complete_current_task(
                uid,
                env_id=session.env_id,
                episode_idx=session.episode_idx,
                status=final_log_status,
                difficulty=session.difficulty if hasattr(session, 'difficulty') and session.difficulty is not None else None,
                language_goal=session.language_goal,
                seed=seed
            )
            if user_status:
                completed_count = user_status.get("completed_count", 0)
                task_update = f"{session.env_id} (Episode {session.episode_idx})"
                progress_update = f"Completed: {completed_count}"

        # Re-fetch the image according to the current view mode.
        img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)

    restart_episode_update = gr.update(interactive=True)
    next_task_update = gr.update(interactive=True)
    exec_btn_update = gr.update(interactive=False) if done else gr.update(interactive=True)

    # Normalize the status message for the log textbox.
    formatted_status = format_log_markdown(status)

    return (
        img,
        formatted_status,
        task_update,
        progress_update,
        restart_episode_update,
        next_task_update,
        exec_btn_update,
    )
gradio-web/image_utils.py ADDED
@@ -0,0 +1,716 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 图像处理工具模块
3
+ 无状态的图像处理函数
4
+ """
5
+ import numpy as np
6
+ import tempfile
7
+ import os
8
+ import traceback
9
+ import math
10
+ from pathlib import Path
11
+ from PIL import Image, ImageDraw, ImageFont
12
+ import cv2
13
+ from config import VIDEO_PLAYBACK_FPS
14
+
15
+ # DEPRECATED: 历史任务特化图像叠加配置,保留仅为兼容旧代码路径。
16
+ # 当前已统一关闭任务特化渲染。
17
+ DEPRECATED_COORDINATE_AXES_ENVS = ["PatternLock", "RouteStick", "InsertPeg", "SwingXtimes"]
18
+ ENABLE_DEPRECATED_COORDINATE_AXES_OVERLAY = False
19
+
20
+
21
+ def _video_output_dirs():
22
+ """视频输出目录候选(按优先级)。"""
23
+ current_dir = Path(__file__).resolve().parent
24
+ project_root = current_dir.parent
25
+ env_dir = os.environ.get("ROBOMME_TEMP_DEMOS_DIR")
26
+
27
+ candidates = [
28
+ Path(env_dir).expanduser() if env_dir else None,
29
+ project_root / "temp_demos",
30
+ current_dir / "temp_demos",
31
+ Path.cwd() / "temp_demos",
32
+ Path(tempfile.gettempdir()) / "robomme_temp_demos",
33
+ ]
34
+
35
+ result = []
36
+ seen = set()
37
+ for path in candidates:
38
+ if path is None:
39
+ continue
40
+ resolved = path.resolve()
41
+ key = str(resolved)
42
+ if key in seen:
43
+ continue
44
+ seen.add(key)
45
+ result.append(resolved)
46
+ return result
47
+
48
+
49
def _write_with_opencv(path, frames):
    """Fallback MP4 writer using OpenCV when imageio is unavailable.

    Frames are expected as uint8 RGB arrays; frames whose size differs
    from the first frame are resized to match. Returns True on success,
    False when there are no frames or the writer cannot be opened.
    """
    if not frames:
        return False

    height, width = frames[0].shape[:2]
    writer = cv2.VideoWriter(
        path,
        cv2.VideoWriter_fourcc(*"mp4v"),
        VIDEO_PLAYBACK_FPS,
        (width, height),
    )
    if not writer.isOpened():
        return False

    try:
        for rgb_frame in frames:
            if rgb_frame.shape[:2] != (height, width):
                rgb_frame = cv2.resize(
                    rgb_frame, (width, height), interpolation=cv2.INTER_LINEAR
                )
            # OpenCV expects BGR channel order.
            writer.write(cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR))
        return True
    finally:
        writer.release()
72
+
73
+
74
def save_video(frames, suffix=""):
    """
    Save a sequence of frames as an MP4 clip.

    Notes:
    1. Prefers imageio.mimwrite (no external FFmpeg encoder dependency),
       falling back to an OpenCV writer when imageio is unavailable.
    2. Frames are normalized to uint8 RGB before encoding.
    3. Each candidate output directory is tried in priority order; the
       path of the first successfully written, non-empty file is
       returned, or None when everything fails.
    """
    if not frames or len(frames) == 0:
        return None

    try:
        def _as_rgb_uint8(frame):
            # Normalize one frame: numpy, uint8, 3-channel RGB.
            arr = frame if isinstance(frame, np.ndarray) else np.array(frame)
            if arr.dtype != np.uint8:
                if np.max(arr) <= 1.0:
                    arr = (arr * 255).astype(np.uint8)
                else:
                    arr = arr.clip(0, 255).astype(np.uint8)
            if arr.ndim == 2:
                arr = np.stack([arr] * 3, axis=-1)
            elif arr.ndim == 3 and arr.shape[2] == 4:
                arr = arr[:, :, :3]
            return arr

        prepared = [_as_rgb_uint8(f) for f in frames]

        try:
            import imageio as _imageio
            imageio = _imageio
        except Exception:
            imageio = None

        for out_dir in _video_output_dirs():
            path = None
            try:
                out_dir.mkdir(parents=True, exist_ok=True)
                fd, path = tempfile.mkstemp(suffix=f"_{suffix}.mp4", dir=str(out_dir))
                os.close(fd)

                if imageio is not None:
                    imageio.mimwrite(
                        path,
                        prepared,
                        fps=VIDEO_PLAYBACK_FPS,
                        quality=8,
                        macro_block_size=None,
                    )
                elif not _write_with_opencv(path, prepared):
                    raise RuntimeError("OpenCV video writer failed")

                if os.path.exists(path) and os.path.getsize(path) > 0:
                    return path

                raise RuntimeError(f"generated empty video: {path}")
            except Exception as e:
                print(f"save_video failed in {out_dir}: {e}")
                traceback.print_exc()
                # Best-effort cleanup of the partial file before trying
                # the next candidate directory.
                try:
                    if path and os.path.exists(path):
                        os.remove(path)
                except Exception:
                    pass

        print("Error in save_video: all video output directories failed")
        return None
    except Exception as e:
        print(f"Error in save_video: {e}")
        traceback.print_exc()
        return None
148
+
149
+
150
def concatenate_frames_horizontally(frames1, frames2=None, env_id=None):
    """
    Post-process a sequence of base-camera frames (wrist camera removed).

    Args:
        frames1: list of base-camera frames.
        frames2: deprecated; accepted for backward compatibility, ignored.
        env_id: environment id, used to decide whether the deprecated
            coordinate-axes overlay applies (optional).

    Returns:
        List of processed frames as uint8 RGB numpy arrays.
    """
    # DEPRECATED: task-specific overlays (axes / RouteStick diagrams) are
    # disabled by default; the machinery is kept for possible re-enabling.
    overlay_enabled = (
        ENABLE_DEPRECATED_COORDINATE_AXES_OVERLAY
        and (env_id in DEPRECATED_COORDINATE_AXES_ENVS if env_id else False)
    )
    if not frames1:
        return []

    processed = []
    for frame in frames1:
        if frame is None:
            # Skip missing frames entirely, matching the original behavior.
            continue

        # Normalize to uint8 RGB numpy array.
        arr = frame if isinstance(frame, np.ndarray) else np.array(frame)
        if arr.dtype != np.uint8:
            if np.max(arr) <= 1.0:
                arr = (arr * 255).astype(np.uint8)
            else:
                arr = arr.clip(0, 255).astype(np.uint8)
        if arr.ndim == 2:
            arr = np.stack([arr] * 3, axis=-1)

        frame_h = arr.shape[0]

        if overlay_enabled:
            # RouteStick gets a wider strip for its four rotation diagrams;
            # other overlay tasks get a narrower strip for the axes cross.
            border_w = 200 if env_id == "RouteStick" else 150

            black_strip = np.zeros((frame_h, border_w, 3), dtype=np.uint8)
            combined = Image.fromarray(np.concatenate([black_strip, arr], axis=1))

            panel = Image.new('RGB', (border_w, frame_h), (0, 0, 0))
            # RouteStick draws its rotation diagrams un-rotated; every other
            # overlay task draws the axes rotated 180° for the base camera.
            panel = draw_coordinate_axes(
                panel,
                position="left",
                rotate_180=(env_id != "RouteStick"),
                env_id=env_id,
            )
            combined.paste(panel, (0, 0))
            arr = np.array(combined)

        processed.append(arr)

    return processed
235
+
236
+
237
def draw_semicircle(draw, center, radius, color, width=2, half="lower", start_pos="left", end_pos="right", arrow_position="end", arrow_size=6):
    """
    DEPRECATED: only used by the legacy RouteStick rotation diagrams
    (not called on the default rendering path).

    Draw a semicircular arc with an optional arrowhead.

    Args:
        draw: PIL ImageDraw object
        center: (x, y) center of the circle
        radius: circle radius
        color: stroke color
        width: line width
        half: "upper" (top half) or "lower" (bottom half)
        start_pos: "left" or "right" (where the arc starts)
        end_pos: "left" or "right" (where the arc ends)
        arrow_position: "start" (arrowhead at the start), "end"
            (arrowhead at the end) or None/falsy for no arrowhead
        arrow_size: arrowhead size in pixels
    """
    cx, cy = center

    # Angle ranges in image coordinates (y grows downward):
    #   lower: 0-180 degrees (y > cy)
    #   upper: 180-360 degrees (y < cy)

    angle_map = {
        "lower": {"right": 0, "left": 180},
        "upper": {"right": 360, "left": 180}
    }

    start_angle = angle_map[half].get(start_pos, 0)
    end_angle = angle_map[half].get(end_pos, 180)

    # Step sign follows the sweep direction.
    step = 5
    if start_angle > end_angle:
        step = -5

    points = []
    # Sample points along the arc.
    # NOTE: range() excludes its end value, so extend it by one unit in the
    # step direction to include the final angle.
    for a in range(start_angle, end_angle + (1 if step > 0 else -1), step):
        rad = math.radians(a)
        x = cx + radius * math.cos(rad)
        y = cy + radius * math.sin(rad)
        points.append((x, y))

    if len(points) < 2:
        return

    # Stroke the arc as a polyline.
    draw.line(points, fill=color, width=width)

    # Optional arrowhead, oriented along the path direction.
    if arrow_position:
        if arrow_position == "start":
            # Arrowhead at the start point, pointing along the path.
            arrow_center = points[0]  # arrowhead centered on the arc endpoint
            next_pt = points[1]
            dx = next_pt[0] - arrow_center[0]
            dy = next_pt[1] - arrow_center[1]
        else:  # end
            # Arrowhead at the end point, pointing along the path.
            arrow_center = points[-1]  # arrowhead centered on the arc endpoint
            prev_pt = points[-2]
            dx = arrow_center[0] - prev_pt[0]
            dy = arrow_center[1] - prev_pt[1]

        angle = math.atan2(dy, dx)

        # Arrowhead geometry.
        arrow_len = arrow_size * 1.5
        arrow_wing = arrow_size

        # The arrowhead is centered on the arc endpoint and extends along
        # the path direction: tip forward of the endpoint, wings behind it.
        tip_x = arrow_center[0] + arrow_len * math.cos(angle)
        tip_y = arrow_center[1] + arrow_len * math.sin(angle)
        tip_pt = (tip_x, tip_y)

        # Tail center (backward along the direction).
        bx = arrow_center[0] - arrow_len * math.cos(angle)
        by = arrow_center[1] - arrow_len * math.sin(angle)

        # Wing points fan out sideways from the tail center.
        w1x = bx + arrow_wing * math.cos(angle + math.pi/2)
        w1y = by + arrow_wing * math.sin(angle + math.pi/2)

        w2x = bx + arrow_wing * math.cos(angle - math.pi/2)
        w2y = by + arrow_wing * math.sin(angle - math.pi/2)

        draw.polygon([tip_pt, (w1x, w1y), (w2x, w2y)], fill=color)
329
+
330
+
331
def _load_overlay_font(size):
    """Best-effort bold font loader with a cross-platform fallback chain."""
    for font_path in (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",  # Linux
        "/System/Library/Fonts/Helvetica.ttc",  # macOS
    ):
        try:
            return ImageFont.truetype(font_path, size)
        except Exception:
            continue
    return ImageFont.load_default()


def _draw_axis_label(draw, font, text, anchor_x, anchor_y, align="center_x"):
    """Draw white `text` over a black backing rectangle (2px padding).

    align:
        "center_x" - horizontally centered on anchor_x, top edge at anchor_y
        "left_of"  - right-aligned 5px left of anchor_x, vertically centered
                     on anchor_y
        "right_of" - left-aligned 5px right of anchor_x, vertically centered
                     on anchor_y
    """
    bbox = draw.textbbox((0, 0), text, font=font)
    text_w = bbox[2] - bbox[0]
    text_h = bbox[3] - bbox[1]
    if align == "center_x":
        x = anchor_x - text_w // 2
        y = anchor_y
    elif align == "left_of":
        x = anchor_x - text_w - 5
        y = anchor_y - text_h // 2
    else:  # "right_of"
        x = anchor_x + 5
        y = anchor_y - text_h // 2
    draw.rectangle(
        [(x - 2, y - 2), (x + text_w + 2, y + text_h + 2)],
        fill=(0, 0, 0),
    )
    draw.text((x, y), text, fill=(255, 255, 255), font=font)


def _draw_axis_arrowhead(draw, tip, direction, size, color):
    """Draw a filled triangular arrowhead whose point sits at `tip`.

    direction: "up", "down", "left" or "right" in screen coordinates
    (y grows downward).
    """
    tx, ty = tip
    if direction == "up":
        wings = [(tx - size, ty + size), (tx + size, ty + size)]
    elif direction == "down":
        wings = [(tx - size, ty - size), (tx + size, ty - size)]
    elif direction == "left":
        wings = [(tx + size, ty - size), (tx + size, ty + size)]
    else:  # "right"
        wings = [(tx - size, ty - size), (tx - size, ty + size)]
    draw.polygon([tip] + wings, fill=color)


def draw_coordinate_axes(img, position="right", rotate_180=False, env_id=None):
    """
    DEPRECATED: legacy task-specific overlay (not called on the default path).

    Draw a coordinate cross labelled forward/backward/left/right in the black
    border area beside the camera frame. For the RouteStick task, draw four
    rotation-direction diagrams (semicircles with arrowheads) instead.

    Args:
        img: PIL Image or numpy array (the border strip to draw on)
        position: "left" or "right", which side of the frame the strip is on
        rotate_180: if True, rotate the axes 180 degrees (used for the base
            camera, whose view is upside down relative to the robot)
        env_id: environment id; "RouteStick" selects the special diagram

    Returns:
        PIL Image with the overlay drawn
    """
    if isinstance(img, np.ndarray):
        img = Image.fromarray(img)

    img = img.copy()
    draw = ImageDraw.Draw(img)
    width, height = img.size

    line_color = (255, 255, 255)  # all overlay strokes are white

    # RouteStick: four rotation diagrams stacked vertically, no axes.
    if env_id == "RouteStick" and position in ("right", "left"):
        small_font = _load_overlay_font(12)

        semicircle_radius = 15
        arrow_size = 3
        vertical_spacing = 5
        line_width = 2

        # Each item needs: semicircle diameter + label space.
        item_height = semicircle_radius * 2 + 20 + 10
        total_height = 4 * item_height + 3 * vertical_spacing

        # Center the stack horizontally and (roughly) vertically.
        layout_center_x = width // 2
        start_y = (height - total_height) // 2 - 20

        # Item centers, top to bottom.
        cy0 = start_y + item_height // 2
        cy1 = cy0 + item_height + vertical_spacing
        cy2 = cy1 + item_height + vertical_spacing
        cy3 = cy2 + item_height + vertical_spacing

        # (arc center y, half, start, end, label). Upper-half arcs are
        # shifted down 15px so they stay visually centered in their slot;
        # labels always anchor below the un-shifted item center.
        diagrams = [
            (cy0, cy0 + 15, "upper", "left", "right", "Left Clockwise"),
            (cy1, cy1, "lower", "left", "right", "Left Counterclockwise"),
            (cy2, cy2, "lower", "right", "left", "Right Clockwise"),
            (cy3, cy3 + 15, "upper", "right", "left", "Right Counterclockwise"),
        ]
        for item_cy, arc_cy, half, start_pos, end_pos, label in diagrams:
            draw_semicircle(
                draw,
                (layout_center_x, arc_cy),
                semicircle_radius,
                line_color,
                line_width,
                half=half,
                start_pos=start_pos,
                end_pos=end_pos,
                arrow_position="end",
                arrow_size=arrow_size,
            )
            _draw_axis_label(
                draw,
                small_font,
                label,
                layout_center_x,
                item_cy + semicircle_radius + 5,
            )

        # RouteStick draws only the rotation diagrams — no axes.
        return img

    # --- Coordinate cross, centered in the border strip ---
    axis_size = 60
    origin_x = width // 2 - axis_size // 2
    origin_y = height // 2 - axis_size // 2

    font = _load_overlay_font(14)

    axis_length = axis_size - 20
    center_x = origin_x + axis_size // 2
    center_y = origin_y + axis_size // 2

    line_width = 2
    arrow_size = 5

    # Axis endpoints.
    top_end = (center_x, center_y - axis_length // 2)
    bottom_end = (center_x, center_y + axis_length // 2)
    left_end = (center_x - axis_length // 2, center_y)
    right_end = (center_x + axis_length // 2, center_y)

    # The cross itself is identical in both orientations.
    draw.line([left_end, right_end], fill=line_color, width=line_width)
    draw.line([top_end, bottom_end], fill=line_color, width=line_width)

    top_label_y = center_y - axis_length // 2 - 20
    bottom_label_y = center_y + axis_length // 2 + 5

    if rotate_180:
        # Base-camera view is rotated 180°: forward points down and the
        # left/right labels are mirrored.
        _draw_axis_arrowhead(draw, bottom_end, "down", arrow_size, line_color)
        _draw_axis_arrowhead(draw, top_end, "up", arrow_size, line_color)
        _draw_axis_arrowhead(draw, left_end, "left", arrow_size, line_color)
        _draw_axis_arrowhead(draw, right_end, "right", arrow_size, line_color)

        _draw_axis_label(draw, font, "forward", center_x, bottom_label_y)
        _draw_axis_label(draw, font, "backward", center_x, top_label_y)
        _draw_axis_label(draw, font, "right", left_end[0], center_y, align="left_of")
        _draw_axis_label(draw, font, "left", right_end[0], center_y, align="right_of")
    else:
        # Normal orientation: forward up, right on the right.
        _draw_axis_arrowhead(draw, top_end, "up", arrow_size, line_color)
        _draw_axis_arrowhead(draw, bottom_end, "down", arrow_size, line_color)
        _draw_axis_arrowhead(draw, right_end, "right", arrow_size, line_color)
        _draw_axis_arrowhead(draw, left_end, "left", arrow_size, line_color)

        _draw_axis_label(draw, font, "forward", center_x, top_label_y)
        _draw_axis_label(draw, font, "backward", center_x, bottom_label_y)
        _draw_axis_label(draw, font, "right", right_end[0], center_y, align="right_of")
        _draw_axis_label(draw, font, "left", left_end[0], center_y, align="left_of")

    return img
701
+
702
+
703
def draw_marker(img, x, y):
    """Return a copy of `img` with a red circle-and-cross marker at (x, y)."""
    if isinstance(img, np.ndarray):
        img = Image.fromarray(img)

    marked = img.copy()
    pen = ImageDraw.Draw(marked)
    radius = 5
    # Circle outline with a cross through its center.
    pen.ellipse((x - radius, y - radius, x + radius, y + radius), outline="red", width=2)
    pen.line((x - radius, y, x + radius, y), fill="red", width=2)
    pen.line((x, y - radius, x, y + radius), fill="red", width=2)
    return marked
gradio-web/main.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Main entry for Gradio app (single-instance mode for Hugging Face Spaces)."""
2
+
3
+ import os
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ from ui_layout import create_ui_blocks
8
+ from state_manager import start_timeout_monitor
9
+
10
# Key filesystem locations for the app and its media scratch space.
APP_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = APP_DIR.parent
VIDEOS_DIR = APP_DIR / "videos"
TEMP_DEMOS_DIR = PROJECT_ROOT / "temp_demos"
CWD_TEMP_DEMOS_DIR = Path.cwd() / "temp_demos"


def ensure_media_dirs():
    """Ensure media temp directories exist before first write."""
    for media_dir in (TEMP_DEMOS_DIR, CWD_TEMP_DEMOS_DIR):
        media_dir.mkdir(parents=True, exist_ok=True)
21
+
22
+
23
def build_allowed_paths():
    """Build Gradio file access allowlist (absolute, deduplicated).

    Returns:
        list[str]: resolved path strings, first occurrence wins.
    """
    raw_paths = (
        Path.cwd(),
        PROJECT_ROOT,
        APP_DIR,
        VIDEOS_DIR,
        TEMP_DEMOS_DIR,
        CWD_TEMP_DEMOS_DIR,
        Path(tempfile.gettempdir()),
    )

    allowlist = []
    seen = set()
    for raw in raw_paths:
        resolved = str(raw.resolve())
        if resolved in seen:
            continue
        seen.add(resolved)
        allowlist.append(resolved)
    return allowlist
42
+
43
+
44
def main():
    """Prepare media dirs, start the timeout monitor, and launch the app."""
    ensure_media_dirs()
    start_timeout_monitor()

    # Point video writers at the project-level temp dir unless overridden.
    os.environ.setdefault("ROBOMME_TEMP_DEMOS_DIR", str(TEMP_DEMOS_DIR))
    allowlist = build_allowed_paths()

    app = create_ui_blocks()
    app.queue(default_concurrency_limit=2)
    app.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
        allowed_paths=allowlist,
    )


if __name__ == "__main__":
    main()
gradio-web/note_content.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Note content management module
3
+ Manages Coordinate Information and Task Hint content
4
+ """
5
def get_coordinate_information():
    """
    Get coordinate information content (Note 1).

    Returns:
        str: Coordinate information in Markdown format, explaining how the
        base/wrist camera views map onto the robot's motion directions.
    """
    return """
The coordinate system differs based on the camera perspective.

In the base camera view, the lateral axis is inverted relative to the robot: the right side of the camera frame corresponds to the robot's left side, and vice versa.

Conversely, the wrist camera view is fully aligned with the robot's motion frame. Directional movements are consistent, meaning 'right' in the camera view corresponds to the robot's right, and 'forward' implies forward movement
"""
19
+
20
+
21
+ def get_task_hint(env_id):
22
+ """
23
+ Get task hint content based on environment ID (Note 2)
24
+
25
+ Args:
26
+ env_id (str): Environment ID, e.g., "VideoPlaceOrder", "PickXtimes", etc.
27
+
28
+ Returns:
29
+ str: Task hint in Markdown format
30
+ """
31
+ # Return different hints based on env_id
32
+ # Order follows solve_3.5_parallel_multi_loop_v4.py DEFAULT_ENVS list
33
+ hints = {
34
+ "PickXtimes": """Suppose the task goal is to pick up red cubes for two times, a typical action sequence could be:
35
+ 1. pick up the cube (use mouse click to select the cube with the correct color)
36
+ 2. place the cube onto the target.
37
+ 3. pick up the cube (use mouse click to select the cube with the correct color)
38
+ 4. place the cube onto the target.
39
+ 5. press the button to stop.
40
+ """,
41
+
42
+ "StopCube": """Suppose the task goal is to stop the cube on the target for three times, a typical action sequence could be:
43
+ 1. move to the top of the button to prepare
44
+ 2. remain static (it will execute for a fixed time duration, you need to count the times the cube has passed the target)
45
+ 3. remain static
46
+ 4. remain static
47
+ 5. remain static (Suppose you feel the cube is about to reach the target for the expected number of times, you should press the button to stop the cube directly)
48
+ 6. press the button to stop.
49
+ """,
50
+
51
+ "SwingXtimes": """Suppose the task goal is to swing the back and forth for two times, a typical action sequence could be:
52
+ 1. pick up the cube (use mouse click to select the cube with the correct color)
53
+ 2. move to the top of the target (use mouse click to select the right-side target)
54
+ 3. move to the top of the target (use mouse click to select the left-side target)
55
+ 4. move to the top of the target (use mouse click to select the right-side target)
56
+ 5. move to the top of the target (use mouse click to select the left-side target)
57
+ 6. put the cube onto the table
58
+ 7. press the button to stop.
59
+ """,
60
+
61
+ "BinFill": """Suppose the task goal is to pick two red cubes in the bin, a typical action sequence could be:
62
+ 1. pick up the cube (use mouse click to select the cube with the correct color)
63
+ 2. put it into the bin.
64
+ 3. pick up the cube (use mouse click to select the cube with the correct color)
65
+ 4. put it into the bin.
66
+ 5. press the button to stop.
67
+ """,
68
+
69
+ "VideoUnmaskSwap": """Watch the video carefully. Cubes will be hidden by containers, and you need to memorize the color of the cube inside each one.
70
+ You need to track the containers since they swap positions!
71
+ A typical action sequence could be:
72
+ 1. pick up the container (use mouse click to select the container)
73
+ 2. drop the container down.
74
+
75
+ pick up another container if the task goal is to find two containers.
76
+ """,
77
+
78
+ "VideoUnmask": """Watch the video carefully. Cubes will be hidden by containers, and you need to memorize the color of the cube inside each one.
79
+ A typical action sequence could be:
80
+ 1. pick up the container (use mouse click to select the container)
81
+ 2. drop the container down.
82
+
83
+ pick up another container if the task goal is to find two containers.
84
+ """,
85
+
86
+ "ButtonUnmaskSwap":
87
+ """Press the buttons sequentially. While pressing the buttons, the cubes will be hidden inside the containers, and you need to memorize the color of the cube inside each one.
88
+ You need to track the containers since they swap positions!
89
+ A typical action sequence could be:
90
+ 1. press the first button.
91
+ 2. press the second button.
92
+ 3. pick up the container (use mouse click to select the container)
93
+ 4. drop the container down.
94
+
95
+ pick up another container if the task goal is to find two containers.
96
+ """,
97
+
98
+ "ButtonUnmask":"""Press the buttons sequentially. While pressing the buttons, the cubes will be hidden inside the containers, and you need to memorize the color of the cube inside each one.
99
+ A typical action sequence could be:
100
+ 1. press the button.
101
+ 2. pick up the container (use mouse click to select the container)
102
+ 3. drop the container down.
103
+
104
+ pick up another container if the task goal is to find two containers.
105
+ """,
106
+
107
+ "VideoRepick": """Remember the cube that has been picked up before, and then pick it up again. The cubes might be swapped positions.
108
+ A typical action sequence could be:
109
+ 1. pick up the cube (use mouse click to select the correct cube with the correct color)
110
+ 2. put the cube down on the table.
111
+ (repeat 1 and 2 for the expected number of times)
112
+ 3. press the button to stop.
113
+ """,
114
+
115
+ "VideoPlaceButton":
116
+ """The video shows a robot placing a cube on different targets and pressing the button in a sequence. The targets may change positions.
117
+ A typical action sequence could be:
118
+ 1. pick up the cube (use mouse click to select the correct cube with the correct color)
119
+ 2. put the cube down on the target (use mouse click to select the target)
120
+ """
121
+ ,
122
+
123
+ "VideoPlaceOrder": """The video shows a robot placing a cube on different targets and pressing the button in a sequence. The targets may change positions.
124
+ A typical action sequence could be:
125
+ 1. pick up the cube (use mouse click to select the correct cube with the correct color)
126
+ 2. put the cube down on the target (use mouse click to select the target)
127
+ """,
128
+
129
+ "PickHighlight": """While the robot is pressing the button, some cubes will be highlighted with white discs on the table. Remember them.
130
+ A typical action sequence could be:
131
+ 1. press the button.
132
+ 2. pick up the cube (use mouse click to select the correct cube with the correct color)
133
+ 3. put the cube down on the table.
134
+ (Repeat 2 and 3 for with the rest of highlighted cubes)
135
+ """,
136
+
137
+ "InsertPeg": """The video shows a robot picking up and inserting a peg into a hole.
138
+ The peg consists of two parts with different colors; you need to pick up the correct part of the peg and insert it into the hole from the correct side.
139
+ A typical action sequence could be:
140
+ 1. pick up the peg (use mouse click to select the correct peg and the correct part of the peg)
141
+ 2. insert the peg into the hole on the left side
142
+ """,
143
+
144
+ "MoveCube": """The video shows a robot moving a cube to a target using different methods.
145
+ The robot might (1) pick up and place the cube, (2) push it with the gripper, or (3) hook it using a peg.
146
+ Remember the way the robot moves the cube and choose the correct action to execute.
147
+ """,
148
+
149
+ "PatternLock": """The video shows a robot tracing a pattern with a stick.
150
+ Remember the movements and reproduce them by choosing correct actions.
151
+ The correct directions (e.g., left, right, forward, backward) are as given near the base camera view.
152
+ """,
153
+
154
+ "RouteStick": """The video shows a robot navigating from one target to another by circling around a stick.
155
+ The movement can be clockwise or counter-clockwise, and the stick may be on the left or right side.
156
+ Remember the sequence of actions and choose the correct action to execute.
157
+ The correct directions (e.g., left, right, forward, backward) are as given near the base camera view.
158
+ """,
159
+
160
+ }
161
+
162
+ # Normalize env_id to handle case-insensitive matching
163
+ # First try direct lookup
164
+ if env_id in hints:
165
+ return hints[env_id]
166
+
167
+ # Create a mapping from lowercase to standard format for case-insensitive lookup
168
+ # This handles cases where env_id might be passed as lowercase (e.g., "pickxtimes", "binfill")
169
+ env_id_lower_to_standard = {
170
+ key.lower(): key for key in hints.keys()
171
+ }
172
+
173
+ # Try case-insensitive lookup
174
+ if env_id:
175
+ env_id_lower = env_id.lower()
176
+ if env_id_lower in env_id_lower_to_standard:
177
+ standard_key = env_id_lower_to_standard[env_id_lower]
178
+ return hints[standard_key]
179
+
180
+ # Return default hint if not found
181
+ return """///"""
gradio-web/oracle_logic.py ADDED
@@ -0,0 +1,975 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import numpy as np
4
+ import gymnasium as gym
5
+ import cv2
6
+ import colorsys
7
+ import torch
8
+ from pathlib import Path
9
+ from PIL import Image
10
+
11
# --- Setup Paths ---
# Ensure we can import local project packages from parent directory.
# This file lives in gradio-web/, so the repository root is one level up;
# prepending (not appending) makes local packages win over installed ones.
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)
17
+
18
# --- NLP Imports ---
# The sentence-transformers model is loaded eagerly at import time so the first
# semantic match is fast. `_NLP_MODEL is None` is the sentinel consumed by
# `_find_best_semantic_match` to disable NLP matching gracefully.
try:
    from sentence_transformers import SentenceTransformer, util as st_util
    print("Loading NLP Model (all-MiniLM-L6-v2)...")
    _NLP_MODEL = SentenceTransformer('all-MiniLM-L6-v2')
    print("NLP Model loaded.")
except ImportError:
    # Package missing entirely: run without semantic matching.
    print("Warning: sentence-transformers not found. NLP matching will fail.")
    _NLP_MODEL = None
except Exception as e:
    # Model download/initialization failed (e.g. no network); degrade gracefully.
    print(f"Error loading NLP model: {e}")
    _NLP_MODEL = None
30
+
31
+ # --- Project Imports ---
32
+ from robomme.env_record_wrapper import BenchmarkEnvBuilder
33
+ from robomme.robomme_env import * # noqa: F401,F403; ensure gym envs are registered
34
+ from robomme.robomme_env.utils.vqa_options import get_vqa_options
35
+ from robomme.robomme_env.utils.oracle_action_matcher import (
36
+ find_exact_label_option_index,
37
+ map_action_text_to_option_label,
38
+ )
39
+ from robomme.robomme_env.utils.choice_action_mapping import (
40
+ extract_actor_position_xyz,
41
+ project_world_to_pixel,
42
+ select_target_with_position,
43
+ )
44
+ from mani_skill.examples.motionplanning.panda.motionplanner import PandaArmMotionPlanningSolver
45
+ from mani_skill.examples.motionplanning.panda.motionplanner_stick import PandaStickMotionPlanningSolver
46
+
47
# --- FailAware Planner Imports ---
# Prefer the fail-aware planner variants; if they are unavailable, alias the
# regular planners (imported above) under the same names so the rest of this
# module can use the FailAware* names unconditionally.
try:
    from robomme.robomme_env.utils.planner_fail_safe import (
        FailAwarePandaArmMotionPlanningSolver,
        FailAwarePandaStickMotionPlanningSolver,
        ScrewPlanFailure,
    )
except ImportError as e:
    print(f"Warning: Failed to import robomme fail-aware planners: {e}")
    # Fallback to regular planners; ScrewPlanFailure degrades to RuntimeError
    # so `except ScrewPlanFailure` clauses elsewhere remain valid.
    FailAwarePandaArmMotionPlanningSolver = PandaArmMotionPlanningSolver
    FailAwarePandaStickMotionPlanningSolver = PandaStickMotionPlanningSolver
    ScrewPlanFailure = RuntimeError
60
+
61
# --- Constants ---
# Name of the environment variable that overrides where metadata JSON lives.
ROBOMME_METADATA_ROOT_ENV = "ROBOMME_METADATA_ROOT"
# For backward compatibility with process_session constructor naming.
# Semantics: optional override root for metadata json files (None when unset).
DEFAULT_DATASET_ROOT = os.environ.get(ROBOMME_METADATA_ROOT_ENV)
66
+
67
+ # --- Helper Functions from Script ---
68
+
69
+ def _generate_color_map(n=10000, s_min=0.70, s_max=0.95, v_min=0.78, v_max=0.95):
70
+ phi = 0.6180339887498948
71
+ color_map = {}
72
+ for i in range(1, n + 1):
73
+ h = (i * phi) % 1.0
74
+ s = s_min + (s_max - s_min) * ((i % 7) / 6)
75
+ v = v_min + (v_max - v_min) * (((i * 3) % 5) / 4)
76
+ r, g, b = colorsys.hsv_to_rgb(h, s, v)
77
+ color_map[i] = [int(round(r * 255)), int(round(g * 255)), int(round(b * 255))]
78
+ return color_map
79
+
80
+ def _sync_table_color(env, color_map):
81
+ seg_id_map = getattr(env.unwrapped, "segmentation_id_map", None)
82
+ if not isinstance(seg_id_map, dict):
83
+ return
84
+ for obj_id, obj in seg_id_map.items():
85
+ if getattr(obj, "name", None) == "table-workspace":
86
+ color_map[obj_id] = [0, 0, 0]
87
+
88
+ def _tensor_to_bool(value):
89
+ if value is None:
90
+ return False
91
+ if isinstance(value, torch.Tensor):
92
+ return bool(value.detach().cpu().bool().item())
93
+ if isinstance(value, np.ndarray):
94
+ return bool(np.any(value))
95
+ return bool(value)
96
+
97
+ def _prepare_frame(frame):
98
+ frame = np.asarray(frame)
99
+ if frame.dtype != np.uint8:
100
+ max_val = float(np.max(frame)) if frame.size else 0.0
101
+ if max_val <= 1.0:
102
+ frame = (frame * 255.0).clip(0, 255).astype(np.uint8)
103
+ else:
104
+ frame = frame.clip(0, 255).astype(np.uint8)
105
+ if frame.ndim == 2:
106
+ frame = np.stack([frame] * 3, axis=-1)
107
+ return frame
108
+
109
def _prepare_segmentation_visual(segmentation, color_map, target_hw):
    """Render a segmentation map as a BGR image resized to *target_hw*.

    Returns (seg_bgr, seg_2d): the colorized BGR canvas and the raw 2-D int64
    id map. Returns (None, None) when *segmentation* is None. Ids <= 0 and ids
    missing from *color_map* stay black.
    """
    if segmentation is None:
        return None, None

    raw = segmentation.cpu().numpy() if hasattr(segmentation, "cpu") else segmentation
    raw = np.asarray(raw)
    if raw.ndim > 2:
        raw = raw[0]  # drop leading batch dimension
    seg_2d = raw.squeeze().astype(np.int64)

    height, width = seg_2d.shape[:2]
    rgb_canvas = np.zeros((height, width, 3), dtype=np.uint8)
    for seg_id in np.unique(seg_2d):
        if seg_id <= 0:
            continue  # background / invalid ids remain black
        rgb = color_map.get(int(seg_id))
        if rgb is None:
            continue
        rgb_canvas[seg_2d == seg_id] = rgb
    bgr_canvas = cv2.cvtColor(rgb_canvas, cv2.COLOR_RGB2BGR)

    want_h, want_w = target_hw
    if bgr_canvas.shape[:2] != (want_h, want_w):
        # Nearest-neighbour keeps segment boundaries crisp.
        bgr_canvas = cv2.resize(bgr_canvas, (want_w, want_h), interpolation=cv2.INTER_NEAREST)

    return bgr_canvas, seg_2d
138
+
139
+ def _fetch_segmentation(env):
140
+ obs = env.unwrapped.get_obs(unflattened=True)
141
+ return obs["sensor_data"]["base_camera"]["segmentation"]
142
+
143
def _build_solve_options(env, planner, selected_target, env_id):
    """Thin wrapper around get_vqa_options for the current env/planner state."""
    options = get_vqa_options(env, planner, selected_target, env_id)
    return options
145
+
146
+ def _extract_first_text(value, default="Unknown Goal"):
147
+ if isinstance(value, str):
148
+ text = value.strip()
149
+ return text or default
150
+ if isinstance(value, (list, tuple)):
151
+ for item in value:
152
+ if item is None:
153
+ continue
154
+ text = str(item).strip()
155
+ if text:
156
+ return text
157
+ return default
158
+
159
+ def _ensure_list(value):
160
+ if value is None:
161
+ return []
162
+ if isinstance(value, list):
163
+ return value
164
+ if isinstance(value, tuple):
165
+ return list(value)
166
+ return []
167
+
168
+ def _to_frame_list(frames_like):
169
+ if frames_like is None:
170
+ return []
171
+ if isinstance(frames_like, list):
172
+ return frames_like
173
+ if isinstance(frames_like, tuple):
174
+ return list(frames_like)
175
+ if isinstance(frames_like, torch.Tensor):
176
+ arr = frames_like.detach().cpu().numpy()
177
+ if arr.ndim == 3:
178
+ return [arr]
179
+ if arr.ndim == 4:
180
+ return [x for x in arr]
181
+ return []
182
+ if isinstance(frames_like, np.ndarray):
183
+ if frames_like.ndim == 3:
184
+ return [frames_like]
185
+ if frames_like.ndim == 4:
186
+ return [x for x in frames_like]
187
+ return []
188
+ return []
189
+
190
+ def _iter_env_chain(env, max_depth=16):
191
+ current = env
192
+ seen = set()
193
+ for _ in range(max_depth):
194
+ if current is None:
195
+ return
196
+ env_id = id(current)
197
+ if env_id in seen:
198
+ return
199
+ seen.add(env_id)
200
+ yield current
201
+ current = getattr(current, "env", None)
202
+
203
def _extract_obs_front_frames(env):
    """
    Strict path: only use wrapper-produced obs batch front_rgb_list.
    Returns (front_list, obs_ref_id) or (None, None) if unavailable.
    """
    for wrapper in _iter_env_chain(env):
        for attr in ("_last_obs", "last_obs"):
            candidate = getattr(wrapper, attr, None)
            if isinstance(candidate, dict) and "front_rgb_list" in candidate:
                frames = _to_frame_list(candidate.get("front_rgb_list"))
                return frames, id(candidate)
    return None, None
218
+
219
def _collect_front_frames_from_step_output(step_output):
    """
    Extract front camera frames from a single env.step(...) output.
    Supports both classic step tuple and dense batch tuple.
    """
    if not isinstance(step_output, tuple) or len(step_output) != 5:
        return []
    obs = step_output[0]
    if not isinstance(obs, dict):
        return []
    return _to_frame_list(obs.get("front_rgb_list"))
230
+
231
+
232
+ def _collect_choice_segment_candidates(item, out):
233
+ if isinstance(item, (list, tuple)):
234
+ for child in item:
235
+ _collect_choice_segment_candidates(child, out)
236
+ return
237
+ if isinstance(item, dict):
238
+ for child in item.values():
239
+ _collect_choice_segment_candidates(child, out)
240
+ return
241
+ if item is not None:
242
+ out.append(item)
243
+
244
+
245
def _extract_choice_segment_position_xyz(current_segment):
    """Return the first resolvable candidate position as float64 xyz, else None.

    Flattens *current_segment* (arbitrarily nested) and probes each leaf with
    extract_actor_position_xyz until one yields a position.
    """
    flattened = []
    _collect_choice_segment_candidates(current_segment, flattened)
    for entry in flattened:
        position = extract_actor_position_xyz(entry)
        if position is not None:
            return position.astype(np.float64)
    return None
253
+
254
+
255
+ def _find_actor_segmentation_id(segmentation_id_map, actor):
256
+ if not isinstance(segmentation_id_map, dict):
257
+ return None
258
+ for seg_id, obj in segmentation_id_map.items():
259
+ if obj is actor:
260
+ try:
261
+ return int(seg_id)
262
+ except Exception:
263
+ continue
264
+ return None
265
+
266
+
267
+ def _compute_segmentation_centroid_xy(segmentation, seg_id):
268
+ if segmentation is None:
269
+ return None
270
+ try:
271
+ seg_arr = np.asarray(segmentation)
272
+ except Exception:
273
+ return None
274
+ if seg_arr.ndim > 2:
275
+ seg_arr = np.squeeze(seg_arr)
276
+ if seg_arr.ndim != 2:
277
+ return None
278
+ mask = seg_arr == int(seg_id)
279
+ if not np.any(mask):
280
+ return None
281
+ ys, xs = np.nonzero(mask)
282
+ x = int(np.rint(xs.mean()))
283
+ y = int(np.rint(ys.mean()))
284
+ return [x, y]
285
+
286
def _extract_demonstration_payload(demonstration_data):
    """
    Compatible with both legacy dict payloads and current DemonstrationWrapper tuple batch:
      - dict style: {"language goal": "...", "frames": [...]}
      - tuple/list style: (obs_batch, reward_batch, terminated_batch, truncated_batch, info_batch)

    Returns (goal_text, frames_list); falls back to ("Unknown Goal", []).
    """
    default_goal = "Unknown Goal"

    if isinstance(demonstration_data, dict):
        goal = (
            demonstration_data.get("language goal")
            or demonstration_data.get("language_goal")
            or demonstration_data.get("task_goal")
        )
        frames = demonstration_data.get("frames")
        if frames is None:
            frames = demonstration_data.get("front_rgb_list")
        return _extract_first_text(goal, default_goal), _ensure_list(frames)

    if isinstance(demonstration_data, (tuple, list)):
        obs_batch = demonstration_data[0] if len(demonstration_data) >= 1 else None
        info_batch = demonstration_data[4] if len(demonstration_data) >= 5 else None
        if info_batch is None and len(demonstration_data) >= 2 and isinstance(demonstration_data[1], dict):
            # Fallback for (obs, info) shaped payloads
            info_batch = demonstration_data[1]

        frames = obs_batch.get("front_rgb_list") if isinstance(obs_batch, dict) else None

        goal = None
        if isinstance(info_batch, dict):
            # First key whose value is not None wins (a present-but-empty
            # string intentionally stops the fallback chain).
            for key in ("task_goal", "language goal", "language_goal"):
                goal = info_batch.get(key)
                if goal is not None:
                    break

        return _extract_first_text(goal, default_goal), _ensure_list(frames)

    return default_goal, []
328
+
329
def _find_best_semantic_match(user_query, options):
    """Pick the option whose 'label' is semantically closest to *user_query*.

    Returns (best_index, cosine_score). Yields (-1, 0.0) when the NLP model is
    unavailable or there are no options, and (0, 0.0) when encoding/scoring
    raises (i.e. default to the first option).
    """
    if _NLP_MODEL is None or not options:
        return -1, 0.0

    labels = [opt.get("label", "") for opt in options]
    query_text = str(user_query or "").strip()

    try:
        query_vec = _NLP_MODEL.encode(query_text, convert_to_tensor=True)
        label_vecs = _NLP_MODEL.encode(labels, convert_to_tensor=True)
        scores = st_util.cos_sim(query_vec, label_vecs)[0]
        winner = torch.argmax(scores).item()
        return winner, scores[winner].item()
    except Exception as exc:
        print(f" [NLP] Semantic match failed ({exc}); defaulting to option 1.")
        return 0, 0.0
350
+
351
+ # --- Core Logic Wrapper ---
352
+
353
+ class OracleSession:
354
    def __init__(self, dataset_root=DEFAULT_DATASET_ROOT, gui_render=False):
        """
        Initialize an empty session; call load_episode() to start an episode.

        dataset_root: optional override root for metadata json files
            (defaults to the ROBOMME_METADATA_ROOT env var, may be None).
        gui_render: If True, uses 'human' render mode (pops up window).
        For Gradio, we usually want False (rgb_array).
        """
        self.dataset_root = Path(dataset_root) if dataset_root else None
        self.gui_render = gui_render # Usually False for web app
        self.render_mode = "human" if gui_render else "rgb_array"

        # Episode objects/metadata — populated by load_episode().
        self.env = None
        self.planner = None
        self.color_map = None
        self.env_id = None
        self.episode_idx = None
        self.language_goal = ""
        self.difficulty = None
        self.seed = None
        self.history = [] # Logs interaction steps

        # State caches
        self.seg_vis = None          # colorized segmentation image (BGR)
        self.seg_raw = None          # raw 2-D segmentation id map
        self.base_frames = []
        self.wrist_frames = []
        self.demonstration_frames = []
        self.available_options = []  # (ui_label, index) tuples for the UI
        self.raw_solve_options = []  # option dicts as produced by get_vqa_options
        # Track frame indices for incremental video updates
        self.last_base_frame_idx = 0
        self.last_wrist_frame_idx = 0
        self.non_demonstration_task_length = None # read from DemonstrationWrapper
        # Track latest obs-batch object and consumed indices to avoid duplicate appends.
        self._last_obs_ref_id = None
        self._last_obs_front_consumed = 0
388
+
389
+ def _resolve_metadata_override_root(self):
390
+ if self.dataset_root:
391
+ return self.dataset_root
392
+ env_root = os.environ.get(ROBOMME_METADATA_ROOT_ENV)
393
+ if env_root:
394
+ return Path(env_root)
395
+ return None
396
+
397
+ def load_episode(self, env_id, episode_idx):
398
+ """Initialize environment for a specific episode."""
399
+ if self.env:
400
+ self.env.close()
401
+
402
+ try:
403
+ metadata_override_root = self._resolve_metadata_override_root()
404
+ builder = BenchmarkEnvBuilder(
405
+ env_id=env_id,
406
+ dataset="train",
407
+ # Gradio uses local oracle solve() directly (not env.step(command_dict)),
408
+ # so we must keep a low-level stepping wrapper chain.
409
+ # "multi_choice" inserts OraclePlannerDemonstrationWrapper, which expects
410
+ # dict commands and may swallow planner low-level action arrays.
411
+ action_space="joint_angle",
412
+ gui_render=self.gui_render,
413
+ #gui_render=True,
414
+ override_metadata_path=metadata_override_root,
415
+ max_steps=3000,
416
+ )
417
+
418
+ episode_num = builder.get_episode_num()
419
+ if episode_num <= 0:
420
+ if metadata_override_root:
421
+ expected = metadata_override_root / f"record_dataset_{env_id}_metadata.json"
422
+ return None, f"Dataset metadata not found or empty: {expected}"
423
+ return None, f"Dataset metadata not found or empty for env '{env_id}' in split 'test'"
424
+
425
+ if episode_idx < 0 or episode_idx >= episode_num:
426
+ return None, f"Episode index out of range for {env_id}: {episode_idx} (valid 0-{episode_num - 1})"
427
+
428
+ seed, difficulty = builder.resolve_episode(episode_idx)
429
+ self.env = builder.make_env_for_episode(episode_idx)
430
+ self.env.reset()
431
+ self.env_id = env_id
432
+ self.episode_idx = episode_idx
433
+ self.difficulty = difficulty
434
+ self.seed = seed
435
+
436
+ # Demonstration data
437
+ demonstration_data = getattr(self.env, "demonstration_data", None)
438
+ self.language_goal, self.demonstration_frames = _extract_demonstration_payload(demonstration_data)
439
+
440
+ # Setup Color Map
441
+ self.color_map = _generate_color_map()
442
+ _sync_table_color(self.env, self.color_map)
443
+
444
+ # Initialize Planner (using FailAware versions)
445
+ if env_id in ("PatternLock", "RouteStick"):
446
+ self.planner = FailAwarePandaStickMotionPlanningSolver(
447
+ self.env, debug=False, vis=self.gui_render,
448
+ base_pose=self.env.unwrapped.agent.robot.pose,
449
+ visualize_target_grasp_pose=False, print_env_info=False,
450
+ joint_vel_limits=0.3,
451
+ )
452
+ else:
453
+ self.planner = FailAwarePandaArmMotionPlanningSolver(
454
+ self.env, debug=False, vis=self.gui_render,
455
+ base_pose=self.env.unwrapped.agent.robot.pose,
456
+ visualize_target_grasp_pose=False, print_env_info=False,
457
+ )
458
+
459
+ self.env.unwrapped.evaluate() # Initial eval check
460
+
461
+ # 从 DemonstrationWrapper 读取 non_demonstration_task_length(如果存在)
462
+ self.non_demonstration_task_length = getattr(self.env, 'non_demonstration_task_length', None)
463
+
464
+ # Reset logs
465
+ self.history = []
466
+
467
+ # Reset frame indices
468
+ self.last_base_frame_idx = 0
469
+ self.last_wrist_frame_idx = 0
470
+ self.base_frames = []
471
+ self.wrist_frames = []
472
+ self._last_obs_ref_id = None
473
+ self._last_obs_front_consumed = 0
474
+
475
+ # Initial Observation
476
+ return self.update_observation()
477
+
478
+ except Exception as e:
479
+ import traceback
480
+ traceback.print_exc()
481
+ return None, f"Error initializing episode: {e}"
482
+
483
+ def update_observation(self, use_segmentation=True):
484
+ """Captures current state, updates segmentation, and generates options."""
485
+ if not self.env:
486
+ return None, "Environment not initialized"
487
+
488
+ # 1. Capture Frames (strict path: only front_rgb_list from wrapper obs batch)
489
+ front_frames, obs_ref_id = _extract_obs_front_frames(self.env)
490
+ self.wrist_frames = []
491
+ if front_frames is not None:
492
+ front_frames = front_frames or []
493
+ if obs_ref_id != self._last_obs_ref_id:
494
+ self._last_obs_ref_id = obs_ref_id
495
+ self._last_obs_front_consumed = 0
496
+ new_front = front_frames[self._last_obs_front_consumed:]
497
+ self._last_obs_front_consumed = len(front_frames)
498
+ if new_front:
499
+ self.base_frames.extend(_prepare_frame(frame) for frame in new_front if frame is not None)
500
+ else:
501
+ self.base_frames = []
502
+ self._last_obs_ref_id = None
503
+ self._last_obs_front_consumed = 0
504
+
505
+ seg_data = _fetch_segmentation(self.env)
506
+
507
+ # 2. Determine Resolution
508
+ seg_hw = (255, 255) # Default
509
+ if self.base_frames and len(self.base_frames) > 0:
510
+ seg_hw = self.base_frames[-1].shape[:2]
511
+ elif seg_data is not None:
512
+ # Try to guess from seg data
513
+ try:
514
+ temp = seg_data
515
+ if hasattr(temp, "cpu"): temp = temp.cpu().numpy()
516
+ temp = np.asarray(temp)
517
+ if temp.ndim > 2: temp = temp[0]
518
+ seg_hw = temp.shape[:2]
519
+ except: pass
520
+
521
+ # 3. Process Segmentation/Image
522
+ if use_segmentation:
523
+ self.seg_vis, self.seg_raw = _prepare_segmentation_visual(seg_data, self.color_map, seg_hw)
524
+ else:
525
+ # If not using segmentation view, use RGB but scale to match seg logic
526
+ seg_vis_from_seg, self.seg_raw = (
527
+ _prepare_segmentation_visual(seg_data, self.color_map, seg_hw)
528
+ if seg_data is not None
529
+ else (None, None)
530
+ )
531
+ if self.base_frames:
532
+ vis_frame = _prepare_frame(self.base_frames[-1])
533
+ vis_frame = cv2.cvtColor(vis_frame, cv2.COLOR_RGB2BGR) # Keep consistent BGR internally
534
+ if vis_frame.shape[:2] != seg_hw:
535
+ vis_frame = cv2.resize(vis_frame, (seg_hw[1], seg_hw[0]), interpolation=cv2.INTER_LINEAR)
536
+ self.seg_vis = vis_frame
537
+ elif seg_vis_from_seg is not None:
538
+ # 没有 RGB 原始帧时,回退到 segmentation 可视化,避免首屏空白。
539
+ self.seg_vis = seg_vis_from_seg
540
+ else:
541
+ self.seg_vis = np.zeros((seg_hw[0], seg_hw[1], 3), dtype=np.uint8)
542
+
543
+ # 4. Generate Options
544
+ dummy_target = {"obj": None, "name": None, "seg_id": None, "click_point": None, "centroid_point": None}
545
+ self.raw_solve_options = _build_solve_options(self.env, self.planner, dummy_target, self.env_id)
546
+
547
+ # Format for UI
548
+ self.available_options = []
549
+ for i, opt in enumerate(self.raw_solve_options):
550
+ opt_label = str(opt.get("label", f"Option {i + 1}")).strip()
551
+ opt_action = str(opt.get("action", "")).strip()
552
+ if opt_label and opt_action:
553
+ ui_label = f"{opt_label}. {opt_action}"
554
+ else:
555
+ ui_label = opt_label or opt_action or f"Option {i + 1}"
556
+ self.available_options.append((ui_label, i)) # Tuple for Gradio Radio/Dropdown
557
+
558
+ return self.get_pil_image(), "Ready"
559
+
560
+ def get_pil_image(self, use_segmented=True):
561
+ """
562
+ 获取PIL图像
563
+
564
+ Args:
565
+ use_segmented: 如果为True,返回分割视图(seg_vis);如果为False,返回原图(base_frames)
566
+ """
567
+ if use_segmented:
568
+ # 返回分割视图
569
+ if self.seg_vis is None:
570
+ return Image.new('RGB', (255, 255), color='gray')
571
+ # Convert BGR (OpenCV) to RGB (PIL)
572
+ rgb = cv2.cvtColor(self.seg_vis, cv2.COLOR_BGR2RGB)
573
+ return Image.fromarray(rgb)
574
+ else:
575
+ # 返回原图
576
+ if not self.base_frames or len(self.base_frames) == 0:
577
+ return Image.new('RGB', (255, 255), color='gray')
578
+ # 获取最后一帧
579
+ frame = self.base_frames[-1]
580
+ # 准备帧(确保格式正确)
581
+ frame = _prepare_frame(frame)
582
+ # frame 已经是 RGB 格式,直接转换为 PIL Image
583
+ return Image.fromarray(frame)
584
+
585
+ def close(self):
586
+ if self.env:
587
+ self.env.close()
588
+
589
    def _get_front_camera_projection_params(self):
        """Best-effort lookup of front-camera projection parameters.

        Returns (intrinsic, extrinsic, image_shape) where intrinsic is a 3x3
        matrix, extrinsic a 3x4 matrix, and image_shape an (h, w) tuple; any
        of the three may be None when it cannot be resolved. image_shape falls
        back from obs RGB -> cached seg_raw -> last base frame.
        """
        if not self.env:
            return None, None, None

        intrinsic = None
        extrinsic = None
        image_shape = None

        # Fetch a fresh unflattened observation; tolerate any failure.
        try:
            obs = self.env.unwrapped.get_obs(unflattened=True)
        except Exception:
            obs = None

        if isinstance(obs, dict):
            # Camera matrices from sensor_param (flattened then reshaped to
            # 3x3 / 3x4); both are dropped together on any parse failure.
            try:
                cam_param = obs.get("sensor_param", {}).get("base_camera", {})
                intrinsic = np.asarray(cam_param.get("intrinsic_cv")).reshape(-1)[:9].reshape(3, 3)
                extrinsic = np.asarray(cam_param.get("extrinsic_cv")).reshape(-1)[:12].reshape(3, 4)
            except Exception:
                intrinsic = None
                extrinsic = None

            # Image shape from the RGB observation (4-D means batched NHWC).
            try:
                rgb = obs.get("sensor_data", {}).get("base_camera", {}).get("rgb")
                if rgb is not None and hasattr(rgb, "cpu"):
                    rgb = rgb.cpu().numpy()
                rgb = np.asarray(rgb)
                if rgb.ndim == 4:
                    image_shape = (int(rgb.shape[1]), int(rgb.shape[2]))
                elif rgb.ndim == 3:
                    image_shape = (int(rgb.shape[0]), int(rgb.shape[1]))
            except Exception:
                image_shape = None

        # Fallback 1: cached raw segmentation map.
        if image_shape is None and self.seg_raw is not None:
            try:
                seg = np.asarray(self.seg_raw)
                image_shape = (int(seg.shape[0]), int(seg.shape[1]))
            except Exception:
                image_shape = None

        # Fallback 2: last captured base frame.
        if image_shape is None and self.base_frames:
            frame = np.asarray(self.base_frames[-1])
            image_shape = (int(frame.shape[0]), int(frame.shape[1]))

        return intrinsic, extrinsic, image_shape
635
+
636
+ def get_reference_action(self):
637
+ if not self.env:
638
+ return {
639
+ "ok": False,
640
+ "option_idx": None,
641
+ "option_label": "",
642
+ "option_action": "",
643
+ "need_coords": False,
644
+ "coords_xy": None,
645
+ "message": "Environment not initialized.",
646
+ }
647
+
648
+ target_action_text = getattr(self.env.unwrapped, "current_choice_label", "")
649
+ if not isinstance(target_action_text, str) or not target_action_text.strip():
650
+ return {
651
+ "ok": False,
652
+ "option_idx": None,
653
+ "option_label": "",
654
+ "option_action": "",
655
+ "need_coords": False,
656
+ "coords_xy": None,
657
+ "message": "Current step has no ground truth action text.",
658
+ }
659
+
660
+ selected_target = {
661
+ "obj": None,
662
+ "name": None,
663
+ "seg_id": None,
664
+ "click_point": None,
665
+ "centroid_point": None,
666
+ }
667
+ try:
668
+ current_options = _build_solve_options(self.env, self.planner, selected_target, self.env_id)
669
+ except Exception as exc:
670
+ return {
671
+ "ok": False,
672
+ "option_idx": None,
673
+ "option_label": "",
674
+ "option_action": "",
675
+ "need_coords": False,
676
+ "coords_xy": None,
677
+ "message": f"Failed to build options: {exc}",
678
+ }
679
+
680
+ if not current_options:
681
+ return {
682
+ "ok": False,
683
+ "option_idx": None,
684
+ "option_label": "",
685
+ "option_action": "",
686
+ "need_coords": False,
687
+ "coords_xy": None,
688
+ "message": "No available options for current step.",
689
+ }
690
+
691
+ matched_label = map_action_text_to_option_label(target_action_text, current_options)
692
+ if matched_label is None:
693
+ return {
694
+ "ok": False,
695
+ "option_idx": None,
696
+ "option_label": "",
697
+ "option_action": "",
698
+ "need_coords": False,
699
+ "coords_xy": None,
700
+ "message": f"Cannot map ground truth action '{target_action_text}' to option label.",
701
+ }
702
+
703
+ option_idx = find_exact_label_option_index(matched_label, current_options)
704
+ if option_idx < 0:
705
+ return {
706
+ "ok": False,
707
+ "option_idx": None,
708
+ "option_label": "",
709
+ "option_action": "",
710
+ "need_coords": False,
711
+ "coords_xy": None,
712
+ "message": f"Mapped label '{matched_label}' not found in current options.",
713
+ }
714
+
715
+ option = current_options[option_idx]
716
+ option_label = str(option.get("label", "")).strip()
717
+ option_action = str(option.get("action", "")).strip()
718
+ need_coords = bool(option.get("available"))
719
+
720
+ if not need_coords:
721
+ return {
722
+ "ok": True,
723
+ "option_idx": int(option_idx),
724
+ "option_label": option_label,
725
+ "option_action": option_action,
726
+ "need_coords": False,
727
+ "coords_xy": None,
728
+ "message": "Ground truth action resolved.",
729
+ }
730
+
731
+ reference_position = _extract_choice_segment_position_xyz(
732
+ getattr(self.env.unwrapped, "current_segment", None)
733
+ )
734
+ if reference_position is None:
735
+ return {
736
+ "ok": False,
737
+ "option_idx": int(option_idx),
738
+ "option_label": option_label,
739
+ "option_action": option_action,
740
+ "need_coords": True,
741
+ "coords_xy": None,
742
+ "message": "Cannot resolve reference target position from current segment.",
743
+ }
744
+
745
+ best_candidate = select_target_with_position(option.get("available"), reference_position)
746
+ if best_candidate is None or best_candidate.get("obj") is None:
747
+ return {
748
+ "ok": False,
749
+ "option_idx": int(option_idx),
750
+ "option_label": option_label,
751
+ "option_action": option_action,
752
+ "need_coords": True,
753
+ "coords_xy": None,
754
+ "message": "Cannot match reference target to available candidates.",
755
+ }
756
+
757
+ actor = best_candidate.get("obj")
758
+ segmentation_id_map = getattr(self.env.unwrapped, "segmentation_id_map", {}) or {}
759
+ seg_id = _find_actor_segmentation_id(segmentation_id_map, actor)
760
+ coords_xy = None
761
+ if seg_id is not None:
762
+ coords_xy = _compute_segmentation_centroid_xy(self.seg_raw, seg_id)
763
+
764
+ if coords_xy is None:
765
+ world_xyz = best_candidate.get("position")
766
+ if world_xyz is None:
767
+ world_xyz = extract_actor_position_xyz(actor)
768
+ intrinsic, extrinsic, image_shape = self._get_front_camera_projection_params()
769
+ if world_xyz is not None and intrinsic is not None and extrinsic is not None and image_shape is not None:
770
+ coords_xy = project_world_to_pixel(
771
+ world_xyz=world_xyz,
772
+ intrinsic_cv=intrinsic,
773
+ extrinsic_cv=extrinsic,
774
+ image_shape=image_shape,
775
+ )
776
+
777
+ if coords_xy is None:
778
+ return {
779
+ "ok": False,
780
+ "option_idx": int(option_idx),
781
+ "option_label": option_label,
782
+ "option_action": option_action,
783
+ "need_coords": True,
784
+ "coords_xy": None,
785
+ "message": "Failed to compute pixel coordinates for reference target.",
786
+ }
787
+
788
+ coords_xy = [int(coords_xy[0]), int(coords_xy[1])]
789
+ return {
790
+ "ok": True,
791
+ "option_idx": int(option_idx),
792
+ "option_label": option_label,
793
+ "option_action": option_action,
794
+ "need_coords": True,
795
+ "coords_xy": coords_xy,
796
+ "message": f"Ground truth action resolved at ({coords_xy[0]}, {coords_xy[1]}).",
797
+ }
798
+
799
+ def execute_action(self, action_idx, click_coords):
800
+
801
+ # 用户点击EXECUTE
802
+ # ↓
803
+ # execute_step() 调用 session.execute_action()
804
+ # ↓
805
+ # execute_action() 执行 solve()
806
+ # ↓ (在solve()执行过程中,step()可能检测到失败)
807
+ # ↓
808
+ # evaluate(solve_complete_eval=True) 被调用
809
+ # ↓
810
+ # BinFill.evaluate() 检查失败状态
811
+ # - 保存 previous_failure
812
+ # - 调用 sequential_task_check
813
+ # - 如果 previous_failure=True 或 task_failed=True,设置 failureflag=True
814
+ # ↓
815
+ # oracle_logic.py 获取 evaluation 结果
816
+ # - 如果 is_fail=False,额外检查 failureflag 和 current_task_failure
817
+ # - 设置 done = is_success or is_fail
818
+ # ↓
819
+ # execute_step() 检查 done
820
+ # - 如果 done=True,调用 complete_current_task()
821
+ # ↓
822
+ # complete_current_task() 更新任务索引
823
+ # - current_idx: 0 -> 1 (episode: 0 -> 1)
824
+
825
+
826
+ """
827
+ The real step logic.
828
+ """
829
+ if not self.env: return None, "No Env", False
830
+
831
+ # 1. Re-create options with a persistent target dict that we can modify
832
+ target_ref = {"obj": None, "name": None, "seg_id": None, "click_point": None, "centroid_point": None}
833
+ current_options = _build_solve_options(self.env, self.planner, target_ref, self.env_id)
834
+
835
+ if action_idx < 0 or action_idx >= len(current_options):
836
+ return self.get_pil_image(), "Invalid Action Index", False
837
+
838
+ chosen_opt = current_options[action_idx]
839
+
840
+ # 2. Resolve Target (Click -> Object)
841
+ if click_coords:
842
+ # Reuse logic from step() above, applying to target_ref
843
+ cx, cy = click_coords
844
+ h, w = self.seg_raw.shape[:2]
845
+ cx = max(0, min(cx, w-1))
846
+ cy = max(0, min(cy, h-1))
847
+
848
+ seg_id_map = getattr(self.env.unwrapped, "segmentation_id_map", {}) or {}
849
+
850
+ candidates = []
851
+ def _collect(item):
852
+ if isinstance(item, (list, tuple)):
853
+ for x in item: _collect(x)
854
+ elif isinstance(item, dict):
855
+ for x in item.values(): _collect(x)
856
+ else:
857
+ if item: candidates.append(item)
858
+
859
+ avail = chosen_opt.get("available")
860
+ if avail:
861
+ _collect(avail)
862
+ best_cand = None
863
+ min_dist = float('inf')
864
+ for actor in candidates:
865
+ target_ids = [sid for sid, obj in seg_id_map.items() if obj is actor]
866
+ for tid in target_ids:
867
+ tid = int(tid)
868
+ mask = (self.seg_raw == tid)
869
+ if np.any(mask):
870
+ ys, xs = np.nonzero(mask)
871
+ center_x, center_y = xs.mean(), ys.mean()
872
+ dist = (center_x - cx)**2 + (center_y - cy)**2
873
+ if dist < min_dist:
874
+ min_dist = dist
875
+ best_cand = {
876
+ "obj": actor,
877
+ "name": getattr(actor, "name", f"id_{tid}"),
878
+ "seg_id": tid,
879
+ "click_point": (int(cx), int(cy)),
880
+ "centroid_point": (int(center_x), int(center_y))
881
+ }
882
+ if best_cand:
883
+ target_ref.update(best_cand)
884
+ else:
885
+ target_ref["click_point"] = (int(cx), int(cy))
886
+ else:
887
+ target_ref["click_point"] = (int(cx), int(cy))
888
+
889
+ # 3. Execute Solve
890
+ # 异常处理流程:
891
+ # 任何异常发生 (ScrewPlanFailure 或其他异常)
892
+ # ↓
893
+ # oracle_logic.py: 重新抛出异常
894
+ # ↓
895
+ # process_session.py: 捕获并传递到主进程
896
+ # ↓
897
+ # gradio_callbacks.py: 捕获并显示弹窗 (gr.Info)
898
+ status_msg = f"Executing: {chosen_opt.get('label')}"
899
+ before_elapsed_steps = getattr(self.env.unwrapped, "elapsed_steps", None)
900
+ # Collect intermediate front-camera frames during solve() so livestream
901
+ # can show the full execution process instead of only the final frame.
902
+ original_step = self.env.step
903
+ captured_front_frames = []
904
+ stream_frame_callback = getattr(self, "stream_frame_callback", None)
905
+ self._execute_streamed_frame_count = 0
906
+
907
+ def _step_with_capture(action):
908
+ step_output = original_step(action)
909
+ step_front_frames = _collect_front_frames_from_step_output(step_output)
910
+ if step_front_frames:
911
+ prepared_frames = [
912
+ _prepare_frame(frame) for frame in step_front_frames if frame is not None
913
+ ]
914
+ if prepared_frames:
915
+ captured_front_frames.extend(prepared_frames)
916
+ if callable(stream_frame_callback):
917
+ try:
918
+ stream_frame_callback(prepared_frames)
919
+ self._execute_streamed_frame_count += len(prepared_frames)
920
+ except Exception:
921
+ # Keep solve path robust even if streaming callback fails.
922
+ pass
923
+ return step_output
924
+
925
+ self.env.step = _step_with_capture
926
+ try:
927
+ chosen_opt.get("solve")()
928
+ except ScrewPlanFailure as e:
929
+ # Re-raise ScrewPlanFailure so it can be handled in process_session and displayed as popup
930
+ print(f"Screw Plan Failure")
931
+ raise
932
+ except Exception as e:
933
+ # Re-raise all other exceptions so they can be displayed as popup too
934
+ print(f"Execution Error")
935
+ raise
936
+ finally:
937
+ self.env.step = original_step
938
+
939
+ if captured_front_frames:
940
+ self.base_frames.extend(captured_front_frames)
941
+ print(f"[execute_action] captured_front_frames={len(captured_front_frames)}")
942
+ after_elapsed_steps = getattr(self.env.unwrapped, "elapsed_steps", None)
943
+ print(
944
+ "[execute_action] elapsed_steps: "
945
+ f"{before_elapsed_steps} -> {after_elapsed_steps}"
946
+ )
947
+
948
+ # 4. Evaluate
949
+ self.env.unwrapped.evaluate()
950
+ evaluation = self.env.unwrapped.evaluate(solve_complete_eval=True)
951
+
952
+ is_success = _tensor_to_bool(evaluation.get("success", False))
953
+ is_fail = _tensor_to_bool(evaluation.get("fail", False))
954
+
955
+ # 如果evaluate()没有检测到失败,但环境已经设置了failureflag,则使用failureflag
956
+ # 这是因为失败可能在solve()执行过程中的step()里被检测到,但evaluate()可能还没有反映
957
+ failureflag = getattr(self.env.unwrapped, "failureflag", None)
958
+ current_task_failure = getattr(self.env.unwrapped, "current_task_failure", False)
959
+
960
+ if not is_fail:
961
+ if failureflag is not None:
962
+ failureflag_bool = _tensor_to_bool(failureflag)
963
+ if failureflag_bool:
964
+ is_fail = True
965
+ elif current_task_failure:
966
+ is_fail = True
967
+
968
+ if is_success: status_msg += " | SUCCESS"
969
+ if is_fail: status_msg += " | FAILED"
970
+
971
+ # 5. Update State for next step
972
+ img, _ = self.update_observation()
973
+
974
+ done = is_success or is_fail
975
+ return img, status_msg, done
gradio-web/process_session.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 多进程会话管理模块
3
+
4
+ 本模块实现了多进程架构,将每个用户的 OracleSession 运行在独立的工作进程中。
5
+ 这样可以确保重计算任务不会阻塞主进程,多个用户可以并发使用系统。
6
+
7
+ 架构说明:
8
+ 1. ProcessSessionProxy: 主进程中的代理类,提供与 OracleSession 相同的接口
9
+ 2. session_worker_loop: 工作进程中的循环函数,运行实际的 OracleSession
10
+ 3. 进程间通信:通过 multiprocessing.Queue 进行命令和结果的传递
11
+ 4. 视频帧同步:工作进程产生的新帧通过 stream_queue 推送到主进程,由后台线程同步到代理的本地缓存
12
+ """
13
+ import multiprocessing
14
+ import queue
15
+ import threading
16
+ import time
17
+ import traceback
18
+ import numpy as np
19
+ import sys
20
+ import os
21
+
22
+ # 添加父目录到路径(逻辑复制自 oracle_logic.py)
23
+ current_dir = os.path.dirname(os.path.abspath(__file__))
24
+ parent_dir = os.path.dirname(current_dir)
25
+ if parent_dir not in sys.path:
26
+ sys.path.insert(0, parent_dir)
27
+
28
+ from oracle_logic import OracleSession, DEFAULT_DATASET_ROOT
29
+
30
+ # Import ScrewPlanFailure for exception handling
31
+ try:
32
+ from robomme.robomme_env.utils.planner_fail_safe import ScrewPlanFailure
33
+ except ImportError:
34
+ # Fallback if import fails
35
+ ScrewPlanFailure = RuntimeError
36
+
37
+ # Custom exception for screw plan failures (to be caught in gradio_callbacks)
38
+ class ScrewPlanFailureError(RuntimeError):
39
+ """Exception raised when screw plan fails, to be caught and displayed via gr.Info popup"""
40
+ pass
41
+
42
+ # 定义命令常量
43
+ CMD_LOAD_EPISODE = "load_episode"
44
+ CMD_UPDATE_OBSERVATION = "update_observation"
45
+ CMD_GET_PIL_IMAGE = "get_pil_image"
46
+ CMD_EXECUTE_ACTION = "execute_action"
47
+ CMD_GET_REFERENCE_ACTION = "get_reference_action"
48
+ CMD_CLOSE = "close"
49
+
50
+ def _sanitize_options(options):
51
+ """
52
+ 清理选项数据,移除不可序列化的项(如 'solve' 函数)
53
+
54
+ 在跨进程通信时,需要确保所有数据都可以被 pickle 序列化。
55
+ raw_solve_options 中包含的 'solve' 函数无法序列化,需要移除。
56
+ 'available' 字段可能是复杂对象,需要转换为简单的布尔值。
57
+
58
+ Args:
59
+ options: 原始选项列表
60
+
61
+ Returns:
62
+ list: 清理后的选项列表
63
+ """
64
+ clean_opts = []
65
+ if not options:
66
+ return clean_opts
67
+ for opt in options:
68
+ clean_opt = opt.copy()
69
+ if "solve" in clean_opt:
70
+ del clean_opt["solve"]
71
+ if "available" in clean_opt:
72
+ # Only keep truthiness for UI logic
73
+ clean_opt["available"] = bool(clean_opt["available"])
74
+ clean_opts.append(clean_opt)
75
+ return clean_opts
76
+
77
+ def session_worker_loop(cmd_queue, result_queue, stream_queue, dataset_root, gui_render):
78
+ """
79
+ 工作进程主循环
80
+
81
+ 此函数在工作进程中运行,负责:
82
+ 1. 初始化 OracleSession 实例
83
+ 2. 监听来自主进程的命令(通过 cmd_queue)
84
+ 3. 执行命令并返回结果(通过 result_queue)
85
+ 4. 监控视频帧变化,将新帧推送到流队列(通过 stream_queue)
86
+ 5. 处理异常和清理资源
87
+
88
+ Args:
89
+ cmd_queue: 命令队列,主进程发送命令到此队列
90
+ result_queue: 结果队列,工作进程返回命令执行结果到此队列
91
+ stream_queue: 流队列,工作进程推送新视频帧到此队列
92
+ dataset_root: 数据集根目录路径
93
+ gui_render: 是否使用GUI渲染模式
94
+ """
95
+ session = None
96
+ try:
97
+ session = OracleSession(dataset_root=dataset_root, gui_render=gui_render)
98
+ session.stream_frame_callback = lambda frames: stream_queue.put({"base": frames, "wrist": []})
99
+
100
+ while True:
101
+ try:
102
+ # Check for commands
103
+ cmd_data = cmd_queue.get(timeout=0.1)
104
+ except queue.Empty:
105
+ continue
106
+
107
+ cmd = cmd_data["cmd"]
108
+ args = cmd_data.get("args", [])
109
+ kwargs = cmd_data.get("kwargs", {})
110
+
111
+ if cmd == CMD_CLOSE:
112
+ if session:
113
+ session.close()
114
+ break
115
+
116
+ elif cmd == CMD_LOAD_EPISODE:
117
+ # 加载环境episode
118
+ res = session.load_episode(*args, **kwargs)
119
+
120
+ # 更新帧索引跟踪(用于增量同步)
121
+ session.last_base_frame_idx = len(session.base_frames)
122
+ session.last_wrist_frame_idx = len(session.wrist_frames)
123
+
124
+ # 获取演示状态(从 DemonstrationWrapper 获取)
125
+ is_demonstration = False
126
+ if session.env:
127
+ is_demonstration = getattr(session.env, 'current_task_demonstration', False)
128
+
129
+ # 构建状态更新(完整同步,因为这是加载操作)
130
+ state_update = {
131
+ "env_id": session.env_id,
132
+ "episode_idx": session.episode_idx,
133
+ "language_goal": session.language_goal,
134
+ "difficulty": session.difficulty,
135
+ "seed": session.seed,
136
+ "demonstration_frames": session.demonstration_frames,
137
+ "base_frames": session.base_frames, # 加载时完整同步
138
+ "wrist_frames": session.wrist_frames, # 加载时完整同步
139
+ "available_options": session.available_options,
140
+ "raw_solve_options": _sanitize_options(session.raw_solve_options),
141
+ "seg_vis": session.seg_vis,
142
+ "is_demonstration": is_demonstration,
143
+ "non_demonstration_task_length": session.non_demonstration_task_length # 同步非demonstration任务长度
144
+ }
145
+ result_queue.put({"status": "success", "result": res, "state": state_update})
146
+
147
+ elif cmd == CMD_EXECUTE_ACTION:
148
+ # 执行动作(重计算任务)
149
+ try:
150
+ res = session.execute_action(*args, **kwargs)
151
+ except ScrewPlanFailure as e:
152
+ # 捕获 ScrewPlanFailure 并作为特殊状态传递到主进程,用于显示弹窗
153
+ result_queue.put({"status": "screw_plan_failure", "message": str(e)})
154
+ continue
155
+ except Exception as e:
156
+ # 捕获所有其他异常并传递到主进程,用于显示弹窗
157
+ result_queue.put({"status": "execution_error", "message": str(e)})
158
+ continue
159
+
160
+ # 增量帧同步:只发送新增的帧
161
+ new_base = session.base_frames[session.last_base_frame_idx:]
162
+ new_wrist = session.wrist_frames[session.last_wrist_frame_idx:]
163
+ streamed_count = int(getattr(session, "_execute_streamed_frame_count", 0) or 0)
164
+ # Frames already pushed by stream_frame_callback during solve() should not be sent twice.
165
+ if streamed_count > 0 and new_base:
166
+ if streamed_count >= len(new_base):
167
+ new_base = []
168
+ else:
169
+ new_base = new_base[streamed_count:]
170
+
171
+ # 更新帧索引
172
+ session.last_base_frame_idx = len(session.base_frames)
173
+ session.last_wrist_frame_idx = len(session.wrist_frames)
174
+
175
+ # 如果有新帧,推送到流队列
176
+ if new_base or new_wrist:
177
+ stream_queue.put({"base": new_base, "wrist": new_wrist})
178
+
179
+ # 获取演示状态(从 DemonstrationWrapper 获取)
180
+ is_demonstration = False
181
+ if session.env:
182
+ is_demonstration = getattr(session.env, 'current_task_demonstration', False)
183
+
184
+ # 构建状态更新(只更新选项和分割视图,帧通过流队列同步)
185
+ state_update = {
186
+ "available_options": session.available_options,
187
+ "raw_solve_options": _sanitize_options(session.raw_solve_options),
188
+ "seg_vis": session.seg_vis,
189
+ "is_demonstration": is_demonstration
190
+ }
191
+ result_queue.put({"status": "success", "result": res, "state": state_update})
192
+
193
+ elif cmd == CMD_GET_PIL_IMAGE:
194
+ res = session.get_pil_image(*args, **kwargs)
195
+ result_queue.put({"status": "success", "result": res})
196
+
197
+ elif cmd == CMD_UPDATE_OBSERVATION:
198
+ # 更新观察(获取当前环境状态)
199
+ res = session.update_observation(*args, **kwargs)
200
+
201
+ # 增量帧同步
202
+ new_base = session.base_frames[session.last_base_frame_idx:]
203
+ new_wrist = session.wrist_frames[session.last_wrist_frame_idx:]
204
+
205
+ # 更新帧索引
206
+ session.last_base_frame_idx = len(session.base_frames)
207
+ session.last_wrist_frame_idx = len(session.wrist_frames)
208
+
209
+ # 如果有新帧,推送到流队列
210
+ if new_base or new_wrist:
211
+ stream_queue.put({"base": new_base, "wrist": new_wrist})
212
+
213
+ # 获取演示状态(从 DemonstrationWrapper 获取)
214
+ is_demonstration = False
215
+ if session.env:
216
+ is_demonstration = getattr(session.env, 'current_task_demonstration', False)
217
+
218
+ # 构建状态更新
219
+ state_update = {
220
+ "available_options": session.available_options,
221
+ "raw_solve_options": _sanitize_options(session.raw_solve_options),
222
+ "seg_vis": session.seg_vis,
223
+ "is_demonstration": is_demonstration
224
+ }
225
+ result_queue.put({"status": "success", "result": res, "state": state_update})
226
+
227
+ elif cmd == CMD_GET_REFERENCE_ACTION:
228
+ res = session.get_reference_action(*args, **kwargs)
229
+ result_queue.put({"status": "success", "result": res})
230
+
231
+ else:
232
+ result_queue.put({"status": "error", "message": f"Unknown command: {cmd}"})
233
+
234
+ except Exception as e:
235
+ traceback.print_exc()
236
+ result_queue.put({"status": "fatal", "message": str(e)})
237
+
238
+
239
+ class ProcessSessionProxy:
240
+ """
241
+ 进程会话代理类
242
+
243
+ 此类在主进程中运行,提供与 OracleSession 相同的接口。
244
+ 所有方法调用都会被转发到工作进程中的实际 OracleSession 实例。
245
+
246
+ 主要功能:
247
+ 1. 启动和管理工作进程
248
+ 2. 通过队列与工作进程通信
249
+ 3. 维护本地状态缓存(从工作进程同步)
250
+ 4. 后台线程实时同步视频帧
251
+ """
252
+
253
+ def __init__(self, dataset_root=DEFAULT_DATASET_ROOT, gui_render=False):
254
+ """
255
+ 初始化代理对象
256
+
257
+ Args:
258
+ dataset_root: 数据集根目录路径
259
+ gui_render: 是否使用GUI渲染模式
260
+ """
261
+ # 使用 spawn 上下文以获得更清晰的进程隔离
262
+ ctx = multiprocessing.get_context("spawn")
263
+
264
+ # 创建进程间通信队列
265
+ self.cmd_queue = ctx.Queue() # 命令队列:主进程 -> 工作进程
266
+ self.result_queue = ctx.Queue() # 结果队列:工作进程 -> 主进程
267
+ self.stream_queue = ctx.Queue() # 流队列:工作进程 -> 主进程(视频帧)
268
+
269
+ # 启动工作进程
270
+ self.process = ctx.Process(
271
+ target=session_worker_loop,
272
+ args=(self.cmd_queue, self.result_queue, self.stream_queue, dataset_root, gui_render),
273
+ daemon=True
274
+ )
275
+ self.process.start()
276
+
277
+ # 本地状态缓存(从工作进程同步)
278
+ self.env_id = None
279
+ self.episode_idx = None
280
+ self.language_goal = ""
281
+ self.difficulty = None
282
+ self.seed = None
283
+ self.demonstration_frames = []
284
+ self.base_frames = [] # 由后台同步线程持续更新
285
+ self.wrist_frames = [] # 由后台同步线程持续更新
286
+ self.available_options = []
287
+ self.raw_solve_options = []
288
+ self.seg_vis = None
289
+ self.is_demonstration = False # 演示模式标志
290
+ self.non_demonstration_task_length = None # 从工作进程同步的非demonstration任务长度
291
+
292
+ # 帧同步线程:从流队列接收新帧并更新本地缓存
293
+ self.stop_sync = False
294
+ self.sync_thread = threading.Thread(target=self._sync_loop, daemon=True)
295
+ self.sync_thread.start()
296
+
297
+ def _sync_loop(self):
298
+ """
299
+ 后台线程循环:从流队列消费视频帧并更新本地缓存
300
+
301
+ 此线程持续运行,实时接收工作进程推送的新视频帧,
302
+ 并将其追加到本地的 base_frames 和 wrist_frames 列表中。
303
+ UI 刷新逻辑会直接从代理的本地缓存读取帧数据。
304
+ """
305
+ while not self.stop_sync:
306
+ try:
307
+ # Use a short timeout to check stop_sync frequently
308
+ frames = self.stream_queue.get(timeout=0.1)
309
+ new_base = frames.get("base", [])
310
+ new_wrist = frames.get("wrist", [])
311
+
312
+ # Append to local lists
313
+ if new_base:
314
+ self.base_frames.extend(new_base)
315
+ if new_wrist:
316
+ self.wrist_frames.extend(new_wrist)
317
+ except queue.Empty:
318
+ continue
319
+ except Exception:
320
+ break
321
+
322
+ def _send_cmd(self, cmd, *args, **kwargs):
323
+ """
324
+ 发送命令到工作进程并等待结果
325
+
326
+ Args:
327
+ cmd: 命令名称
328
+ *args: 位置参数
329
+ **kwargs: 关键字参数
330
+
331
+ Returns:
332
+ 命令执行结果
333
+
334
+ Raises:
335
+ RuntimeError: 工作进程返回错误或致命错误
336
+ TimeoutError: 工作进程超时(600秒)
337
+ """
338
+ # 发送命令到工作进程
339
+ self.cmd_queue.put({"cmd": cmd, "args": args, "kwargs": kwargs})
340
+ try:
341
+ # 等待结果(重任务如加载/执行可能需要较长时间,设置600秒超时)
342
+ res = self.result_queue.get(timeout=600)
343
+
344
+ # 检查错误状态并转换为异常,以便在 gradio_callbacks 中捕获并显示弹窗
345
+ if res.get("status") == "screw_plan_failure":
346
+ raise ScrewPlanFailureError(f"screw plan failed: {res.get('message', 'unknown error')}")
347
+ if res.get("status") == "execution_error":
348
+ raise RuntimeError(f"Execution error: {res.get('message', 'unknown error')}")
349
+ if res.get("status") == "fatal":
350
+ raise RuntimeError(f"工作进程致命错误: {res.get('message')}")
351
+ if res.get("status") == "error":
352
+ raise RuntimeError(f"命令执行错误: {res.get('message')}")
353
+
354
+ # 更新本地状态缓存(如果工作进程返回了状态更新)
355
+ if "state" in res:
356
+ state = res["state"]
357
+ for k, v in state.items():
358
+ if k in ["base_frames", "wrist_frames"]:
359
+ # 对于帧数据:只有在显式发送时才替换(如加载时)
360
+ # 否则由同步循环处理增量更新
361
+ if v is not None:
362
+ setattr(self, k, v)
363
+ else:
364
+ # 其他状态直接更新
365
+ setattr(self, k, v)
366
+
367
+ return res.get("result")
368
+ except queue.Empty:
369
+ raise TimeoutError("工作进程超时")
370
+
371
+ def load_episode(self, env_id, episode_idx):
372
+ """
373
+ 加载环境episode(在工作进程中执行)
374
+
375
+ Args:
376
+ env_id: 环境ID
377
+ episode_idx: episode索引
378
+
379
+ Returns:
380
+ tuple: (PIL.Image, str) 图像和状态消息
381
+ """
382
+ return self._send_cmd(CMD_LOAD_EPISODE, env_id, episode_idx)
383
+
384
+ def execute_action(self, action_idx, click_coords):
385
+ """
386
+ 执行动作(在工作进程中执行,重计算任务)
387
+
388
+ Args:
389
+ action_idx: 动作索引
390
+ click_coords: 点击坐标 (x, y) 或 None
391
+
392
+ Returns:
393
+ tuple: (PIL.Image, str, bool) 图像、状态消息、是否完成
394
+ """
395
+ return self._send_cmd(CMD_EXECUTE_ACTION, action_idx, click_coords)
396
+
397
+ def get_pil_image(self, use_segmented=True):
398
+ """
399
+ 获取PIL图像(在工作进程中执行)
400
+
401
+ Args:
402
+ use_segmented: 是否使用分割视图
403
+
404
+ Returns:
405
+ PIL.Image: 图像对象
406
+ """
407
+ return self._send_cmd(CMD_GET_PIL_IMAGE, use_segmented=use_segmented)
408
+
409
+ def update_observation(self, use_segmentation=True):
410
+ """
411
+ 更新观察(在工作进程中执行)
412
+
413
+ Args:
414
+ use_segmentation: 是否使用分割视图
415
+
416
+ Returns:
417
+ tuple: (PIL.Image, str) 图像和状态消息
418
+ """
419
+ return self._send_cmd(CMD_UPDATE_OBSERVATION, use_segmentation=use_segmentation)
420
+
421
+ def get_reference_action(self):
422
+ """
423
+ 获取当前步参考动作与坐标(在工作进程中执行)
424
+
425
+ Returns:
426
+ dict: 参考动作结果
427
+ """
428
+ return self._send_cmd(CMD_GET_REFERENCE_ACTION)
429
+
430
+ def close(self):
431
+ """
432
+ 关闭代理并清理资源
433
+
434
+ 此方法会:
435
+ 1. 停止帧同步线程
436
+ 2. 发送关闭命令到工作进程
437
+ 3. 等待工作进程优雅退出(最多1秒)
438
+ 4. 如果进程仍在运行,强制终止
439
+ """
440
+ self.stop_sync = True
441
+ try:
442
+ self.cmd_queue.put({"cmd": CMD_CLOSE})
443
+ except:
444
+ pass
445
+ # 等待工作进程优雅退出
446
+ self.process.join(timeout=1)
447
+ if self.process.is_alive():
448
+ self.process.terminate()
gradio-web/scripts/run_background.sh ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # 后台运行脚本 - 统一管理 HistoryBench Gradio 服务器
3
+ # 使用方法: bash run_background.sh [start|stop|status|restart|logs]
4
+
5
+ # 获取脚本所在目录,然后定位到 gradio 目录
6
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
7
+ GRADIO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
8
+ cd "$GRADIO_DIR"
9
+
10
+ # Micromamba 环境路径
11
+ MICROMAMBA_ENV="/data/hongzefu/maniskillenv1120"
12
+
13
+ # 日志目录(放在 gradio 目录中)
14
+ LOG_DIR="$GRADIO_DIR/logs"
15
+ PID_FILE="$GRADIO_DIR/server.pid"
16
+ # 合并日志文件(包含所有输出:标准输出 + 错误输出)
17
+ LOG_FILE="$LOG_DIR/server.log"
18
+
19
+ # 创建日志目录
20
+ mkdir -p "$LOG_DIR"
21
+
22
+ # 函数:启动服务器
23
+ start_server() {
24
+ # 检查是否已经在运行
25
+ if [ -f "$PID_FILE" ]; then
26
+ OLD_PID=$(cat "$PID_FILE")
27
+ if ps -p "$OLD_PID" > /dev/null 2>&1; then
28
+ echo "⚠️ 服务器已经在运行中 (PID: $OLD_PID)"
29
+ echo " 如需重启,请使用: bash $0 restart"
30
+ return 1
31
+ else
32
+ echo "清理旧的 PID 文件..."
33
+ rm -f "$PID_FILE"
34
+ fi
35
+ fi
36
+
37
+ # 检查 micromamba 环境是否存在
38
+ if [ ! -d "$MICROMAMBA_ENV" ]; then
39
+ echo "❌ 错误: Micromamba 环境不存在: $MICROMAMBA_ENV"
40
+ return 1
41
+ fi
42
+
43
+ # 检查 Python 可执行文件
44
+ PYTHON_EXE="$MICROMAMBA_ENV/bin/python"
45
+ if [ ! -f "$PYTHON_EXE" ]; then
46
+ echo "❌ 错误: Python 可执行文件不存在: $PYTHON_EXE"
47
+ return 1
48
+ fi
49
+
50
+ # 启动服务器
51
+ echo "🚀 正在后台启动服务器..."
52
+ echo " Micromamba 环境: $MICROMAMBA_ENV"
53
+ echo " Python 可执行文件: $PYTHON_EXE"
54
+ echo " 工作目录: $GRADIO_DIR"
55
+ echo " 日志文件: $LOG_FILE"
56
+ echo ""
57
+
58
+ # 使用环境中的 Python 直接运行服务器
59
+ # 使用 nohup 在后台运行,并将所有输出重定向到日志文件
60
+ # 设置环境变量以确保使用环境中的包和正确的输出行为
61
+ # 使用 unbuffered 模式 (-u) 和 PYTHONUNBUFFERED=1 确保输出立即写入,不缓冲
62
+ # 使用 stdbuf -oL -eL 确保行缓冲输出(如果可用)
63
+ # 将标准输出和错误输出合并到一个文件 (2>&1),这样所有日志都会完整显示
64
+ # 使用 >> 追加模式,确保日志不会覆盖
65
+
66
+ # 检查是否可以使用 stdbuf(Linux 系统通常有)
67
+ if command -v stdbuf >/dev/null 2>&1; then
68
+ # 使用 stdbuf 确保行缓冲输出,所有 print 和日志都会实时写入
69
+ nohup env PATH="$MICROMAMBA_ENV/bin:$PATH" \
70
+ PYTHONUNBUFFERED=1 \
71
+ PYTHONIOENCODING=utf-8 \
72
+ stdbuf -oL -eL "$PYTHON_EXE" -u "$GRADIO_DIR/main.py" >> "$LOG_FILE" 2>&1 &
73
+ else
74
+ # 如果没有 stdbuf,使用 Python 的 unbuffered 模式
75
+ # 仍然设置所有必要的环境变量确保输出实时写入
76
+ nohup env PATH="$MICROMAMBA_ENV/bin:$PATH" \
77
+ PYTHONUNBUFFERED=1 \
78
+ PYTHONIOENCODING=utf-8 \
79
+ "$PYTHON_EXE" -u "$GRADIO_DIR/main.py" >> "$LOG_FILE" 2>&1 &
80
+ fi
81
+
82
+ # 保存进程ID
83
+ SERVER_PID=$!
84
+ echo $SERVER_PID > "$PID_FILE"
85
+
86
+ # 等待一下,检查进程是否成功启动
87
+ sleep 3
88
+
89
+ if ps -p "$SERVER_PID" > /dev/null 2>&1; then
90
+ echo "✅ 服务器已成功在后台启动!"
91
+ echo " PID: $SERVER_PID"
92
+ echo " Micromamba 环境: $MICROMAMBA_ENV"
93
+ echo " 日志文件: $LOG_FILE"
94
+ echo ""
95
+ echo "📋 常用命令:"
96
+ echo " 查看实时日志: bash $0 logs"
97
+ echo " 查看状态: bash $0 status"
98
+ echo " 停止服务器: bash $0 stop"
99
+ echo ""
100
+ echo "💡 提示:"
101
+ echo " - 所有输出都保存在 $LOG_FILE(包括所有 print、uvicorn 日志等)"
102
+ echo " - 日志实时写入,与前台运行完全一致"
103
+ echo " - 即使关闭SSH连接,服务器也会继续运行"
104
+ echo " - 使用 PYTHONUNBUFFERED=1 和 stdbuf 确保日志实时写入"
105
+ echo ""
106
+ echo "🌐 服务器启动后,请查看日志文件获取访问地址:"
107
+ echo " bash $0 logs"
108
+ return 0
109
+ else
110
+ echo "❌ 服务器启动失败!"
111
+ echo " 请查看完整日志: $LOG_FILE"
112
+ rm -f "$PID_FILE"
113
+ return 1
114
+ fi
115
+ }
116
+
117
+ # 函数:停止服务器
118
+ stop_server() {
119
+ # 检查PID文件是否存在
120
+ if [ ! -f "$PID_FILE" ]; then
121
+ echo "⚠️ 未找到 PID 文件,服务器可能未运行"
122
+ return 1
123
+ fi
124
+
125
+ # 读取PID
126
+ SERVER_PID=$(cat "$PID_FILE")
127
+
128
+ # 检查进程是否存在
129
+ if ! ps -p "$SERVER_PID" > /dev/null 2>&1; then
130
+ echo "⚠️ 进程 $SERVER_PID 不存在,可能已经停止"
131
+ rm -f "$PID_FILE"
132
+ return 1
133
+ fi
134
+
135
+ # 停止进程
136
+ echo "🛑 正在停止服务器 (PID: $SERVER_PID)..."
137
+ kill "$SERVER_PID"
138
+
139
+ # 等待进程结束(最多等待10秒)
140
+ for i in {1..10}; do
141
+ if ! ps -p "$SERVER_PID" > /dev/null 2>&1; then
142
+ echo "✅ 服务器已成功停止"
143
+ rm -f "$PID_FILE"
144
+ return 0
145
+ fi
146
+ sleep 1
147
+ done
148
+
149
+ # 如果还在运行,强制杀死
150
+ if ps -p "$SERVER_PID" > /dev/null 2>&1; then
151
+ echo "⚠️ 进程未响应,强制终止..."
152
+ kill -9 "$SERVER_PID"
153
+ sleep 1
154
+ if ! ps -p "$SERVER_PID" > /dev/null 2>&1; then
155
+ echo "✅ 服务器已强制停止"
156
+ rm -f "$PID_FILE"
157
+ return 0
158
+ else
159
+ echo "❌ 无法停止服务器,请手动检查"
160
+ return 1
161
+ fi
162
+ fi
163
+ }
164
+
165
+ # 函数:查看服务器状态
166
+ status_server() {
167
+ echo "📊 服务器状态信息"
168
+ echo "=========================================="
169
+
170
+ # 检查PID文件
171
+ if [ ! -f "$PID_FILE" ]; then
172
+ echo "❌ 服务器未运行 (未找到 PID 文件)"
173
+ return 1
174
+ fi
175
+
176
+ SERVER_PID=$(cat "$PID_FILE")
177
+
178
+ # 检查进程是否存在
179
+ if ps -p "$SERVER_PID" > /dev/null 2>&1; then
180
+ echo "✅ 服务器正在运行"
181
+ echo " PID: $SERVER_PID"
182
+ echo ""
183
+
184
+ # 显示进程信息
185
+ echo "📋 进程信息:"
186
+ ps -p "$SERVER_PID" -o pid,ppid,user,%cpu,%mem,etime,cmd
187
+ echo ""
188
+
189
+ # 显示日志文件信息
190
+ if [ -f "$LOG_FILE" ]; then
191
+ LOG_SIZE=$(du -h "$LOG_FILE" | cut -f1)
192
+ LOG_LINES=$(wc -l < "$LOG_FILE" 2>/dev/null || echo "0")
193
+ echo "📄 日志文件信息:"
194
+ echo " 文件: $LOG_FILE"
195
+ echo " 大小: $LOG_SIZE"
196
+ echo " 行数: $LOG_LINES"
197
+ echo " 最后修改: $(stat -c %y "$LOG_FILE" 2>/dev/null || stat -f %Sm "$LOG_FILE" 2>/dev/null || echo "未知")"
198
+ fi
199
+
200
+ # 显示最后几行日志
201
+ if [ -f "$LOG_FILE" ]; then
202
+ echo ""
203
+ echo "📝 最近的日志输出 (最后10行):"
204
+ echo "----------------------------------------"
205
+ tail -n 10 "$LOG_FILE"
206
+ fi
207
+ return 0
208
+ else
209
+ echo "❌ 服务器未运行 (进程 $SERVER_PID 不存在)"
210
+ echo " 清理 PID 文件..."
211
+ rm -f "$PID_FILE"
212
+ return 1
213
+ fi
214
+ }
215
+
216
+ # 函数:重启服务器
217
+ restart_server() {
218
+ echo "🔄 正在重启服务器..."
219
+ stop_server
220
+ sleep 2
221
+ start_server
222
+ }
223
+
224
+ # 函数:查看日志
225
+ view_logs() {
226
+ if [ ! -f "$LOG_FILE" ]; then
227
+ echo "⚠️ 日志文件不存在: $LOG_FILE"
228
+ return 1
229
+ fi
230
+
231
+ echo "📝 查看服务器日志 (按 Ctrl+C 退出)"
232
+ echo "=========================================="
233
+ tail -f "$LOG_FILE"
234
+ }
235
+
236
+ # 函数:显示帮助信息
237
+ show_help() {
238
+ echo "HistoryBench 服务器管理脚本"
239
+ echo ""
240
+ echo "使用方法:"
241
+ echo " bash $0 [命令]"
242
+ echo ""
243
+ echo "可用命令:"
244
+ echo " start - 启动服务器(后台运行)"
245
+ echo " stop - 停止服务器"
246
+ echo " status - 查看服务器状态"
247
+ echo " restart - 重启服务器"
248
+ echo " logs - 查看实时日志(按 Ctrl+C 退出)"
249
+ echo " help - 显示此帮助信息"
250
+ echo ""
251
+ echo "示例:"
252
+ echo " bash $0 start # 启动服务器"
253
+ echo " bash $0 status # 查看状态"
254
+ echo " bash $0 logs # 查看日志"
255
+ echo " bash $0 stop # 停止服务器"
256
+ echo ""
257
+ }
258
+
259
+ # 主逻辑:根据命令行参数执行相应操作
260
+ case "${1:-help}" in
261
+ start)
262
+ start_server
263
+ ;;
264
+ stop)
265
+ stop_server
266
+ ;;
267
+ status)
268
+ status_server
269
+ ;;
270
+ restart)
271
+ restart_server
272
+ ;;
273
+ logs)
274
+ view_logs
275
+ ;;
276
+ help|--help|-h)
277
+ show_help
278
+ ;;
279
+ *)
280
+ echo "❌ 未知命令: $1"
281
+ echo ""
282
+ show_help
283
+ exit 1
284
+ ;;
285
+ esac
286
+
287
+ exit $?
gradio-web/scripts/后台运行说明.md ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HistoryBench 后台运行说明
2
+
3
+ 本文档说明如何在后台运行 HistoryBench Gradio 服务器。
4
+
5
+ ## 命令概览
6
+
7
+ ```bash
8
+ # 启动服务器
9
+ bash scripts/run_background.sh start
10
+
11
+ # 查看状态
12
+ bash scripts/run_background.sh status
13
+
14
+ # 查看日志
15
+ bash scripts/run_background.sh logs
16
+
17
+ # 停止服务器
18
+ bash scripts/run_background.sh stop
19
+
20
+ # 重启服务器
21
+ bash scripts/run_background.sh restart
22
+
23
+ # 查看帮助
24
+ bash scripts/run_background.sh help
25
+ ```
26
+
27
+ ## 快速开始
28
+
29
+ ### 启动服务器(后台运行)
30
+
31
+ ```bash
32
+ cd /data/hongzefu/historybench-v5.6.7/gradio
33
+ bash scripts/run_background.sh start
34
+ ```
35
+
36
+ ### 查看服务器状态
37
+
38
+ ```bash
39
+ bash scripts/run_background.sh status
40
+ ```
41
+
42
+ ### 查看实时日志
43
+
44
+ ```bash
45
+ # 方法1: 使用脚本命令(推荐)
46
+ bash scripts/run_background.sh logs
47
+
48
+ # 方法2: 直接使用 tail 命令
49
+ tail -f logs/server.log
50
+
51
+ # 方法3: 查看最后100行日志
52
+ tail -n 100 logs/server.log
53
+ ```
54
+
55
+ ### 停止服务器
56
+
57
+ ```bash
58
+ bash scripts/run_background.sh stop
59
+ ```
60
+
61
+ ### 重启服务器
62
+
63
+ ```bash
64
+ bash scripts/run_background.sh restart
65
+ ```
66
+
67
+ ### 查看帮助信息
68
+
69
+ ```bash
70
+ bash scripts/run_background.sh help
71
+ ```
72
+
73
+ ## 脚本功能说明
74
+
75
+ ### run_background.sh
76
+
77
+ 统一的后台运行管理脚本,支持以下命令:
78
+
79
+ #### 可用命令
80
+
81
+ - **`start`** - 启动服务器(后台运行)
82
+ - 自动检测服务器是否已在运行
83
+ - 使用指定的 Micromamba 环境运行
84
+ - 将所有输出(标准输出和错误输出)保存到日志文件
85
+ - 使用 `nohup` 确保即使关闭 SSH 连接也能继续运行
86
+ - 自动保存进程 ID 到 `server.pid` 文件
87
+ - 启动后自动验证进程是否成功运行
88
+
89
+ - **`stop`** - 停止服务器
90
+ - 优雅地停止服务器进程
91
+ - 如果进程无响应,会自动强制终止
92
+ - 自动清理 PID 文件
93
+
94
+ - **`status`** - 查看服务器状态
95
+ - 显示进程信息(PID、CPU、内存、运行时间等)
96
+ - 显示日志文件信息(大小、行数、最后修改时间)
97
+ - 显示最近的日志输出(最后10行)
98
+
99
+ - **`restart`** - 重启服务器
100
+ - 先停止服务器,然后重新启动
101
+
102
+ - **`logs`** - 查看实时日志
103
+ - 实时显示日志输出(类似 `tail -f`)
104
+ - 按 Ctrl+C 退出
105
+
106
+ - **`help`** - 显示帮助信息
107
+
108
+ ### 配置信息
109
+
110
+ 脚本使用以下默认配置:
111
+
112
+ - **Micromamba 环境**: `/data/hongzefu/maniskillenv1114`
113
+ - **工作目录**: `/data/hongzefu/historybench-v5.6.7/gradio`
114
+ - **日志目录**: `logs/`
115
+ - **日志文件**: `logs/server.log`
116
+ - **PID 文件**: `server.pid`
117
+
118
+ ### 修改配置
119
+
120
+ 如果需要修改配置,请编辑 `run_background.sh` 脚本中的以下变量:
121
+
122
+ ```bash
123
+ # Micromamba 环境路径
124
+ MICROMAMBA_ENV="/data/hongzefu/maniskillenv1114"
125
+
126
+ # 日志目录
127
+ LOG_DIR="$GRADIO_DIR/logs"
128
+
129
+ # PID 文件
130
+ PID_FILE="$GRADIO_DIR/server.pid"
131
+ ```
132
+
133
+ ## 使用场景
134
+
135
+ ### 场景1: 首次启动服务器
136
+
137
+ ```bash
138
+ cd /data/hongzefu/historybench-v5.6.7/gradio
139
+ bash scripts/run_background.sh start
140
+ ```
141
+
142
+ 启动后,查看日志获取服务器访问地址:
143
+
144
+ ```bash
145
+ bash scripts/run_background.sh logs
146
+ ```
147
+
148
+ ### 场景2: 检查服务器是否运行
149
+
150
+ ```bash
151
+ bash scripts/run_background.sh status
152
+ ```
153
+
154
+ 或者手动检查:
155
+
156
+ ```bash
157
+ # 检查 PID 文件
158
+ cat server.pid
159
+
160
+ # 检查进程
161
+ ps aux | grep main.py
162
+ ```
163
+
164
+ ### 场景3: 重启服务器
165
+
166
+ ```bash
167
+ bash scripts/run_background.sh restart
168
+ ```
169
+
170
+ ### 场景4: 查看错误日志
171
+
172
+ ```bash
173
+ # 查看最后50行日志
174
+ tail -n 50 logs/server.log
175
+
176
+ # 搜索错误信息
177
+ grep -i error logs/server.log
178
+
179
+ # 搜索警告信息
180
+ grep -i warning logs/server.log
181
+
182
+ # 或使用脚本查看实时日志
183
+ bash scripts/run_background.sh logs
184
+ ```
185
+
186
+ ## 注意事项
187
+
188
+ 1. **脚本位置**: 脚本位于 `scripts/run_background.sh`,可以从任何位置运行,使用绝对路径或相对路径
189
+
190
+ 2. **日志文件位置**: 日志文件保存在 `gradio/logs/server.log`,包含所有标准输出和错误输出
191
+
192
+ 3. **进程持久化**: 使用 `nohup` 确保即使关闭 SSH 连接,服务器也会继续运行
193
+
194
+ 4. **日志实时写入**: 使用 `PYTHONUNBUFFERED=1` 和 `-u` 参数确保日志实时写入,方便调试
195
+
196
+ 5. **端口冲突**: 如果端口被占用,服务器会自动查找下一个可用端口(从 7860 开始)
197
+
198
+ 6. **环境变量**: 脚本会自动设置 `PATH` 环境变量,确保使用 Micromamba 环境中的 Python 和依赖包
199
+
200
+ ## 故障排查
201
+
202
+ ### 问题1: 服务器启动失败
203
+
204
+ **检查步骤**:
205
+
206
+ 1. 查看日志文件:
207
+ ```bash
208
+ tail -n 100 logs/server.log
209
+ ```
210
+
211
+ 2. 检查 Micromamba 环境是否存在:
212
+ ```bash
213
+ ls -la /data/hongzefu/maniskillenv1114
214
+ ```
215
+
216
+ 3. 检查 Python 可执行文件:
217
+ ```bash
218
+ /data/hongzefu/maniskillenv1114/bin/python --version
219
+ ```
220
+
221
+ ### 问题2: 端口已被占用
222
+
223
+ 服务器会自动查找可用端口,但如果你想手动指定端口,可以修改 `main.py` 中的 `find_free_port()` 函数。
224
+
225
+ ### 问题3: 无法访问服务器
226
+
227
+ 1. 检查服务器是否正在运行:
228
+ ```bash
229
+ bash scripts/run_background.sh status
230
+ ```
231
+
232
+ 2. 查看日志获取正确的访问地址:
233
+ ```bash
234
+ tail -n 50 logs/server.log | grep -E "(http://|SERVER STARTING)"
235
+ ```
236
+
237
+ 3. 检查防火墙设置(如果需要从外部访问)
238
+
239
+ ### 问题4: 进程意外退出
240
+
241
+ 1. 查看日志文件中的错误信息:
242
+ ```bash
243
+ grep -i error logs/server.log | tail -n 20
244
+ ```
245
+
246
+ 2. 检查系统资源(内存、磁盘空间等)
247
+
248
+ 3. 检查依赖包是否完整安装
249
+
250
+ ## 相关脚本
251
+
252
+ - `run_background.sh` - 统一管理脚本
253
+ - 支持 `start`、`stop`、`status`、`restart`、`logs`、`help` 命令
254
+ - 功能完整,使用方便
255
+
256
+ ## 技术细节
257
+
258
+ ### 后台运行机制
259
+
260
+ 脚本使用以下技术实现后台运行:
261
+
262
+ 1. **nohup**: 防止进程在终端关闭时被终止
263
+ 2. **重定向输出**: `>> "$LOG_FILE" 2>&1` 将所有输出保存到日志文件
264
+ 3. **PID 文件**: 保存进程 ID,方便后续管理
265
+ 4. **环境变量**: 设置 `PATH` 和 `PYTHONUNBUFFERED` 确保正确运行
266
+
267
+ ### 日志管理
268
+
269
+ - **所有输出都会被捕获**:包括所有 `print()` 语句、uvicorn 的访问日志、错误日志等
270
+ - 日志文件位置:`logs/server.log`
271
+ - 实时写入机制:
272
+ - 使用 `PYTHONUNBUFFERED=1` 和 `-u` 参数确保 Python 输出不缓冲
273
+ - 使用 `stdbuf -oL -eL` 确保行缓冲输出(如果系统支持)
274
+ - 所有标准输出和错误输出都重定向到日志文件(`2>&1`)
275
+ - 日志内容:
276
+ - 所有 `print()` 输出
277
+ - uvicorn 服务器日志(启动信息、请求日志等)
278
+ - FastAPI 应用日志
279
+ - Gradio 界面日志
280
+ - 错误和异常信息
281
+ - 日志文件会持续增长,建议定期清理或使用日志轮转工具
282
+
283
+ ## 联系与支持
284
+
285
+ 如有问题,请查看:
286
+ - 日志文件: `logs/server.log`
287
+ - 项目文档: 项目根目录的 README 文件
288
+
gradio-web/state_manager.py ADDED
@@ -0,0 +1,473 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 状态管理模块
3
+ 管理所有全局状态和Session生命周期
4
+
5
+ 本模块负责:
6
+ 1. 创建和管理 ProcessSessionProxy 实例(每个用户一个)
7
+ 2. 存储任务索引等UI状态
8
+ 3. 提供线程安全的访问接口
9
+ 4. 清理会话资源(当用户重复登录时,自动清理旧会话的进程和状态)
10
+
11
+ 注意:GLOBAL_SESSIONS 中存储的是 ProcessSessionProxy 对象,而不是 OracleSession。
12
+ 实际的 OracleSession 运行在独立的工作进程中,通过代理对象进行通信。
13
+ 当同一用户第二次登录时,系统会自动清理旧会话的所有资源(进程、RAM、VRAM、状态数据等)。
14
+ """
15
+ import uuid
16
+ import threading
17
+ import traceback
18
+ import time
19
+ from process_session import ProcessSessionProxy
20
+
21
+ # --- 全局会话存储 ---
22
+ # 存储所有用户的 ProcessSessionProxy 实例
23
+ # 每个用户登录时会创建一个代理,代理会启动一个独立的工作进程运行 OracleSession
24
+ GLOBAL_SESSIONS = {}
25
+
26
+ # --- 任务索引存储(用于进度显示) ---
27
+ # 存储每个session的任务索引和总任务数,用于直接读取Progress
28
+ TASK_INDEX_MAP = {} # {uid: {"task_index": int, "total_tasks": int}}
29
+
30
+ # --- UI阶段存储 ---
31
+ # 存储每个session的UI阶段:"watching_demo" 或 "executing_task"
32
+ UI_PHASE_MAP = {} # {uid: "watching_demo" | "executing_task"}
33
+
34
+ # --- Execute 次数跟踪 ---
35
+ # 跟踪每个会话每个任务的 execute 次数
36
+ # 键格式: "{uid}:{env_id}:{episode_idx}"
37
+ EXECUTE_COUNTS = {} # {task_key: count}
38
+
39
+ # --- 任务开始时间跟踪 ---
40
+ # 跟踪每个任务的开始时间
41
+ # 键格式: "{uid}:{env_id}:{episode_idx}"
42
+ # 值: ISO 格式的时间字符串
43
+ TASK_START_TIMES = {} # {task_key: "2025-12-28T14:01:25.372278"}
44
+
45
+ # --- Session活动时间跟踪 ---
46
+ # 跟踪每个session的最后活动时间(用于超时检测)
47
+ SESSION_LAST_ACTIVITY = {} # {uid: timestamp} - timestamp是time.time()返回的浮点数
48
+ SESSION_TIMEOUT_WARNED = {} # {uid: bool} - 跟踪已警告的session,避免重复警告
49
+
50
+ # --- 播放按钮状态跟踪 ---
51
+ # 跟踪每个session的播放按钮是否已被点击(用于execute按钮条件控制)
52
+ PLAY_BUTTON_CLICKED = {} # {uid: bool} - 跟踪播放按钮是否已被点击
53
+
54
+ # 线程锁,用于保护全局状态的访问
55
+ _state_lock = threading.Lock()
56
+
57
+
58
def get_session(uid):
    """Look up the ProcessSessionProxy registered for a session id.

    Args:
        uid: Session identifier.

    Returns:
        The ProcessSessionProxy for *uid* (same interface as OracleSession),
        or None if no such session exists.
    """
    with _state_lock:
        proxy = GLOBAL_SESSIONS.get(uid)
    return proxy
70
+
71
+
72
def create_session():
    """Create a new session and return its uid.

    Steps performed:
      1. Generate a unique session id (UUID4).
      2. Build a ProcessSessionProxy, which spawns a dedicated worker
         process running the actual OracleSession.
      3. Register the proxy in GLOBAL_SESSIONS.
      4. Seed SESSION_LAST_ACTIVITY with the current time.

    Returns:
        str: The newly created session id.
    """
    new_uid = str(uuid.uuid4())
    proxy = ProcessSessionProxy()
    with _state_lock:
        GLOBAL_SESSIONS[new_uid] = proxy
        SESSION_LAST_ACTIVITY[new_uid] = time.time()
    return new_uid
92
+
93
+
94
def get_task_index(uid):
    """Return the stored task-index record for *uid*, or None if absent."""
    with _state_lock:
        record = TASK_INDEX_MAP.get(uid)
    return record
98
+
99
+
100
def set_task_index(uid, task_index, total_tasks):
    """Record the current task index and total task count for a session."""
    record = {"task_index": task_index, "total_tasks": total_tasks}
    with _state_lock:
        TASK_INDEX_MAP[uid] = record
107
+
108
+
109
def get_ui_phase(uid):
    """Return the UI phase for *uid*; defaults to "watching_demo"."""
    with _state_lock:
        phase = UI_PHASE_MAP.get(uid, "watching_demo")
    return phase
113
+
114
+
115
def set_ui_phase(uid, phase):
    """Store the UI phase for a session.

    Args:
        uid: Session id.
        phase: Either "watching_demo" or "executing_task".
    """
    with _state_lock:
        UI_PHASE_MAP[uid] = phase
124
+
125
+
126
def reset_ui_phase(uid):
    """Reset the session's UI phase back to the initial "watching_demo" stage."""
    set_ui_phase(uid, "watching_demo")
130
+
131
+
132
def set_play_button_clicked(uid, clicked=True):
    """Record whether the play button has been pressed for this session.

    Args:
        uid: Session id.
        clicked: Flag value to store (defaults to True).
    """
    with _state_lock:
        PLAY_BUTTON_CLICKED[uid] = clicked
142
+
143
+
144
def get_play_button_clicked(uid):
    """Tell whether the play button was pressed for this session.

    Args:
        uid: Session id.

    Returns:
        bool: True once recorded as clicked, False otherwise.
    """
    with _state_lock:
        clicked = PLAY_BUTTON_CLICKED.get(uid, False)
    return clicked
156
+
157
+
158
def reset_play_button_clicked(uid):
    """Forget the play-button state for a session (no-op if not recorded)."""
    with _state_lock:
        PLAY_BUTTON_CLICKED.pop(uid, None)
168
+
169
+
170
+ def _get_task_key(uid, env_id, episode_idx):
171
+ """��成任务键(用于跟踪 execute 次数)"""
172
+ return f"{uid}:{env_id}:{episode_idx}"
173
+
174
+
175
def get_execute_count(uid, env_id, episode_idx):
    """Return how many times the given task has been executed.

    Args:
        uid: Session id.
        env_id: Environment id.
        episode_idx: Episode index.

    Returns:
        int: Recorded execute count (0 when the task has no record).
    """
    key = _get_task_key(uid, env_id, episode_idx)
    with _state_lock:
        count = EXECUTE_COUNTS.get(key, 0)
    return count
190
+
191
+
192
def increment_execute_count(uid, env_id, episode_idx):
    """Increment and return the execute count for the given task.

    Args:
        uid: Session id.
        env_id: Environment id.
        episode_idx: Episode index.

    Returns:
        int: The count after incrementing.
    """
    key = _get_task_key(uid, env_id, episode_idx)
    with _state_lock:
        new_count = EXECUTE_COUNTS.get(key, 0) + 1
        EXECUTE_COUNTS[key] = new_count
    return new_count
209
+
210
+
211
def reset_execute_count(uid, env_id, episode_idx):
    """Reset the execute count for the given task back to zero."""
    key = _get_task_key(uid, env_id, episode_idx)
    with _state_lock:
        EXECUTE_COUNTS[key] = 0
223
+
224
+
225
def get_task_start_time(uid, env_id, episode_idx):
    """Return the recorded start time for a task.

    Returns:
        str | None: ISO-format timestamp, or None when no start time is stored.
    """
    key = _get_task_key(uid, env_id, episode_idx)
    with _state_lock:
        started = TASK_START_TIMES.get(key)
    return started
240
+
241
+
242
def set_task_start_time(uid, env_id, episode_idx, start_time):
    """Store the start time for a task.

    Args:
        uid: Session id.
        env_id: Environment id.
        episode_idx: Episode index.
        start_time: ISO-format timestamp string.
    """
    key = _get_task_key(uid, env_id, episode_idx)
    with _state_lock:
        TASK_START_TIMES[key] = start_time
255
+
256
+
257
def clear_task_start_time(uid, env_id, episode_idx):
    """Drop the stored start time for a task (no-op when absent)."""
    key = _get_task_key(uid, env_id, episode_idx)
    with _state_lock:
        TASK_START_TIMES.pop(key, None)
270
+
271
+
272
def cleanup_session(uid):
    """Release every resource associated with a session.

    Cleans up, in order:
      1. The ProcessSessionProxy (terminates the worker process, freeing
         RAM/VRAM) and its entry in GLOBAL_SESSIONS.
      2. Per-session UI state: task index, UI phase, play-button flag.
      3. Activity tracking: last-activity timestamp and timeout-warning flag.

    EXECUTE_COUNTS is deliberately left untouched — it is keyed per task,
    not per session (reset via reset_execute_count on task switch).

    Args:
        uid: Session id to clean up. Falsy values are ignored.
    """
    if not uid:
        return

    with _state_lock:
        # 1. Close the ProcessSessionProxy (terminates the worker process).
        session = GLOBAL_SESSIONS.get(uid)
        if session:
            try:
                print(f"Cleaning up session {uid}: closing ProcessSessionProxy...")
                session.close()
                print(f"Session {uid}: ProcessSessionProxy closed successfully")
            except Exception as e:
                # Best-effort: log and continue so remaining state still gets cleared.
                print(f"Error closing ProcessSessionProxy for {uid}: {e}")
                traceback.print_exc()

        # 2. Remove the proxy from the global registry.
        if uid in GLOBAL_SESSIONS:
            del GLOBAL_SESSIONS[uid]
            print(f"Session {uid}: removed from GLOBAL_SESSIONS")

        # 3. Clear the task-index record.
        if uid in TASK_INDEX_MAP:
            del TASK_INDEX_MAP[uid]
            print(f"Session {uid}: task index cleaned up")

        # 4. Clear the UI phase.
        if uid in UI_PHASE_MAP:
            del UI_PHASE_MAP[uid]

        # Clear the play-button flag.
        # NOTE(review): the log line below says "UI phase" although it fires
        # when the play-button flag is removed — confirm intended wording.
        if uid in PLAY_BUTTON_CLICKED:
            del PLAY_BUTTON_CLICKED[uid]
            print(f"Session {uid}: UI phase cleaned up")

        # 5. Clear the last-activity timestamp.
        if uid in SESSION_LAST_ACTIVITY:
            del SESSION_LAST_ACTIVITY[uid]
            print(f"Session {uid}: last activity time cleaned up")

        # 6. Clear the timeout-warning flag.
        if uid in SESSION_TIMEOUT_WARNED:
            del SESSION_TIMEOUT_WARNED[uid]
            print(f"Session {uid}: timeout warning flag cleaned up")

        # EXECUTE_COUNTS is intentionally not cleared here (per-task keying);
        # use reset_execute_count when switching tasks.

        print(f"Session {uid}: all resources cleaned up successfully")
332
+
333
+
334
def update_session_activity(uid):
    """Mark a session as active right now.

    Refreshes the last-activity timestamp and clears any pending
    timeout warning for the session.

    Args:
        uid: Session id.
    """
    now = time.time()
    with _state_lock:
        SESSION_LAST_ACTIVITY[uid] = now
        SESSION_TIMEOUT_WARNED.pop(uid, None)
346
+
347
+
348
def get_session_activity(uid):
    """Return the session's last-activity timestamp.

    Args:
        uid: Session id.

    Returns:
        float | None: time.time()-style timestamp, or None if unknown.
    """
    with _state_lock:
        last_seen = SESSION_LAST_ACTIVITY.get(uid)
    return last_seen
360
+
361
+
362
def check_and_cleanup_timeout_sessions():
    """Scan all sessions and clean up the ones that have timed out.

    Two-stage policy:
      1. A session idle for more than SESSION_TIMEOUT seconds is flagged
         (warned) once.
      2. A flagged session that stays idle for GRACE more seconds is
         cleaned up via cleanup_session().

    Fix vs. original: removed the dead ``timeout_sessions`` accumulator
    (appended to but never read) and named the duplicated magic ``5``
    grace constant. Behavior is otherwise unchanged.
    """
    # Local import to avoid a module-level import cycle with config.
    from config import SESSION_TIMEOUT

    # Extra seconds of grace after the warning before actual cleanup.
    grace_period = 5

    current_time = time.time()
    warned_sessions_to_cleanup = []

    with _state_lock:
        # Snapshot the active uids so we never hold the lock over the scan.
        active_uids = list(GLOBAL_SESSIONS.keys())

    for uid in active_uids:
        # Short-lived lock per session keeps other callers responsive.
        with _state_lock:
            last_activity = SESSION_LAST_ACTIVITY.get(uid)
            is_warned = SESSION_TIMEOUT_WARNED.get(uid, False)

        if last_activity is None:
            # No activity record yet (session may have just been created).
            continue

        elapsed = current_time - last_activity

        if elapsed > SESSION_TIMEOUT:
            if not is_warned:
                # First time over the limit: flag it and log a warning.
                with _state_lock:
                    SESSION_TIMEOUT_WARNED[uid] = True
                print(f"Session {uid}: 超时警告 - 已超过 {SESSION_TIMEOUT} 秒未活动")
            elif elapsed > SESSION_TIMEOUT + grace_period:
                # Already warned and still idle past the grace period.
                warned_sessions_to_cleanup.append(uid)

    for uid in warned_sessions_to_cleanup:
        print(f"Session {uid}: 超时清理 - 已超过 {SESSION_TIMEOUT + grace_period} 秒未活动,开始清理资源")
        cleanup_session(uid)
        # cleanup_session also removes the SESSION_LAST_ACTIVITY and
        # SESSION_TIMEOUT_WARNED entries for this uid.
409
+
410
+
411
+ # 后台监控线程相关变量
412
+ _timeout_monitor_thread = None
413
+ _timeout_monitor_running = False
414
+ _timeout_monitor_lock = threading.Lock()
415
+
416
+
417
def _timeout_monitor_loop():
    """Main loop of the background monitor thread.

    Runs check_and_cleanup_timeout_sessions() roughly every 5 seconds
    until _timeout_monitor_running is cleared. Exceptions are logged and
    swallowed so one bad scan does not kill the monitor.
    """
    global _timeout_monitor_running
    while _timeout_monitor_running:
        try:
            check_and_cleanup_timeout_sessions()
        except Exception as e:
            print(f"Error in timeout monitor loop: {e}")
            traceback.print_exc()

        # Sleep ~5 s total in 0.1 s slices (50 x 0.1) so a stop request
        # is noticed promptly instead of after a full 5 s sleep.
        for _ in range(50):
            if not _timeout_monitor_running:
                break
            time.sleep(0.1)
435
+
436
+
437
def start_timeout_monitor():
    """Start the background session-timeout monitor thread.

    Call once at application startup. Idempotent: if the monitor is
    already running this is a no-op.
    """
    global _timeout_monitor_thread, _timeout_monitor_running

    with _timeout_monitor_lock:
        if _timeout_monitor_running:
            print("Timeout monitor is already running")
            return

        _timeout_monitor_running = True
        # Daemon thread: never blocks interpreter shutdown.
        _timeout_monitor_thread = threading.Thread(
            target=_timeout_monitor_loop,
            daemon=True,
            name="SessionTimeoutMonitor"
        )
        _timeout_monitor_thread.start()
        print("Session timeout monitor started")
458
+
459
def stop_timeout_monitor():
    """Stop the background session-timeout monitor thread.

    Call at application shutdown. Waits up to 2 seconds for the loop to
    exit (the loop polls its flag every 0.1 s, so 2 s is ample).
    """
    global _timeout_monitor_thread, _timeout_monitor_running

    with _timeout_monitor_lock:
        if not _timeout_monitor_running:
            return

        _timeout_monitor_running = False
        if _timeout_monitor_thread:
            _timeout_monitor_thread.join(timeout=2.0)
        print("Session timeout monitor stopped")
gradio-web/test/conftest.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+
10
def _find_repo_root(start_file: str | Path) -> Path:
    """Walk upward from *start_file* to the directory holding pyproject.toml.

    Raises:
        FileNotFoundError: If no ancestor contains a pyproject.toml.
    """
    path = Path(start_file).resolve()
    base = path if path.is_dir() else path.parent
    for directory in (base, *base.parents):
        if (directory / "pyproject.toml").exists():
            return directory
    raise FileNotFoundError(f"Could not find repo root from {path}")
17
+
18
+
19
REPO_ROOT = _find_repo_root(__file__)
SRC_ROOT = REPO_ROOT / "src"
GRADIO_ROOT = REPO_ROOT / "gradio"

# Prepend repo root, src/ and gradio/ to sys.path so tests can import
# the application modules without installation.
# NOTE(review): this commit places the app under "gradio-web" — confirm
# whether GRADIO_ROOT should point there instead of "gradio".
for p in (str(REPO_ROOT), str(SRC_ROOT), str(GRADIO_ROOT)):
    if p not in sys.path:
        sys.path.insert(0, p)
26
+
27
+
28
@pytest.fixture(scope="session")
def repo_root() -> Path:
    """Session-scoped fixture exposing the detected repository root."""
    return REPO_ROOT
31
+
32
+
33
@pytest.fixture
def reload_module():
    """Fixture returning a helper that imports *name* and reloads it.

    Reloading ensures module-level state mutated by earlier tests does
    not leak into the current test.
    """
    def _reload(name: str):
        module = importlib.import_module(name)
        return importlib.reload(module)

    return _reload
gradio-web/test/test_episode98_removed_behavior.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import time
4
+
5
+
6
def test_load_next_task_wrapper_treats_episode98_as_normal(monkeypatch, reload_module):
    """Episode 98 gets no special handling: the wrapper delegates to _load_status_task."""
    callbacks = reload_module("gradio_callbacks")

    # Sentinel return value proves the delegation path was taken unchanged.
    expected = ("SENTINEL",)

    monkeypatch.setattr(callbacks, "update_session_activity", lambda uid: None)
    monkeypatch.setattr(
        callbacks.user_manager,
        "next_episode_same_env",
        lambda uid: {"is_done_all": False, "current_task": {"env_id": "BinFill", "episode_idx": 98}},
    )
    monkeypatch.setattr(callbacks, "_load_status_task", lambda uid, status: expected)

    result = callbacks.load_next_task_wrapper("uid1")

    assert result == expected
22
+
23
+
24
def test_restart_episode_wrapper_reloads_same_episode(monkeypatch, reload_module):
    """Restarting reloads the exact same episode via a single _load_status_task call."""
    callbacks = reload_module("gradio_callbacks")

    load_calls = []
    expected = ("RESTARTED",)

    monkeypatch.setattr(callbacks, "update_session_activity", lambda uid: None)
    monkeypatch.setattr(
        callbacks.user_manager,
        "get_session_status",
        lambda uid: {"is_done_all": False, "current_task": {"env_id": "BinFill", "episode_idx": 98}},
    )

    def _fake_load_status_task(uid, status):
        # Capture arguments so we can assert on the reloaded task below.
        load_calls.append((uid, status))
        return expected

    monkeypatch.setattr(callbacks, "_load_status_task", _fake_load_status_task)

    result = callbacks.restart_episode_wrapper("uid1")

    assert len(load_calls) == 1
    assert load_calls[0][1]["current_task"] == {"env_id": "BinFill", "episode_idx": 98}
    assert result == expected
48
+
49
+
50
def test_restart_episode_wrapper_missing_status_returns_login_failed(monkeypatch, reload_module):
    """When the session status is missing, restart reports a failure message."""
    callbacks = reload_module("gradio_callbacks")

    monkeypatch.setattr(callbacks, "update_session_activity", lambda uid: None)
    monkeypatch.setattr(callbacks.user_manager, "get_session_status", lambda uid: None)

    result = callbacks.restart_episode_wrapper("uid1")

    # Fourth element of the UI tuple carries the status message.
    assert "Failed to restart episode" in result[3]
59
+
60
+
61
def test_execute_step_failed_episode98_still_advances(monkeypatch, reload_module):
    """A FAILED execution on episode 98 still records 'failed' and advances tasks."""
    callbacks = reload_module("gradio_callbacks")

    class _FakeSession:
        # Minimal session double: one option, always-FAILED execution.
        def __init__(self):
            self.env_id = "BinFill"
            self.episode_idx = 98
            self.base_frames = []
            self.raw_solve_options = [{"available": False}]
            self.available_options = [("run", 0)]
            self.difficulty = "hard"
            self.language_goal = "goal"
            self.seed = 123
            self.non_demonstration_task_length = None

        def update_observation(self, use_segmentation=False):
            return None

        def get_pil_image(self, use_segmented=False):
            return "IMG"

        def execute_action(self, option_idx, click_coords):
            # Always report a failed-and-done step.
            return "IMG", "FAILED", True

    fake_session = _FakeSession()
    complete_calls = []

    monkeypatch.setattr(callbacks, "get_session_activity", lambda uid: time.time())
    monkeypatch.setattr(callbacks, "update_session_activity", lambda uid: None)
    monkeypatch.setattr(callbacks, "get_session", lambda uid: fake_session)
    monkeypatch.setattr(callbacks, "increment_execute_count", lambda uid, env_id, episode_idx: 1)

    def _fake_complete_current_task(*args, **kwargs):
        # Record the completion payload and advance to a different task.
        payload = dict(kwargs)
        if args:
            payload["uid"] = args[0]
        complete_calls.append(payload)
        return {"is_done_all": False, "current_task": {"env_id": "MoveCube", "episode_idx": 7}}

    monkeypatch.setattr(callbacks.user_manager, "complete_current_task", _fake_complete_current_task)

    result = callbacks.execute_step("uid1", 0, "No need for coordinates")

    assert len(complete_calls) == 1
    assert complete_calls[0]["episode_idx"] == 98
    assert complete_calls[0]["status"] == "failed"
    assert result[2] == "BinFill (Episode 98)"
gradio-web/test/test_execute_stream_frames.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+
5
+
6
+ class _FakeUnwrapped:
7
+ def __init__(self):
8
+ self.segmentation_id_map = {}
9
+ self.elapsed_steps = 0
10
+
11
+ def evaluate(self, solve_complete_eval=False):
12
+ return {"success": False, "fail": False}
13
+
14
+
15
class _FakeEnv:
    """Env double whose step() emits frames with a per-step constant pixel value.

    Each step increments elapsed_steps and returns an observation whose
    front frame is filled with the step index, letting the test trace
    exactly which frames were captured.
    """

    def __init__(self):
        self.unwrapped = _FakeUnwrapped()
        self._step_idx = 0
        self._last_obs = None

    def step(self, action):
        self._step_idx += 1
        self.unwrapped.elapsed_steps = self._step_idx
        # Frame value == step index, so pixel values encode step order.
        frame = np.full((8, 8, 3), self._step_idx, dtype=np.uint8)
        obs = {"front_rgb_list": frame}
        self._last_obs = obs
        return obs, 0.0, False, False, {}
28
+
29
+
30
def test_execute_action_captures_intermediate_front_frames(monkeypatch, reload_module):
    """execute_action must record every intermediate front frame produced by solve()."""
    oracle_logic = reload_module("oracle_logic")

    monkeypatch.setattr(
        oracle_logic,
        "_fetch_segmentation",
        lambda env: np.zeros((1, 8, 8), dtype=np.int64),
    )
    # One option whose solve() drives the fake env through 3 steps.
    monkeypatch.setattr(
        oracle_logic,
        "_build_solve_options",
        lambda env, planner, selected_target, env_id: [
            {"label": "a", "action": "run", "solve": lambda: [env.step(None) for _ in range(3)]}
        ],
    )

    session = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    session.env = _FakeEnv()
    session.planner = object()
    session.env_id = "BinFill"
    session.color_map = {}

    _img, status, done = session.execute_action(0, None)

    # Captured during solve(): 1,2,3. update_observation may append the last frame again.
    pixel_trace = [int(frame[0, 0, 0]) for frame in session.base_frames]
    assert pixel_trace[:3] == [1, 2, 3]
    assert len(pixel_trace) >= 3
    assert status.startswith("Executing: a")
    assert done is False
gradio-web/test/test_live_obs_refresh.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+ from PIL import Image
5
+
6
+
7
+ class _FakeSession:
8
+ def __init__(self, frames, env_id="BinFill"):
9
+ self.base_frames = frames
10
+ self.env_id = env_id
11
+
12
+
13
def test_refresh_live_obs_skips_when_not_execution_phase(monkeypatch, reload_module):
    """Outside execution playback, refresh_live_obs emits an update without a value."""
    callbacks = reload_module("gradio_callbacks")
    monkeypatch.setattr(callbacks, "get_session", lambda uid: _FakeSession([]))

    update = callbacks.refresh_live_obs("uid-1", "action_keypoint")

    assert update.get("__type__") == "update"
    # No "value" key => the displayed image is left untouched.
    assert "value" not in update
21
+
22
+
23
def test_refresh_live_obs_updates_image_from_latest_frame(monkeypatch, reload_module):
    """During execution playback, frames drain FIFO with 2x downsampling."""
    callbacks = reload_module("gradio_callbacks")
    # Distinct constant-valued frames let pixel checks identify each frame.
    frame0 = np.zeros((8, 8, 3), dtype=np.uint8)
    frame1 = np.full((8, 8, 3), 11, dtype=np.uint8)
    frame2 = np.full((8, 8, 3), 22, dtype=np.uint8)
    frame3 = np.full((8, 8, 3), 33, dtype=np.uint8)
    frame4 = np.full((8, 8, 3), 44, dtype=np.uint8)
    session = _FakeSession([frame0])
    monkeypatch.setattr(callbacks, "get_session", lambda uid: session)

    # Reset queue state at execute start (cursor anchored at current base_frames length).
    callbacks.switch_to_execute_phase("uid-2")
    session.base_frames.extend([frame1, frame2, frame3, frame4])

    # Downsample x2 + FIFO => first frame1, then frame3.
    update1 = callbacks.refresh_live_obs("uid-2", "execution_playback")
    update2 = callbacks.refresh_live_obs("uid-2", "execution_playback")
    update3 = callbacks.refresh_live_obs("uid-2", "execution_playback")

    assert update1.get("__type__") == "update"
    assert update1.get("interactive") is False
    assert isinstance(update1.get("value"), Image.Image)
    assert update1["value"].getpixel((0, 0)) == (11, 11, 11)

    assert update2.get("__type__") == "update"
    assert update2.get("interactive") is False
    assert isinstance(update2.get("value"), Image.Image)
    assert update2["value"].getpixel((0, 0)) == (33, 33, 33)

    # Queue drained, so no further value update.
    assert update3.get("__type__") == "update"
    assert "value" not in update3
55
+
56
+
57
def test_switch_phase_keeps_live_obs_visible_and_toggles_interactive(reload_module):
    """Phase switches return 6 component updates toggling interactivity only."""
    callbacks = reload_module("gradio_callbacks")

    # Entering execute phase disables interaction on image and buttons.
    to_exec = callbacks.switch_to_execute_phase("uid-3")
    assert len(to_exec) == 6
    assert to_exec[0].get("interactive") is False
    assert to_exec[4].get("interactive") is False
    assert to_exec[5].get("interactive") is False

    # Returning to action phase re-enables them.
    to_action = callbacks.switch_to_action_phase()
    assert len(to_action) == 6
    assert to_action[0].get("interactive") is True
    assert to_action[4].get("interactive") is True
    assert to_action[5].get("interactive") is True
gradio-web/test/test_option_label_format.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+
5
+
6
+ class _FakeUnwrapped:
7
+ def __init__(self):
8
+ self.segmentation_id_map = {}
9
+
10
+
11
class _FakeEnv:
    """Env double exposing one black 8x8 front frame and no wrist frames."""

    def __init__(self):
        self.unwrapped = _FakeUnwrapped()
        self.frames = [np.zeros((8, 8, 3), dtype=np.uint8)]
        self.wrist_frames = []
16
+
17
+
18
class _FakeObsWrapperEnv:
    """Env double mimicking an obs-wrapper: frames come from _last_obs lists."""

    def __init__(self, front_rgb_list, wrist_rgb_list):
        self.unwrapped = _FakeUnwrapped()
        # Shape matches the obs dict that the wrapper under test reads.
        self._last_obs = {
            "front_rgb_list": front_rgb_list,
            "wrist_rgb_list": wrist_rgb_list,
        }
25
+
26
+
27
+
28
def test_available_options_use_label_plus_action(monkeypatch, reload_module):
    """Option labels are rendered as '<label>. <action>' regardless of availability."""
    oracle_logic = reload_module("oracle_logic")

    monkeypatch.setattr(
        oracle_logic,
        "_fetch_segmentation",
        lambda env: np.zeros((1, 8, 8), dtype=np.int64),
    )
    # Two options: one with an available target, one without.
    monkeypatch.setattr(
        oracle_logic,
        "_build_solve_options",
        lambda env, planner, selected_target, env_id: [
            {"label": "a", "action": "pick up the cube", "available": [1]},
            {"label": "b", "action": "put it down", "available": []},
        ],
    )

    session = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    session.env = _FakeEnv()
    session.planner = object()
    session.env_id = "BinFill"
    session.color_map = {}

    _img, msg = session.update_observation()

    assert msg == "Ready"
    assert session.available_options == [
        ("a. pick up the cube", 0),
        ("b. put it down", 1),
    ]
    assert session.raw_solve_options[0]["label"] == "a"
59
+
60
+
61
def test_update_observation_no_seg_vis_base_fallback(monkeypatch, reload_module):
    """With no frames and segmentation off, get_pil_image falls back to a 255x255 image."""
    oracle_logic = reload_module("oracle_logic")

    # BGR-ordered segmentation visual (B=10, G=20, R=30) — unused in this path.
    seg_vis = np.zeros((6, 6, 3), dtype=np.uint8)
    seg_vis[:, :, 0] = 10  # B
    seg_vis[:, :, 1] = 20  # G
    seg_vis[:, :, 2] = 30  # R

    monkeypatch.setattr(
        oracle_logic,
        "_fetch_segmentation",
        lambda env: np.zeros((1, 6, 6), dtype=np.int64),
    )
    monkeypatch.setattr(
        oracle_logic,
        "_prepare_segmentation_visual",
        lambda seg, color_map, hw: (seg_vis, np.zeros((6, 6), dtype=np.int64)),
    )
    monkeypatch.setattr(
        oracle_logic,
        "_build_solve_options",
        lambda env, planner, selected_target, env_id: [],
    )

    session = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    # Env with empty frame lists, built on the fly via type().
    session.env = type(
        "_NoFrameEnv",
        (),
        {"unwrapped": _FakeUnwrapped(), "frames": [], "wrist_frames": []},
    )()
    session.planner = object()
    session.env_id = "BinFill"
    session.color_map = {}

    _img, msg = session.update_observation(use_segmentation=False)

    assert msg == "Ready"
    assert len(session.base_frames) == 0

    pil_img = session.get_pil_image(use_segmented=False)
    assert pil_img.size == (255, 255)
102
+
103
+
104
def test_update_observation_uses_only_front_rgb_list(monkeypatch, reload_module):
    """Frames must come from the wrapper's ``front_rgb_list``, not ``env.frames``."""
    oracle_logic = reload_module("oracle_logic")

    monkeypatch.setattr(
        oracle_logic, "_fetch_segmentation", lambda env: np.zeros((1, 8, 8), dtype=np.int64)
    )
    monkeypatch.setattr(
        oracle_logic, "_build_solve_options", lambda env, planner, selected_target, env_id: []
    )

    first = np.full((8, 8, 3), 11, dtype=np.uint8)
    second = np.full((8, 8, 3), 22, dtype=np.uint8)

    sess = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    sess.env = _FakeObsWrapperEnv(front_rgb_list=[first, second], wrist_rgb_list=[])
    sess.planner = object()
    sess.env_id = "BinFill"
    sess.color_map = {}

    _img, msg = sess.update_observation(use_segmentation=False)

    assert msg == "Ready"
    assert len(sess.base_frames) == 2
    assert len(sess.wrist_frames) == 0
    # The most recent frame is the last entry of front_rgb_list.
    assert sess.base_frames[-1][0, 0, 0] == 22
133
+
134
+
135
def test_update_observation_does_not_duplicate_same_last_obs(monkeypatch, reload_module):
    """Re-reading an unchanged ``_last_obs`` must not append duplicate frames."""
    oracle_logic = reload_module("oracle_logic")

    monkeypatch.setattr(
        oracle_logic, "_fetch_segmentation", lambda env: np.zeros((1, 8, 8), dtype=np.int64)
    )
    monkeypatch.setattr(
        oracle_logic, "_build_solve_options", lambda env, planner, selected_target, env_id: []
    )

    env = _FakeObsWrapperEnv(
        front_rgb_list=[
            np.full((8, 8, 3), 10, dtype=np.uint8),
            np.full((8, 8, 3), 20, dtype=np.uint8),
        ],
        wrist_rgb_list=[],
    )

    sess = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    sess.env = env
    sess.planner = object()
    sess.env_id = "BinFill"
    sess.color_map = {}

    # Two refreshes over the same observation payload -> still only two frames.
    sess.update_observation(use_segmentation=False)
    sess.update_observation(use_segmentation=False)
    assert len(sess.base_frames) == 2

    # A genuinely new observation appends exactly one more frame.
    env._last_obs = {
        "front_rgb_list": [np.full((8, 8, 3), 30, dtype=np.uint8)],
        "wrist_rgb_list": [],
    }
    sess.update_observation(use_segmentation=False)
    assert len(sess.base_frames) == 3
    assert sess.base_frames[-1][0, 0, 0] == 30
168
+
169
+
170
def test_update_observation_does_not_fallback_to_env_frames(monkeypatch, reload_module):
    """``env.frames`` is never used as a frame source when no wrapper obs exists."""
    oracle_logic = reload_module("oracle_logic")

    monkeypatch.setattr(
        oracle_logic, "_fetch_segmentation", lambda env: np.zeros((1, 8, 8), dtype=np.int64)
    )
    monkeypatch.setattr(
        oracle_logic, "_build_solve_options", lambda env, planner, selected_target, env_id: []
    )

    env = _FakeEnv()
    env.frames = [np.full((8, 8, 3), 99, dtype=np.uint8)]

    sess = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    sess.env = env
    sess.planner = object()
    sess.env_id = "BinFill"
    sess.color_map = {}

    _img, msg = sess.update_observation(use_segmentation=False)

    assert msg == "Ready"
    # env.frames content (value 99) must not leak into the session frame buffer.
    assert sess.base_frames == []
gradio-web/test/test_oracle_builder_integration.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class _DummyPlanner:
7
+ def __init__(self, *args, **kwargs):
8
+ self.args = args
9
+ self.kwargs = kwargs
10
+
11
+
12
+ class _FakeRobot:
13
+ def __init__(self):
14
+ self.pose = object()
15
+
16
+
17
+ class _FakeAgent:
18
+ def __init__(self):
19
+ self.robot = _FakeRobot()
20
+
21
+
22
+ class _FakeUnwrapped:
23
+ def __init__(self):
24
+ self.agent = _FakeAgent()
25
+ self.segmentation_id_map = {}
26
+
27
+ def evaluate(self, solve_complete_eval=False):
28
+ return {"success": False, "fail": False}
29
+
30
+
31
+ class _FakeEnv:
32
+ def __init__(self):
33
+ self.unwrapped = _FakeUnwrapped()
34
+ self.demonstration_data = {"language goal": "test goal", "frames": ["f1", "f2"]}
35
+ self.non_demonstration_task_length = 7
36
+ self.frames = []
37
+ self.wrist_frames = []
38
+ self.closed = False
39
+
40
+ def reset(self):
41
+ return None
42
+
43
+ def close(self):
44
+ self.closed = True
45
+
46
+
47
+ class _FakeEnvTupleDemo(_FakeEnv):
48
+ def __init__(self):
49
+ super().__init__()
50
+ self.demonstration_data = (
51
+ {"front_rgb_list": ["tuple_f1", "tuple_f2"]},
52
+ None,
53
+ None,
54
+ None,
55
+ {"task_goal": ["tuple goal", "backup goal"]},
56
+ )
57
+
58
+
59
+ class _BuilderSuccess:
60
+ last_init_kwargs = None
61
+
62
+ def __init__(self, **kwargs):
63
+ type(self).last_init_kwargs = kwargs
64
+
65
+ def get_episode_num(self):
66
+ return 3
67
+
68
+ def resolve_episode(self, episode_idx):
69
+ return 123, "hard"
70
+
71
+ def make_env_for_episode(self, episode_idx):
72
+ return _FakeEnv()
73
+
74
+
75
+ class _BuilderTupleDemo(_BuilderSuccess):
76
+ def make_env_for_episode(self, episode_idx):
77
+ return _FakeEnvTupleDemo()
78
+
79
+
80
+ class _BuilderNoMetadata:
81
+ def __init__(self, **kwargs):
82
+ self.kwargs = kwargs
83
+
84
+ def get_episode_num(self):
85
+ return 0
86
+
87
+
88
+ class _BuilderRaiseOnMake:
89
+ def __init__(self, **kwargs):
90
+ self.kwargs = kwargs
91
+
92
+ def get_episode_num(self):
93
+ return 1
94
+
95
+ def resolve_episode(self, episode_idx):
96
+ return None, None
97
+
98
+ def make_env_for_episode(self, episode_idx):
99
+ raise RuntimeError("boom")
100
+
101
+
102
def test_load_episode_uses_benchmark_builder(monkeypatch, reload_module):
    """``load_episode`` wires env/planner through the builder and records metadata."""
    oracle_logic = reload_module("oracle_logic")

    monkeypatch.setenv("ROBOMME_METADATA_ROOT", "/tmp/meta-root")
    monkeypatch.setattr(oracle_logic, "BenchmarkEnvBuilder", _BuilderSuccess)
    monkeypatch.setattr(oracle_logic, "FailAwarePandaArmMotionPlanningSolver", _DummyPlanner)
    monkeypatch.setattr(oracle_logic, "FailAwarePandaStickMotionPlanningSolver", _DummyPlanner)
    monkeypatch.setattr(
        oracle_logic.OracleSession, "update_observation", lambda self: ("IMG", "Ready")
    )

    sess = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    img, msg = sess.load_episode("BinFill", 1)

    assert (img, msg) == ("IMG", "Ready")
    assert sess.env_id == "BinFill"
    assert sess.episode_idx == 1
    assert sess.seed == 123
    assert sess.difficulty == "hard"
    assert sess.language_goal == "test goal"
    assert sess.demonstration_frames == ["f1", "f2"]

    # The builder must be constructed with the canonical benchmark kwargs.
    init_kwargs = _BuilderSuccess.last_init_kwargs
    assert init_kwargs["dataset"] == "train"
    assert init_kwargs["action_space"] == "joint_angle"
    assert init_kwargs["gui_render"] is False
    assert init_kwargs["max_steps"] == 3000
    assert init_kwargs["override_metadata_path"] == Path("/tmp/meta-root")
129
+
130
+
131
def test_load_episode_metadata_missing_returns_stable_error(monkeypatch, reload_module):
    """A builder reporting zero episodes yields a stable metadata error message."""
    oracle_logic = reload_module("oracle_logic")

    monkeypatch.setenv("ROBOMME_METADATA_ROOT", "/tmp/custom-metadata")
    monkeypatch.setattr(oracle_logic, "BenchmarkEnvBuilder", _BuilderNoMetadata)

    sess = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    img, msg = sess.load_episode("RouteStick", 0)

    assert img is None
    for fragment in (
        "Dataset metadata not found or empty",
        "record_dataset_RouteStick_metadata.json",
    ):
        assert fragment in msg
143
+
144
+
145
def test_load_episode_out_of_range_returns_stable_error(monkeypatch, reload_module):
    """An episode index beyond the builder's count yields a range error message."""
    oracle_logic = reload_module("oracle_logic")
    monkeypatch.setattr(oracle_logic, "BenchmarkEnvBuilder", _BuilderSuccess)

    sess = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    img, msg = sess.load_episode("BinFill", 99)

    assert img is None
    assert "Episode index out of range" in msg
    # The builder reports 3 episodes, so the valid range is 0-2.
    assert "valid 0-2" in msg
156
+
157
+
158
def test_load_episode_init_failure_is_caught(monkeypatch, reload_module):
    """Exceptions from env construction are converted into an error message."""
    oracle_logic = reload_module("oracle_logic")
    monkeypatch.setattr(oracle_logic, "BenchmarkEnvBuilder", _BuilderRaiseOnMake)

    sess = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    img, msg = sess.load_episode("BinFill", 0)

    assert img is None
    assert msg.startswith("Error initializing episode:")
168
+
169
+
170
def test_load_episode_supports_tuple_demonstration_data(monkeypatch, reload_module):
    """Tuple-shaped demonstration data still yields goal text and front frames."""
    oracle_logic = reload_module("oracle_logic")

    monkeypatch.setattr(oracle_logic, "BenchmarkEnvBuilder", _BuilderTupleDemo)
    monkeypatch.setattr(oracle_logic, "FailAwarePandaArmMotionPlanningSolver", _DummyPlanner)
    monkeypatch.setattr(oracle_logic, "FailAwarePandaStickMotionPlanningSolver", _DummyPlanner)
    monkeypatch.setattr(
        oracle_logic.OracleSession, "update_observation", lambda self: ("IMG", "Ready")
    )

    sess = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    img, msg = sess.load_episode("BinFill", 0)

    assert (img, msg) == ("IMG", "Ready")
    # First task_goal entry and the tuple's front_rgb_list are extracted.
    assert sess.language_goal == "tuple goal"
    assert sess.demonstration_frames == ["tuple_f1", "tuple_f2"]
gradio-web/test/test_oracle_imports.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+
6
def test_oracle_logic_imports_without_historybench(reload_module):
    """``oracle_logic`` must import cleanly and contain no historybench references."""
    oracle_logic = reload_module("oracle_logic")
    assert oracle_logic is not None

    source = Path(oracle_logic.__file__).resolve().read_text(encoding="utf-8")
    assert "historybench" not in source
13
+
14
+
15
def test_oracle_logic_exports_builder_and_vqa(reload_module):
    """The module must re-export the builder class and the VQA option helper."""
    oracle_logic = reload_module("oracle_logic")
    for exported in ("BenchmarkEnvBuilder", "get_vqa_options"):
        assert hasattr(oracle_logic, exported)
gradio-web/test/test_precheck_execute_inputs.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import pytest
4
+
5
+
6
+ class _FakeSession:
7
+ def __init__(self, available=True):
8
+ self.raw_solve_options = [{"available": available}]
9
+
10
+
11
def test_precheck_execute_inputs_requires_action(monkeypatch, reload_module):
    """Executing with no selected action must raise a user-facing error."""
    callbacks = reload_module("gradio_callbacks")
    monkeypatch.setattr(callbacks, "get_session", lambda uid: _FakeSession(available=False))
    monkeypatch.setattr(callbacks, "update_session_activity", lambda uid: None)

    with pytest.raises(Exception) as excinfo:
        callbacks.precheck_execute_inputs("uid-1", None, "No need for coordinates")
    assert "No action selected" in str(excinfo.value)
20
+
21
+
22
def test_precheck_execute_inputs_requires_coords_when_option_needs_it(monkeypatch, reload_module):
    """Placeholder coordinates are rejected when the option requires a click."""
    callbacks = reload_module("gradio_callbacks")
    monkeypatch.setattr(callbacks, "get_session", lambda uid: _FakeSession(available=True))
    monkeypatch.setattr(callbacks, "update_session_activity", lambda uid: None)

    with pytest.raises(Exception) as excinfo:
        callbacks.precheck_execute_inputs(
            "uid-1", 0, "please click the keypoint selection image"
        )
    assert "before execute" in str(excinfo.value)
33
+
34
+
35
def test_precheck_execute_inputs_accepts_valid_coords(monkeypatch, reload_module):
    """A well-formed coordinate string passes the precheck silently."""
    callbacks = reload_module("gradio_callbacks")
    monkeypatch.setattr(callbacks, "get_session", lambda uid: _FakeSession(available=True))
    monkeypatch.setattr(callbacks, "update_session_activity", lambda uid: None)

    assert callbacks.precheck_execute_inputs("uid-1", 0, "11, 22") is None
43
+
44
+
45
def test_precheck_execute_inputs_session_error(monkeypatch, reload_module):
    """A missing session surfaces as a 'Session Error' exception."""
    callbacks = reload_module("gradio_callbacks")
    monkeypatch.setattr(callbacks, "get_session", lambda uid: None)
    monkeypatch.setattr(callbacks, "update_session_activity", lambda uid: None)

    with pytest.raises(Exception) as excinfo:
        callbacks.precheck_execute_inputs("uid-missing", 0, "1, 2")
    assert "Session Error" in str(excinfo.value)
gradio-web/test/test_process_session_sanitize.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+
4
def test_sanitize_options_removes_solve_and_boolifies_available(reload_module):
    """``_sanitize_options`` drops callables and coerces availability to bool."""
    process_session = reload_module("process_session")

    raw = [
        {
            "label": "a",
            "action": "pick",
            "available": ["obj1"],
            "solve": lambda: None,
            "extra": 123,
        },
        {
            "label": "b",
            "action": "place",
            "available": [],
            "solve": lambda: None,
        },
    ]

    cleaned = process_session._sanitize_options(raw)

    assert len(cleaned) == 2
    # The non-serializable solve callable is stripped from every option.
    assert all("solve" not in option for option in cleaned)
    assert cleaned[0]["available"] is True
    assert cleaned[1]["available"] is False
    assert cleaned[0]["label"] == "a"
    assert cleaned[0]["action"] == "pick"
    assert cleaned[0]["extra"] == 123
33
+
34
+
35
def test_sanitize_options_handles_empty_input(reload_module):
    """Both ``None`` and ``[]`` sanitize to an empty list."""
    process_session = reload_module("process_session")
    for empty in (None, []):
        assert process_session._sanitize_options(empty) == []
gradio-web/test/test_reference_action_callbacks.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from PIL import Image
3
+
4
+
5
class _FakeSession:
    """Session stub returning a canned reference payload and a black canvas."""

    def __init__(self, reference_payload):
        self._reference_payload = reference_payload

    def get_reference_action(self):
        return self._reference_payload

    def get_pil_image(self, use_segmented=True):
        # A uniformly black 24x24 RGB image so marker pixels are detectable.
        return Image.new("RGB", (24, 24), color=(0, 0, 0))


class _FakeOptionSession:
    """Session stub with one coords-needing option for the option-select callback."""

    def __init__(self):
        self.raw_solve_options = [{"available": [object()]}]
        self.available_options = [("pick", 0)]
20
+
21
+
22
def test_on_reference_action_success_updates_option_and_coords(monkeypatch, reload_module):
    """A successful reference action marks the image and pre-fills the UI fields."""
    callbacks = reload_module("gradio_callbacks")

    payload = {
        "ok": True,
        "option_idx": 2,
        "option_label": "c",
        "option_action": "press the button",
        "need_coords": True,
        "coords_xy": [5, 6],
        "message": "ok",
    }
    sess = _FakeSession(payload)
    monkeypatch.setattr(callbacks, "update_session_activity", lambda uid: None)
    monkeypatch.setattr(callbacks, "get_session", lambda uid: sess)

    img, option_update, coords_text, log_html = callbacks.on_reference_action("uid-1")

    assert isinstance(img, Image.Image)
    # The keypoint marker must be drawn onto the (originally black) canvas.
    assert img.getpixel((5, 6)) != (0, 0, 0)
    assert option_update.get("value") == 2
    assert coords_text == "5, 6"
    assert "Ground Truth Action" in log_html
47
+
48
+
49
def test_on_reference_action_session_missing(monkeypatch, reload_module):
    """With no session the callback degrades to an error log and neutral outputs."""
    callbacks = reload_module("gradio_callbacks")

    monkeypatch.setattr(callbacks, "update_session_activity", lambda uid: None)
    monkeypatch.setattr(callbacks, "get_session", lambda uid: None)

    img, option_update, coords_text, log_html = callbacks.on_reference_action("uid-missing")

    assert img is None
    assert option_update.get("__type__") == "update"
    assert coords_text == "No need for coordinates"
    assert "Session Error" in log_html
61
+
62
+
63
def test_on_reference_action_error_message_from_reference(monkeypatch, reload_module):
    """A failed reference payload propagates its message into the log output."""
    callbacks = reload_module("gradio_callbacks")

    sess = _FakeSession({"ok": False, "message": "bad ref"})
    monkeypatch.setattr(callbacks, "update_session_activity", lambda uid: None)
    monkeypatch.setattr(callbacks, "get_session", lambda uid: sess)

    _img, _opt, _coords, log_html = callbacks.on_reference_action("uid-1")
    assert "bad ref" in log_html
72
+
73
+
74
def test_on_option_select_keeps_valid_coords_when_option_needs_coords(monkeypatch, reload_module):
    """Selecting a coords-needing option preserves an already-valid coordinate."""
    callbacks = reload_module("gradio_callbacks")

    sess = _FakeOptionSession()
    monkeypatch.setattr(callbacks, "update_session_activity", lambda uid: None)
    monkeypatch.setattr(callbacks, "get_session", lambda uid: sess)

    coords_text, img_update = callbacks.on_option_select("uid-1", 0, "12, 34")

    assert coords_text == "12, 34"
    assert img_update.get("interactive") is True
gradio-web/test/test_reference_action_oracle.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+
5
+
6
+ class _FakePose:
7
+ def __init__(self, p):
8
+ self.p = np.asarray(p, dtype=np.float64)
9
+
10
+
11
+ class _FakeActor:
12
+ def __init__(self, name: str, p):
13
+ self.name = name
14
+ self.pose = _FakePose(p)
15
+
16
+
17
+ class _FakeUnwrapped:
18
+ def __init__(self, choice_label: str, current_segment=None, seg_map=None):
19
+ self.current_choice_label = choice_label
20
+ self.current_segment = current_segment
21
+ self.segmentation_id_map = seg_map or {}
22
+
23
+ def get_obs(self, unflattened=True):
24
+ raise RuntimeError("not needed for centroid path")
25
+
26
+
27
+ class _FakeEnv:
28
+ def __init__(self, unwrapped):
29
+ self.unwrapped = unwrapped
30
+
31
+
32
def test_get_reference_action_maps_choice_and_returns_centroid_coords(monkeypatch, reload_module):
    """The GT choice maps to an option and yields the segment centroid (x, y)."""
    oracle_logic = reload_module("oracle_logic")

    cube = _FakeActor("cube", [0.1, 0.2, 0.3])
    env = _FakeEnv(
        _FakeUnwrapped(
            choice_label="pick up the cube",
            current_segment=cube,
            seg_map={7: cube},
        )
    )

    monkeypatch.setattr(
        oracle_logic,
        "_build_solve_options",
        lambda env, planner, selected_target, env_id: [
            {"label": "a", "action": "pick up the cube", "available": [cube]}
        ],
    )

    sess = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    sess.env = env
    sess.planner = object()
    sess.env_id = "BinFill"
    # Paint segment id 7 over rows 2-4, cols 6-8 -> centroid at (x=7, y=3).
    sess.seg_raw = np.zeros((10, 10), dtype=np.int64)
    sess.seg_raw[2:5, 6:9] = 7

    result = sess.get_reference_action()

    assert result["ok"] is True
    assert result["option_idx"] == 0
    assert result["option_label"] == "a"
    assert result["need_coords"] is True
    assert result["coords_xy"] == [7, 3]
65
+
66
+
67
def test_get_reference_action_for_non_parameter_option(monkeypatch, reload_module):
    """Options without a target parameter report ``need_coords=False``."""
    oracle_logic = reload_module("oracle_logic")

    env = _FakeEnv(_FakeUnwrapped(choice_label="press the button"))

    monkeypatch.setattr(
        oracle_logic,
        "_build_solve_options",
        lambda env, planner, selected_target, env_id: [
            {"label": "c", "action": "press the button"}
        ],
    )

    sess = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    sess.env = env
    sess.planner = object()
    sess.env_id = "ButtonUnmask"

    result = sess.get_reference_action()

    assert result["ok"] is True
    assert result["option_idx"] == 0
    assert result["need_coords"] is False
    assert result["coords_xy"] is None
92
+
93
+
94
def test_get_reference_action_when_choice_text_cannot_match(monkeypatch, reload_module):
    """An unmatched ground-truth label produces a failure payload."""
    oracle_logic = reload_module("oracle_logic")

    env = _FakeEnv(_FakeUnwrapped(choice_label="unknown action"))

    monkeypatch.setattr(
        oracle_logic,
        "_build_solve_options",
        lambda env, planner, selected_target, env_id: [
            {"label": "a", "action": "pick up the cube", "available": []}
        ],
    )

    sess = oracle_logic.OracleSession(dataset_root=None, gui_render=False)
    sess.env = env
    sess.planner = object()
    sess.env_id = "BinFill"

    result = sess.get_reference_action()

    assert result["ok"] is False
    assert result["option_idx"] is None
    assert "Cannot map ground truth action" in result["message"]
gradio-web/test/test_ui_native_layout_contract.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+
4
def test_native_ui_has_no_legacy_runtime_js_or_card_shell_tokens(reload_module):
    """The native layout ships no sync JS and none of the legacy card-shell CSS."""
    ui_layout = reload_module("ui_layout")

    assert ui_layout.SYNC_JS.strip() == ""

    css = ui_layout.CSS
    assert ".native-card" in css

    # Tokens from the removed floating-card/JS-shell implementation.
    for token in (
        "card-shell-hit",
        "card-shell-button",
        "floating-card",
        "applyCardShellOnce",
        "media_card_anchor",
        "action_selection_card_anchor",
        "next_task_btn_card_anchor",
        "MutationObserver",
    ):
        assert token not in css
24
+
25
+
26
def test_native_ui_config_contains_phase_machine_and_precheck_chain(reload_module):
    """The generated Gradio config exposes the phase-machine ids and API chain."""
    ui_layout = reload_module("ui_layout")
    demo = ui_layout.create_ui_blocks()

    try:
        cfg = demo.get_config_file()
        components = cfg.get("components", [])

        elem_ids = set()
        for comp in components:
            elem_id = comp.get("props", {}).get("elem_id")
            if elem_id:
                elem_ids.add(elem_id)

        required_ids = {
            "header_task",
            "loading_overlay_group",
            "main_layout_row",
            "media_card",
            "log_card",
            "right_top_row",
            "right_action_col",
            "right_log_col",
            "control_panel_group",
            "video_phase_group",
            "action_phase_group",
            "demo_video",
            "live_obs",
            "action_radio",
            "coords_box",
            "exec_btn",
            "reference_action_btn",
            "restart_episode_btn",
            "next_task_btn",
        }
        missing = required_ids - elem_ids
        assert not missing, f"missing required elem_ids: {sorted(missing)}"

        values = [
            comp.get("props", {}).get("value")
            for comp in components
            if "value" in comp.get("props", {})
        ]
        # No legacy anchor placeholders and only the new loading copy.
        assert all("_anchor" not in str(v) for v in values)
        assert any(
            "Logging in and setting up environment... Please wait." in str(v)
            for v in values
        )
        assert all("Loading environment, please wait..." not in str(v) for v in values)

        log_output_comp = next(
            comp
            for comp in components
            if comp.get("props", {}).get("elem_id") == "log_output"
        )
        assert log_output_comp.get("props", {}).get("max_lines") is None

        api_names = [dep.get("api_name") for dep in cfg.get("dependencies", [])]
        for api_name in (
            "precheck_execute_inputs",
            "switch_to_execute_phase",
            "execute_step",
            "switch_to_action_phase",
        ):
            assert api_name in api_names
    finally:
        demo.close()
gradio-web/test/test_ui_phase_machine_runtime_e2e.py ADDED
@@ -0,0 +1,782 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import importlib
5
+ import socket
6
+ import threading
7
+ import time
8
+ from urllib.error import URLError
9
+ from urllib.request import urlopen
10
+
11
+ import numpy as np
12
+ import pytest
13
+ from PIL import Image
14
+
15
+
16
+ gr = pytest.importorskip("gradio")
17
+ pytest.importorskip("fastapi")
18
+ pytest.importorskip("uvicorn")
19
+ pytest.importorskip("playwright.sync_api")
20
+
21
+ import uvicorn
22
+ from fastapi import FastAPI
23
+ from playwright.sync_api import sync_playwright
24
+
25
+
26
+ def _free_port() -> int:
27
+ with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
28
+ sock.bind(("127.0.0.1", 0))
29
+ return int(sock.getsockname()[1])
30
+
31
+
32
+ def _wait_http_ready(url: str, timeout_s: float = 20.0) -> None:
33
+ end = time.time() + timeout_s
34
+ while time.time() < end:
35
+ try:
36
+ with urlopen(url, timeout=1.0) as resp: # noqa: S310 - local test URL only
37
+ if int(getattr(resp, "status", 200)) < 500:
38
+ return
39
+ except URLError:
40
+ time.sleep(0.2)
41
+ except Exception:
42
+ time.sleep(0.2)
43
+ raise RuntimeError(f"Server did not become ready: {url}")
44
+
45
+
46
def _read_header_task_value(page) -> str | None:
    """Read the current header-task text from the live page, or ``None`` if unset."""
    script = """() => {
        const root = document.getElementById('header_task');
        if (!root) return null;
        const input = root.querySelector('input');
        if (input && typeof input.value === 'string') {
            const value = input.value.trim();
            return value || null;
        }
        const selected = root.querySelector('.single-select');
        if (!selected) return null;
        const text = (selected.textContent || '').trim();
        return text || null;
    }"""
    return page.evaluate(script)
62
+
63
+
64
+ @pytest.fixture
65
+ def phase_machine_ui_url():
66
+ state = {"precheck_calls": 0}
67
+ demo_video_url = "https://interactive-examples.mdn.mozilla.net/media/cc0-videos/flower.mp4"
68
+
69
+ with gr.Blocks(title="Native phase machine test") as demo:
70
+ phase_state = gr.State("init")
71
+
72
+ with gr.Column(visible=True, elem_id="login_group") as login_group:
73
+ login_btn = gr.Button("Login", elem_id="login_btn")
74
+
75
+ with gr.Column(visible=False, elem_id="main_interface") as main_interface:
76
+ with gr.Column(visible=False, elem_id="video_phase_group") as video_phase_group:
77
+ video_display = gr.Video(value=None, elem_id="demo_video", autoplay=True)
78
+
79
+ with gr.Column(visible=False, elem_id="action_phase_group") as action_phase_group:
80
+ img_display = gr.Image(value=np.zeros((24, 24, 3), dtype=np.uint8), elem_id="live_obs")
81
+
82
+ with gr.Column(visible=False, elem_id="control_panel_group") as control_panel_group:
83
+ options_radio = gr.Radio(choices=[("pick", 0)], value=0, elem_id="action_radio")
84
+ coords_box = gr.Textbox(value="please click the keypoint selection image", elem_id="coords_box")
85
+ with gr.Column(visible=False, elem_id="action_buttons_row") as action_buttons_row:
86
+ exec_btn = gr.Button("EXECUTE", elem_id="exec_btn")
87
+ next_task_btn = gr.Button("Next Task", elem_id="next_task_btn")
88
+
89
+ log_output = gr.Markdown("", elem_id="log_output")
90
+
91
+ def login_fn():
92
+ return (
93
+ gr.update(visible=False),
94
+ gr.update(visible=True),
95
+ gr.update(visible=True),
96
+ gr.update(value=demo_video_url, visible=True),
97
+ gr.update(visible=False),
98
+ gr.update(visible=False),
99
+ gr.update(visible=False),
100
+ gr.update(value="please click the keypoint selection image"),
101
+ "demo_video",
102
+ )
103
+
104
+ def on_video_end_fn():
105
+ return (
106
+ gr.update(visible=False),
107
+ gr.update(visible=True),
108
+ gr.update(visible=True),
109
+ gr.update(visible=True),
110
+ "action_keypoint",
111
+ )
112
+
113
+ def precheck_fn(_option_idx, _coords):
114
+ state["precheck_calls"] += 1
115
+ if state["precheck_calls"] == 1:
116
+ raise gr.Error("please click the keypoint selection image before execute!")
117
+
118
+ def to_execute_fn():
119
+ return (
120
+ gr.update(interactive=False),
121
+ gr.update(interactive=False),
122
+ gr.update(interactive=False),
123
+ gr.update(interactive=False),
124
+ "execution_playback",
125
+ )
126
+
127
+ def execute_fn():
128
+ time.sleep(0.8)
129
+ return (
130
+ "executed",
131
+ gr.update(interactive=True),
132
+ gr.update(interactive=True),
133
+ )
134
+
135
+ def to_action_fn():
136
+ return (
137
+ gr.update(interactive=True),
138
+ gr.update(interactive=True),
139
+ gr.update(interactive=True),
140
+ gr.update(interactive=True),
141
+ "action_keypoint",
142
+ )
143
+
144
+ login_btn.click(
145
+ fn=login_fn,
146
+ outputs=[
147
+ login_group,
148
+ main_interface,
149
+ video_phase_group,
150
+ video_display,
151
+ action_phase_group,
152
+ control_panel_group,
153
+ action_buttons_row,
154
+ coords_box,
155
+ phase_state,
156
+ ],
157
+ queue=False,
158
+ )
159
+
160
+ video_display.end(
161
+ fn=on_video_end_fn,
162
+ outputs=[video_phase_group, action_phase_group, control_panel_group, action_buttons_row, phase_state],
163
+ queue=False,
164
+ )
165
+
166
+ exec_btn.click(
167
+ fn=precheck_fn,
168
+ inputs=[options_radio, coords_box],
169
+ outputs=[],
170
+ queue=False,
171
+ ).then(
172
+ fn=to_execute_fn,
173
+ outputs=[
174
+ options_radio,
175
+ exec_btn,
176
+ next_task_btn,
177
+ img_display,
178
+ phase_state,
179
+ ],
180
+ queue=False,
181
+ ).then(
182
+ fn=execute_fn,
183
+ outputs=[log_output, next_task_btn, exec_btn],
184
+ queue=False,
185
+ ).then(
186
+ fn=to_action_fn,
187
+ outputs=[options_radio, exec_btn, next_task_btn, img_display, phase_state],
188
+ queue=False,
189
+ )
190
+
191
+ port = _free_port()
192
+ host = "127.0.0.1"
193
+ root_url = f"http://{host}:{port}/"
194
+
195
+ app = FastAPI(title="native-phase-machine-test")
196
+ app = gr.mount_gradio_app(app, demo, path="/")
197
+
198
+ config = uvicorn.Config(app, host=host, port=port, log_level="error")
199
+ server = uvicorn.Server(config)
200
+ thread = threading.Thread(target=server.run, daemon=True)
201
+ thread.start()
202
+ _wait_http_ready(root_url)
203
+
204
+ try:
205
+ yield root_url, state
206
+ finally:
207
+ server.should_exit = True
208
+ thread.join(timeout=10)
209
+ demo.close()
210
+
211
+
212
def test_phase_machine_runtime_flow_and_execute_precheck(phase_machine_ui_url):
    """Browser-level walk of the phase machine wired in the fixture.

    Sequence under test:
      1. login -> only the demo video group is visible;
      2. dispatching 'ended' on the video -> action/control panels appear;
      3. first EXECUTE click -> precheck raises gr.Error, UI left unchanged;
      4. second EXECUTE click -> buttons disabled while execute_fn sleeps,
         then re-enabled once the .then chain completes.
    """
    root_url, state = phase_machine_ui_url

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page(viewport={"width": 1280, "height": 900})
        page.goto(root_url, wait_until="domcontentloaded")

        # Give the Gradio frontend time to hydrate before interacting.
        page.wait_for_timeout(2500)
        page.wait_for_selector("#login_btn", timeout=20000)
        page.click("#login_btn")

        page.wait_for_function(
            """() => {
            const el = document.getElementById('demo_video');
            return !!el && getComputedStyle(el).display !== 'none';
        }"""
        )

        # Snapshot visibility right after login: video on, action/control off.
        phase_after_login = page.evaluate(
            """() => {
            const visible = (id) => {
                const el = document.getElementById(id);
                if (!el) return false;
                const st = getComputedStyle(el);
                return st.display !== 'none' && st.visibility !== 'hidden' && el.getClientRects().length > 0;
            };
            return {
                video: visible('demo_video'),
                action: visible('live_obs'),
                control: visible('action_radio'),
            };
        }"""
        )
        assert phase_after_login == {
            "video": True,
            "action": False,
            "control": False,
        }

        # Simulate the demo video finishing via a bubbling 'ended' event.
        page.wait_for_selector("#demo_video video", timeout=5000)
        did_dispatch_end = page.evaluate(
            """() => {
            const videoEl = document.querySelector('#demo_video video');
            if (!videoEl) return false;
            videoEl.dispatchEvent(new Event('ended', { bubbles: true }));
            return true;
        }"""
        )
        assert did_dispatch_end

        # The 'ended' event must reveal both the action and control panels.
        page.wait_for_function(
            """() => {
            const action = document.getElementById('live_obs');
            const control = document.getElementById('action_radio');
            if (!action || !control) return false;
            return getComputedStyle(action).display !== 'none' && getComputedStyle(control).display !== 'none';
        }"""
        )

        # First EXECUTE click: precheck_fn raises gr.Error on call #1.
        did_click_exec = page.evaluate(
            """() => {
            const btn = document.getElementById('exec_btn');
            if (!btn) return false;
            btn.click();
            return true;
        }"""
        )
        assert did_click_exec
        page.wait_for_timeout(300)

        # The failed precheck aborts the chain: action panel stays visible.
        phase_after_failed_precheck = page.evaluate(
            """() => {
            const visible = (id) => {
                const el = document.getElementById(id);
                if (!el) return false;
                return getComputedStyle(el).display !== 'none';
            };
            return {
                action: visible('live_obs'),
            };
        }"""
        )
        assert phase_after_failed_precheck == {"action": True}

        # Second EXECUTE click: precheck passes and the chain runs.
        did_click_exec = page.evaluate(
            """() => {
            const btn = document.getElementById('exec_btn');
            if (!btn) return false;
            btn.click();
            return true;
        }"""
        )
        assert did_click_exec

        # While execute_fn sleeps, both buttons must be disabled.
        page.wait_for_function(
            """() => {
            const resolveButton = (id) => {
                return document.querySelector(`#${id} button`) || document.querySelector(`button#${id}`);
            };
            const execBtn = resolveButton('exec_btn');
            const nextBtn = resolveButton('next_task_btn');
            return !!execBtn && !!nextBtn && execBtn.disabled === true && nextBtn.disabled === true;
        }"""
        )

        interactive_snapshot = page.evaluate(
            """() => {
            const resolveButton = (id) => {
                return document.querySelector(`#${id} button`) || document.querySelector(`button#${id}`);
            };
            const execBtn = resolveButton('exec_btn');
            const nextBtn = resolveButton('next_task_btn');
            return {
                execDisabled: execBtn ? execBtn.disabled : null,
                nextDisabled: nextBtn ? nextBtn.disabled : null,
            };
        }"""
        )
        assert interactive_snapshot["execDisabled"] is True
        assert interactive_snapshot["nextDisabled"] is True

        # After to_action_fn the controls come back (6s timeout > 0.8s sleep).
        page.wait_for_function(
            """() => {
            const execBtn = document.querySelector('button#exec_btn') || document.querySelector('#exec_btn button');
            const action = document.getElementById('live_obs');
            if (!execBtn || !action) return false;
            return execBtn.disabled === false && getComputedStyle(action).display !== 'none';
        }""",
            timeout=6000,
        )

        final_interactive_snapshot = page.evaluate(
            """() => {
            const resolveButton = (id) => {
                return document.querySelector(`#${id} button`) || document.querySelector(`button#${id}`);
            };
            const execBtn = resolveButton('exec_btn');
            const nextBtn = resolveButton('next_task_btn');
            return {
                execDisabled: execBtn ? execBtn.disabled : null,
                nextDisabled: nextBtn ? nextBtn.disabled : null,
            };
        }"""
        )
        assert final_interactive_snapshot["execDisabled"] is False
        assert final_interactive_snapshot["nextDisabled"] is False

        browser.close()

    # precheck_fn must have been hit by both clicks (fail + pass).
    assert state["precheck_calls"] >= 2
363
+
364
+
365
def test_unified_loading_overlay_init_flow(monkeypatch):
    """The loading overlay shows the canonical copy (not the legacy one) and
    hides once init_app finishes, after which the header task is populated."""
    ui_layout = importlib.reload(importlib.import_module("ui_layout"))

    canonical_copy = "Logging in and setting up environment... Please wait."
    legacy_copy = "Loading environment, please wait..."
    fake_obs = np.zeros((24, 24, 3), dtype=np.uint8)
    fake_obs_img = Image.fromarray(fake_obs)
    calls = {"init": 0}

    def fake_show_loading_info():
        return gr.update(visible=True)

    def fake_init_app(_request=None):
        # Slow init on purpose so the overlay is observable before it hides.
        calls["init"] += 1
        time.sleep(0.8)
        return (
            "uid-init",
            gr.update(visible=True),  # main_interface
            gr.update(value=fake_obs_img, interactive=False),  # img_display
            "ready",  # log_output
            gr.update(choices=[("pick", 0)], value=None),  # options_radio
            "goal",  # goal_box
            "No need for coordinates",  # coords_box
            gr.update(value=None, visible=False),  # video_display
            "PickXtimes (Episode 1)",  # task_info_box
            "Completed: 0",  # progress_info_box
            gr.update(interactive=True),  # restart_episode_btn
            gr.update(interactive=True),  # next_task_btn
            gr.update(interactive=True),  # exec_btn
            gr.update(visible=False),  # video_phase_group
            gr.update(visible=True),  # action_phase_group
            gr.update(visible=True),  # control_panel_group
            gr.update(value="hint"),  # task_hint_display
            gr.update(visible=False),  # loading_overlay
            gr.update(interactive=True),  # reference_action_btn
        )

    monkeypatch.setattr(ui_layout, "show_loading_info", fake_show_loading_info)
    monkeypatch.setattr(ui_layout, "init_app", fake_init_app)

    demo = ui_layout.create_ui_blocks()

    # Serve the real UI on a free localhost port.
    port = _free_port()
    host = "127.0.0.1"
    root_url = f"http://{host}:{port}/"

    app = FastAPI(title="native-unified-loading-overlay-test")
    app = gr.mount_gradio_app(app, demo, path="/")

    config = uvicorn.Config(app, host=host, port=port, log_level="error")
    server = uvicorn.Server(config)
    thread = threading.Thread(target=server.run, daemon=True)
    thread.start()
    _wait_http_ready(root_url)

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page(viewport={"width": 1280, "height": 900})
            page.goto(root_url, wait_until="domcontentloaded")

            # Overlay must appear quickly while fake_init_app is sleeping.
            page.wait_for_selector("#loading_overlay_group", state="visible", timeout=2500)

            overlay_text = page.evaluate(
                """() => {
                const el = document.getElementById('loading_overlay_group');
                return el ? (el.textContent || '') : '';
            }"""
            )
            assert canonical_copy in overlay_text
            assert legacy_copy not in page.content()

            # Overlay hides and the main UI shows once init completes.
            page.wait_for_selector("#loading_overlay_group", state="hidden", timeout=15000)
            page.wait_for_selector("#main_interface_root", state="visible", timeout=15000)
            page.wait_for_function(
                """() => {
                const root = document.getElementById('header_task');
                const input = root ? root.querySelector('input') : null;
                return !!input && input.value.trim() === 'PickXtimes';
            }""",
                timeout=5000,
            )
            assert _read_header_task_value(page) == "PickXtimes"

            browser.close()
    finally:
        server.should_exit = True
        thread.join(timeout=10)
        demo.close()

    assert calls["init"] >= 1
456
+
457
+
458
def test_header_task_shows_env_after_init(monkeypatch):
    """Auto-login via the ?user= query param must populate the header task
    dropdown with the env name parsed from task_info_box."""
    ui_layout = importlib.reload(importlib.import_module("ui_layout"))

    fake_obs = np.zeros((24, 24, 3), dtype=np.uint8)
    fake_obs_img = Image.fromarray(fake_obs)

    def fake_init_app(request=None):
        _ = request
        return (
            "uid-auto",
            gr.update(visible=True),  # main_interface
            gr.update(value=fake_obs_img, interactive=False),  # img_display
            "ready",  # log_output
            gr.update(choices=[("pick", 0)], value=None),  # options_radio
            "goal",  # goal_box
            "No need for coordinates",  # coords_box
            gr.update(value=None, visible=False),  # video_display
            "PickXtimes (Episode 1)",  # task_info_box
            "Completed: 0",  # progress_info_box
            gr.update(interactive=True),  # restart_episode_btn
            gr.update(interactive=True),  # next_task_btn
            gr.update(interactive=True),  # exec_btn
            gr.update(visible=False),  # video_phase_group
            gr.update(visible=True),  # action_phase_group
            gr.update(visible=True),  # control_panel_group
            gr.update(value="hint"),  # task_hint_display
            gr.update(visible=False),  # loading_overlay
            gr.update(interactive=True),  # reference_action_btn
        )

    monkeypatch.setattr(ui_layout, "init_app", fake_init_app)

    demo = ui_layout.create_ui_blocks()

    # Serve the UI on a free localhost port.
    port = _free_port()
    host = "127.0.0.1"
    root_url = f"http://{host}:{port}/"

    app = FastAPI(title="header-task-url-auto-login-test")
    app = gr.mount_gradio_app(app, demo, path="/")

    config = uvicorn.Config(app, host=host, port=port, log_level="error")
    server = uvicorn.Server(config)
    thread = threading.Thread(target=server.run, daemon=True)
    thread.start()
    _wait_http_ready(root_url)

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page(viewport={"width": 1280, "height": 900})
            # ?user=user1 triggers the auto-login path on page load.
            page.goto(f"{root_url}?user=user1", wait_until="domcontentloaded")
            page.wait_for_selector("#main_interface_root", state="visible", timeout=15000)
            page.wait_for_function(
                """() => {
                const root = document.getElementById('header_task');
                const input = root ? root.querySelector('input') : null;
                return !!input && input.value.trim() === 'PickXtimes';
            }""",
                timeout=5000,
            )
            assert _read_header_task_value(page) == "PickXtimes"
            browser.close()
    finally:
        server.should_exit = True
        thread.join(timeout=10)
        demo.close()
525
+
526
+
527
@pytest.mark.parametrize(
    "task_info_text,expected_header_value",
    [
        # Case normalization: lowercase env id maps back to canonical form.
        ("pickxtimes (Episode 1)", "PickXtimes"),
        # Fallback: an env id unknown to the dropdown is shown verbatim.
        ("EnvFromSessionOnly (Episode 1)", "EnvFromSessionOnly"),
    ],
)
def test_header_task_env_normalization_and_fallback(monkeypatch, task_info_text, expected_header_value):
    """The header task dropdown normalizes env names parsed from the
    task_info_box text, falling back to the raw session value."""
    ui_layout = importlib.reload(importlib.import_module("ui_layout"))

    fake_obs = np.zeros((24, 24, 3), dtype=np.uint8)
    fake_obs_img = Image.fromarray(fake_obs)

    def fake_init_app(_request=None):
        return (
            "uid-auto",
            gr.update(visible=True),  # main_interface
            gr.update(value=fake_obs_img, interactive=False),  # img_display
            "ready",  # log_output
            gr.update(choices=[("pick", 0)], value=None),  # options_radio
            "goal",  # goal_box
            "No need for coordinates",  # coords_box
            gr.update(value=None, visible=False),  # video_display
            task_info_text,  # task_info_box (parametrized input under test)
            "Completed: 0",  # progress_info_box
            gr.update(interactive=True),  # restart_episode_btn
            gr.update(interactive=True),  # next_task_btn
            gr.update(interactive=True),  # exec_btn
            gr.update(visible=False),  # video_phase_group
            gr.update(visible=True),  # action_phase_group
            gr.update(visible=True),  # control_panel_group
            gr.update(value="hint"),  # task_hint_display
            gr.update(visible=False),  # loading_overlay
            gr.update(interactive=True),  # reference_action_btn
        )

    monkeypatch.setattr(ui_layout, "init_app", fake_init_app)

    demo = ui_layout.create_ui_blocks()

    # Serve the UI on a free localhost port.
    port = _free_port()
    host = "127.0.0.1"
    root_url = f"http://{host}:{port}/"

    app = FastAPI(title="header-task-normalization-fallback-test")
    app = gr.mount_gradio_app(app, demo, path="/")

    config = uvicorn.Config(app, host=host, port=port, log_level="error")
    server = uvicorn.Server(config)
    thread = threading.Thread(target=server.run, daemon=True)
    thread.start()
    _wait_http_ready(root_url)

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page(viewport={"width": 1280, "height": 900})
            page.goto(root_url, wait_until="domcontentloaded")
            page.wait_for_selector("#main_interface_root", state="visible", timeout=15000)
            page.wait_for_function(
                """(expectedValue) => {
                const root = document.getElementById('header_task');
                const input = root ? root.querySelector('input') : null;
                return !!input && input.value.trim() === expectedValue;
            }""",
                arg=expected_header_value,
                timeout=5000,
            )
            assert _read_header_task_value(page) == expected_header_value
            browser.close()
    finally:
        server.should_exit = True
        thread.join(timeout=10)
        demo.close()
601
+
602
+
603
def test_phase_machine_runtime_local_video_path_end_transition():
    """The real gradio_callbacks.on_video_end_transition must swap the
    video group out for the action/control groups when 'ended' fires,
    using a locally-served video file and a fully faked session layer."""
    import gradio_callbacks as cb

    # NOTE(review): `gr.get_video` is not part of the public Gradio API in
    # most releases — confirm this helper exists in the pinned gradio version.
    demo_video_path = gr.get_video("world.mp4")
    fake_obs = np.zeros((24, 24, 3), dtype=np.uint8)

    class FakeSession:
        # Minimal stand-in for the per-user session consumed by the callbacks.
        def __init__(self):
            self.env_id = "VideoUnmask"
            self.language_goal = "place cube on target"
            self.available_options = [("pick", 0)]
            self.raw_solve_options = [{"available": False}]
            self.demonstration_frames = [fake_obs.copy() for _ in range(4)]

        def load_episode(self, env_id, episode_idx):
            self.env_id = env_id
            return fake_obs.copy(), f"loaded {env_id}:{episode_idx}"

        def get_pil_image(self, use_segmented=False):
            _ = use_segmented
            return fake_obs.copy()

    # Save the real module attributes so the finally-block can restore them
    # (this test patches module globals directly, no monkeypatch fixture).
    originals = {
        "get_session": cb.get_session,
        "reset_play_button_clicked": cb.reset_play_button_clicked,
        "reset_execute_count": cb.reset_execute_count,
        "set_task_start_time": cb.set_task_start_time,
        "set_ui_phase": cb.set_ui_phase,
        "save_video": cb.save_video,
    }

    fake_session = FakeSession()

    cb.get_session = lambda uid: fake_session
    cb.reset_play_button_clicked = lambda uid: None
    cb.reset_execute_count = lambda uid, env_id, ep_num: None
    cb.set_task_start_time = lambda uid, env_id, ep_num, start_time: None
    cb.set_ui_phase = lambda uid, phase: None
    cb.save_video = lambda frames, suffix="": demo_video_path

    try:
        # Minimal Blocks UI mirroring the component layout the real
        # callbacks expect (same elem_ids and output ordering).
        with gr.Blocks(title="Native phase machine local video test") as demo:
            uid_state = gr.State(value="uid-local-video")
            with gr.Column(visible=False, elem_id="main_interface") as main_interface:
                with gr.Column(visible=False, elem_id="video_phase_group") as video_phase_group:
                    video_display = gr.Video(value=None, elem_id="demo_video", autoplay=False)

                with gr.Column(visible=True, elem_id="action_phase_group") as action_phase_group:
                    img_display = gr.Image(value=fake_obs.copy(), elem_id="live_obs")

                with gr.Column(visible=True, elem_id="control_panel_group") as control_panel_group:
                    options_radio = gr.Radio(choices=[("pick", 0)], value=None, elem_id="action_radio")

                log_output = gr.Markdown("", elem_id="log_output")
                goal_box = gr.Textbox("")
                coords_box = gr.Textbox("No need for coordinates")
                task_info_box = gr.Textbox("")
                progress_info_box = gr.Textbox("")
                task_hint_display = gr.Textbox("")
                with gr.Column(visible=False) as loading_overlay:
                    gr.Markdown("Loading...")

                restart_episode_btn = gr.Button("restart", interactive=False)
                next_task_btn = gr.Button("next", interactive=False)
                exec_btn = gr.Button("execute", interactive=False)
                reference_action_btn = gr.Button("reference", interactive=False)

            def load_fn():
                # Drive the real _load_status_task with a minimal status dict.
                status = {
                    "current_task": {"env_id": "VideoUnmask", "episode_idx": 1},
                    "completed_count": 0,
                }
                return cb._load_status_task("uid-local-video", status)

            demo.load(
                fn=load_fn,
                outputs=[
                    uid_state,
                    main_interface,
                    img_display,
                    log_output,
                    options_radio,
                    goal_box,
                    coords_box,
                    video_display,
                    task_info_box,
                    progress_info_box,
                    restart_episode_btn,
                    next_task_btn,
                    exec_btn,
                    video_phase_group,
                    action_phase_group,
                    control_panel_group,
                    task_hint_display,
                    loading_overlay,
                    reference_action_btn,
                ],
                queue=False,
            )

            # The real callback under test.
            video_display.end(
                fn=cb.on_video_end_transition,
                inputs=[uid_state],
                outputs=[video_phase_group, action_phase_group, control_panel_group, log_output],
                queue=False,
            )

        # Serve on a free localhost port.
        port = _free_port()
        host = "127.0.0.1"
        root_url = f"http://{host}:{port}/"

        app = FastAPI(title="native-phase-machine-local-video-test")
        app = gr.mount_gradio_app(app, demo, path="/")

        config = uvicorn.Config(app, host=host, port=port, log_level="error")
        server = uvicorn.Server(config)
        thread = threading.Thread(target=server.run, daemon=True)
        thread.start()
        _wait_http_ready(root_url)

        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                page = browser.new_page(viewport={"width": 1280, "height": 900})
                page.goto(root_url, wait_until="domcontentloaded")
                page.wait_for_selector("#main_interface", state="visible", timeout=20000)

                page.wait_for_selector("#demo_video video", timeout=5000)
                # After load: video phase visible, action/control hidden.
                phase_after_login = page.evaluate(
                    """() => {
                    const visible = (id) => {
                        const el = document.getElementById(id);
                        if (!el) return false;
                        const st = getComputedStyle(el);
                        return st.display !== 'none' && st.visibility !== 'hidden' && el.getClientRects().length > 0;
                    };
                    return {
                        video: visible('demo_video'),
                        action: visible('live_obs'),
                        control: visible('action_radio'),
                    };
                }"""
                )
                assert phase_after_login == {
                    "video": True,
                    "action": False,
                    "control": False,
                }

                # Dispatch 'ended' so on_video_end_transition runs server-side.
                did_dispatch_end = page.evaluate(
                    """() => {
                    const videoEl = document.querySelector('#demo_video video');
                    if (!videoEl) return false;
                    videoEl.dispatchEvent(new Event('ended', { bubbles: true }));
                    return true;
                }"""
                )
                assert did_dispatch_end

                # The transition must hide the video and show action/control.
                page.wait_for_function(
                    """() => {
                    const visible = (id) => {
                        const el = document.getElementById(id);
                        if (!el) return false;
                        const st = getComputedStyle(el);
                        return st.display !== 'none' && st.visibility !== 'hidden' && el.getClientRects().length > 0;
                    };
                    return visible('live_obs') && visible('action_radio') && !visible('demo_video');
                }""",
                    timeout=2000,
                )

                browser.close()
        finally:
            server.should_exit = True
            thread.join(timeout=10)
            demo.close()
    finally:
        # Restore the patched gradio_callbacks attributes.
        for name, value in originals.items():
            setattr(cb, name, value)
gradio-web/test/test_user_manager_random_flow.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+
6
+ def _write_metadata(root, env_id: str, episodes: list[int]) -> None:
7
+ root.mkdir(parents=True, exist_ok=True)
8
+ payload = {
9
+ "env_id": env_id,
10
+ "records": [
11
+ {"task": env_id, "episode": ep, "seed": 1000 + ep, "difficulty": "easy"}
12
+ for ep in episodes
13
+ ],
14
+ }
15
+ (root / f"record_dataset_{env_id}_metadata.json").write_text(
16
+ json.dumps(payload), encoding="utf-8"
17
+ )
18
+
19
+
20
+ def test_fixed_users_login_and_random_task_pool(monkeypatch, reload_module, tmp_path):
21
+ metadata_root = tmp_path / "metadata"
22
+ _write_metadata(metadata_root, "EnvA", [0, 1, 2])
23
+ _write_metadata(metadata_root, "EnvB", [10, 11])
24
+ monkeypatch.setenv("ROBOMME_METADATA_ROOT", str(metadata_root))
25
+
26
+ user_manager_mod = reload_module("user_manager")
27
+ monkeypatch.setattr(user_manager_mod.random, "choice", lambda seq: seq[0])
28
+ manager = user_manager_mod.UserManager()
29
+
30
+ success, _msg, status = manager.init_session("uid1")
31
+ assert success
32
+ assert status["current_task"]["env_id"] in {"EnvA", "EnvB"}
33
+ assert status["current_task"]["episode_idx"] in {0, 1, 2, 10, 11}
34
+ assert status["is_done_all"] is False
35
+
36
+
37
+ def test_switch_env_and_next_episode_stays_in_same_env(monkeypatch, reload_module, tmp_path):
38
+ metadata_root = tmp_path / "metadata"
39
+ _write_metadata(metadata_root, "EnvA", [0, 1, 2])
40
+ _write_metadata(metadata_root, "EnvB", [10, 11])
41
+ monkeypatch.setenv("ROBOMME_METADATA_ROOT", str(metadata_root))
42
+
43
+ user_manager_mod = reload_module("user_manager")
44
+ monkeypatch.setattr(user_manager_mod.random, "choice", lambda seq: seq[-1])
45
+ manager = user_manager_mod.UserManager()
46
+
47
+ success, _msg, _status = manager.init_session("uid2")
48
+ assert success
49
+
50
+ switched = manager.switch_env_and_random_episode("uid2", "EnvA")
51
+ assert switched is not None
52
+ assert switched["current_task"]["env_id"] == "EnvA"
53
+ assert switched["current_task"]["episode_idx"] in {0, 1, 2}
54
+
55
+ nxt = manager.next_episode_same_env("uid2")
56
+ assert nxt is not None
57
+ assert nxt["current_task"]["env_id"] == "EnvA"
58
+ assert nxt["current_task"]["episode_idx"] in {0, 1, 2}
59
+
60
+
61
+ def test_complete_current_task_increments_completed_count(monkeypatch, reload_module, tmp_path):
62
+ metadata_root = tmp_path / "metadata"
63
+ _write_metadata(metadata_root, "EnvA", [0, 1])
64
+ monkeypatch.setenv("ROBOMME_METADATA_ROOT", str(metadata_root))
65
+
66
+ user_manager_mod = reload_module("user_manager")
67
+ monkeypatch.setattr(user_manager_mod.random, "choice", lambda seq: seq[0])
68
+ manager = user_manager_mod.UserManager()
69
+
70
+ success, _msg, status = manager.init_session("uid3")
71
+ assert success
72
+ assert status["completed_count"] == 0
73
+
74
+ updated = manager.complete_current_task(
75
+ "uid3",
76
+ env_id=status["current_task"]["env_id"],
77
+ episode_idx=status["current_task"]["episode_idx"],
78
+ status="success",
79
+ )
80
+ assert updated is not None
81
+ assert updated["completed_count"] == 1
82
+ assert updated["is_done_all"] is False
83
+
84
+
85
+ def test_init_session_fails_when_metadata_root_missing(monkeypatch, reload_module, tmp_path):
86
+ missing_root = tmp_path / "missing-metadata-root"
87
+ monkeypatch.setenv("ROBOMME_METADATA_ROOT", str(missing_root))
88
+
89
+ user_manager_mod = reload_module("user_manager")
90
+ manager = user_manager_mod.UserManager()
91
+
92
+ success, msg, status = manager.init_session("uid-missing")
93
+
94
+ assert success is False
95
+ assert "No available environments" in msg
96
+ assert status is None
gradio-web/ui_layout.py ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Native Gradio UI layout.
3
+ Sequential media phases: Demo Video -> Action+Keypoint.
4
+ Two-column layout: Keypoint Selection | Right Panel.
5
+ """
6
+
7
+ import ast
8
+
9
+ import gradio as gr
10
+
11
+ from config import (
12
+ CONTROL_PANEL_SCALE,
13
+ KEYPOINT_SELECTION_SCALE,
14
+ RIGHT_TOP_ACTION_SCALE,
15
+ RIGHT_TOP_LOG_SCALE,
16
+ )
17
+ from gradio_callbacks import (
18
+ execute_step,
19
+ init_app,
20
+ load_next_task_wrapper,
21
+ on_map_click,
22
+ on_option_select,
23
+ on_reference_action,
24
+ on_video_end_transition,
25
+ precheck_execute_inputs,
26
+ refresh_live_obs,
27
+ restart_episode_wrapper,
28
+ show_loading_info,
29
+ switch_env_wrapper,
30
+ switch_to_action_phase,
31
+ switch_to_execute_phase,
32
+ )
33
+ from user_manager import user_manager
34
+
35
+
36
+ PHASE_INIT = "init"
37
+ PHASE_DEMO_VIDEO = "demo_video"
38
+ PHASE_ACTION_KEYPOINT = "action_keypoint"
39
+ PHASE_EXECUTION_PLAYBACK = "execution_playback"
40
+
41
+
42
+ # Deprecated: no runtime JS logic in native Gradio mode.
43
+ SYNC_JS = ""
44
+
45
+
46
+ CSS = f"""
47
+ .native-card {{
48
+ }}
49
+
50
+ #loading_overlay_group {{
51
+ position: fixed !important;
52
+ inset: 0 !important;
53
+ z-index: 9999 !important;
54
+ background: rgba(255, 255, 255, 0.92) !important;
55
+ text-align: center !important;
56
+ }}
57
+
58
+ #loading_overlay_group > div {{
59
+ min-height: 100%;
60
+ display: flex;
61
+ align-items: center;
62
+ justify-content: center;
63
+ }}
64
+
65
+ #loading_overlay_group h3 {{
66
+ margin: 0 !important;
67
+ }}
68
+
69
+ button#reference_action_btn:not(:disabled),
70
+ #reference_action_btn:not(:disabled),
71
+ #reference_action_btn button:not(:disabled) {{
72
+ background: #1f8b4c !important;
73
+ border-color: #1f8b4c !important;
74
+ color: #ffffff !important;
75
+ }}
76
+
77
+ button#reference_action_btn:not(:disabled):hover,
78
+ #reference_action_btn:not(:disabled):hover,
79
+ #reference_action_btn button:not(:disabled):hover {{
80
+ background: #19713d !important;
81
+ border-color: #19713d !important;
82
+ }}
83
+ """
84
+
85
+
86
+ def extract_first_goal(goal_text):
87
+ """Extract first goal from goal text that may be a list representation."""
88
+ if not goal_text:
89
+ return ""
90
+ text = goal_text.strip()
91
+ if text.startswith("[") and text.endswith("]"):
92
+ try:
93
+ goals = ast.literal_eval(text)
94
+ if isinstance(goals, list) and goals:
95
+ return str(goals[0]).strip()
96
+ except Exception:
97
+ pass
98
+ return text.split("\n")[0].strip()
99
+
100
+
101
+ def _phase_from_updates(main_interface_update, video_phase_update):
102
+ if isinstance(main_interface_update, dict) and main_interface_update.get("visible") is False:
103
+ return PHASE_INIT
104
+ if isinstance(video_phase_update, dict) and video_phase_update.get("visible") is True:
105
+ return PHASE_DEMO_VIDEO
106
+ return PHASE_ACTION_KEYPOINT
107
+
108
+
109
+ def _with_phase_from_load(load_result):
110
+ phase = _phase_from_updates(load_result[1], load_result[13])
111
+ return (*load_result, phase)
112
+
113
+
114
+ def create_ui_blocks():
115
+ """构建 Gradio Blocks,并完成页面阶段状态(phase)的联动绑定。"""
116
+
117
+ def render_header_task(task_text):
118
+ clean_task = str(task_text or "").strip()
119
+ if not clean_task:
120
+ return None
121
+ if clean_task.lower().startswith("current task:"):
122
+ clean_task = clean_task.split(":", 1)[1].strip()
123
+ marker = " (Episode "
124
+ if marker in clean_task:
125
+ clean_task = clean_task.split(marker, 1)[0].strip()
126
+ return " ".join(clean_task.splitlines()).strip() or None
127
+
128
+ def render_header_goal(goal_text):
129
+ first_goal = extract_first_goal(goal_text or "")
130
+ return first_goal if first_goal else "—"
131
+
132
+ with gr.Blocks(title="Oracle Planner Interface") as demo:
133
+ demo.theme = gr.themes.Soft()
134
+ demo.css = CSS
135
+
136
+ gr.Markdown("## RoboMME Human Evaluation", elem_id="header_title")
137
+ with gr.Row():
138
+ with gr.Column(scale=1):
139
+ header_task_box = gr.Dropdown(
140
+ choices=list(user_manager.env_choices),
141
+ value=render_header_task(""),
142
+ label="Current Task",
143
+ show_label=True,
144
+ interactive=True,
145
+ elem_id="header_task",
146
+ )
147
+ with gr.Column(scale=2):
148
+ header_goal_box = gr.Textbox(
149
+ value=render_header_goal(""),
150
+ label="Goal",
151
+ show_label=True,
152
+ interactive=False,
153
+ lines=1,
154
+ elem_id="header_goal",
155
+ )
156
+
157
+ with gr.Column(visible=True, elem_id="loading_overlay_group") as loading_overlay:
158
+ gr.Markdown("### Logging in and setting up environment... Please wait.")
159
+
160
+ uid_state = gr.State(value=None)
161
+ ui_phase_state = gr.State(value=PHASE_INIT)
162
+ live_obs_timer = gr.Timer(value=0.1, active=True)
163
+
164
+ task_info_box = gr.Textbox(visible=False, elem_id="task_info_box")
165
+ progress_info_box = gr.Textbox(visible=False)
166
+ goal_box = gr.Textbox(visible=False)
167
+
168
+ with gr.Column(visible=False, elem_id="main_interface_root") as main_interface:
169
+ with gr.Row(elem_id="main_layout_row"):
170
+ with gr.Column(scale=KEYPOINT_SELECTION_SCALE):
171
+ with gr.Column(elem_classes=["native-card"], elem_id="media_card"):
172
+ with gr.Column(visible=False, elem_id="video_phase_group") as video_phase_group:
173
+ video_display = gr.Video(
174
+ label="Demonstration Video",
175
+ interactive=False,
176
+ elem_id="demo_video",
177
+ autoplay=True,
178
+ show_label=True,
179
+ visible=True,
180
+ )
181
+
182
+ with gr.Column(visible=False, elem_id="action_phase_group") as action_phase_group:
183
+ img_display = gr.Image(
184
+ label="Keypoint Selection",
185
+ interactive=False,
186
+ type="pil",
187
+ elem_id="live_obs",
188
+ show_label=True,
189
+ buttons=[],
190
+ sources=[],
191
+ )
192
+
193
+ with gr.Column(scale=CONTROL_PANEL_SCALE):
194
+ with gr.Column(visible=False, elem_id="control_panel_group") as control_panel_group:
195
+ with gr.Row(elem_id="right_top_row", equal_height=False):
196
+ with gr.Column(scale=RIGHT_TOP_ACTION_SCALE, elem_id="right_action_col"):
197
+ with gr.Column(elem_classes=["native-card"], elem_id="action_selection_card"):
198
+ options_radio = gr.Radio(
199
+ choices=[],
200
+ label=" Action Selection",
201
+ type="value",
202
+ show_label=True,
203
+ elem_id="action_radio",
204
+ )
205
+ coords_box = gr.Textbox(
206
+ label="Coords",
207
+ value="",
208
+ interactive=False,
209
+ show_label=False,
210
+ visible=False,
211
+ elem_id="coords_box",
212
+ )
213
+
214
+ with gr.Column(scale=RIGHT_TOP_LOG_SCALE, elem_id="right_log_col"):
215
+ with gr.Column(elem_classes=["native-card"], elem_id="log_card"):
216
+ log_output = gr.Textbox(
217
+ value="",
218
+ lines=4,
219
+ max_lines=None,
220
+ show_label=True,
221
+ interactive=False,
222
+ elem_id="log_output",
223
+ label="System Log",
224
+ )
225
+
226
+ with gr.Row(elem_id="action_buttons_row"):
227
+ with gr.Column(elem_classes=["native-card", "native-button-card"], elem_id="exec_btn_card"):
228
+ exec_btn = gr.Button("EXECUTE", variant="stop", size="lg", elem_id="exec_btn")
229
+
230
+ with gr.Column(
231
+ elem_classes=["native-card", "native-button-card"],
232
+ elem_id="reference_btn_card",
233
+ ):
234
+ reference_action_btn = gr.Button(
235
+ "Ground Truth Action",
236
+ variant="secondary",
237
+ interactive=False,
238
+ elem_id="reference_action_btn",
239
+ )
240
+
241
+ with gr.Column(
242
+ elem_classes=["native-card", "native-button-card"],
243
+ elem_id="restart_episode_btn_card",
244
+ ):
245
+ restart_episode_btn = gr.Button(
246
+ "restart episode",
247
+ variant="secondary",
248
+ interactive=False,
249
+ elem_id="restart_episode_btn",
250
+ )
251
+
252
+ with gr.Column(
253
+ elem_classes=["native-card", "native-button-card"],
254
+ elem_id="next_task_btn_card",
255
+ ):
256
+ next_task_btn = gr.Button(
257
+ "change episode",
258
+ variant="primary",
259
+ interactive=False,
260
+ elem_id="next_task_btn",
261
+ )
262
+
263
+ with gr.Column(visible=True, elem_classes=["native-card"], elem_id="task_hint_card"):
264
+ task_hint_display = gr.Textbox(
265
+ value="",
266
+ lines=8,
267
+ max_lines=16,
268
+ show_label=True,
269
+ label="Task Hint",
270
+ interactive=True,
271
+ elem_id="task_hint_display",
272
+ )
273
+
274
+ def _normalize_env_choice(env_value, choices):
275
+ if env_value is None:
276
+ return None
277
+ env_text = str(env_value).strip()
278
+ if not env_text:
279
+ return None
280
+ lower_map = {}
281
+ for choice in choices:
282
+ choice_text = str(choice).strip()
283
+ if choice_text:
284
+ lower_map.setdefault(choice_text.lower(), choice_text)
285
+ return lower_map.get(env_text.lower(), env_text)
286
+
287
def _build_header_task_update(task_text, fallback_env=None):
    """Build the gr.update for the header env dropdown.

    The env parsed out of task_text is preferred; fallback_env is used
    when parsing yields nothing.  A selected env that is not already in
    the known pool is appended to the dropdown choices so the current
    value is always displayable.
    """
    known_envs = list(user_manager.env_choices)
    selected = _normalize_env_choice(render_header_task(task_text), known_envs)
    if selected is None:
        selected = _normalize_env_choice(fallback_env, known_envs)

    dropdown_choices = list(known_envs)
    if selected and selected not in dropdown_choices:
        dropdown_choices.append(selected)
    return gr.update(choices=dropdown_choices, value=selected)
298
+
299
def sync_header_from_task(task_text, goal_text):
    """Mirror the hidden task/goal textboxes into the header widgets."""
    return _build_header_task_update(task_text), render_header_goal(goal_text)
301
+
302
def sync_header_from_goal(goal_text, task_text, current_header_task):
    """Like sync_header_from_task, but keeps the current header dropdown
    value as a fallback when task_text does not parse to a known env."""
    return _build_header_task_update(task_text, fallback_env=current_header_task), render_header_goal(goal_text)
304
+
305
def init_app_with_phase(request: gr.Request):
    """Initial page load: run init_app, then append the derived UI phase."""
    return _with_phase_from_load(init_app(request))

def load_next_task_with_phase(uid):
    """'change episode' click: load the next task, then append the UI phase."""
    return _with_phase_from_load(load_next_task_wrapper(uid))

def restart_episode_with_phase(uid):
    """'restart episode' click: restart the episode, then append the UI phase."""
    return _with_phase_from_load(restart_episode_wrapper(uid))

def switch_env_with_phase(uid, selected_env):
    """Header dropdown change: switch env, then append the UI phase."""
    return _with_phase_from_load(switch_env_wrapper(uid, selected_env))
316
+
317
+ task_info_box.change(
318
+ fn=sync_header_from_task,
319
+ inputs=[task_info_box, goal_box],
320
+ outputs=[header_task_box, header_goal_box],
321
+ )
322
+ goal_box.change(
323
+ fn=sync_header_from_goal,
324
+ inputs=[goal_box, task_info_box, header_task_box],
325
+ outputs=[header_task_box, header_goal_box],
326
+ )
327
+
328
+ header_task_box.input(fn=show_loading_info, outputs=[loading_overlay]).then(
329
+ fn=switch_env_with_phase,
330
+ inputs=[uid_state, header_task_box],
331
+ outputs=[
332
+ uid_state,
333
+ main_interface,
334
+ img_display,
335
+ log_output,
336
+ options_radio,
337
+ goal_box,
338
+ coords_box,
339
+ video_display,
340
+ task_info_box,
341
+ progress_info_box,
342
+ restart_episode_btn,
343
+ next_task_btn,
344
+ exec_btn,
345
+ video_phase_group,
346
+ action_phase_group,
347
+ control_panel_group,
348
+ task_hint_display,
349
+ loading_overlay,
350
+ reference_action_btn,
351
+ ui_phase_state,
352
+ ],
353
+ ).then(
354
+ fn=sync_header_from_task,
355
+ inputs=[task_info_box, goal_box],
356
+ outputs=[header_task_box, header_goal_box],
357
+ )
358
+
359
+ next_task_btn.click(fn=show_loading_info, outputs=[loading_overlay]).then(
360
+ fn=load_next_task_with_phase,
361
+ inputs=[uid_state],
362
+ outputs=[
363
+ uid_state,
364
+ main_interface,
365
+ img_display,
366
+ log_output,
367
+ options_radio,
368
+ goal_box,
369
+ coords_box,
370
+ video_display,
371
+ task_info_box,
372
+ progress_info_box,
373
+ restart_episode_btn,
374
+ next_task_btn,
375
+ exec_btn,
376
+ video_phase_group,
377
+ action_phase_group,
378
+ control_panel_group,
379
+ task_hint_display,
380
+ loading_overlay,
381
+ reference_action_btn,
382
+ ui_phase_state,
383
+ ],
384
+ ).then(
385
+ fn=sync_header_from_task,
386
+ inputs=[task_info_box, goal_box],
387
+ outputs=[header_task_box, header_goal_box],
388
+ )
389
+
390
+ restart_episode_btn.click(fn=show_loading_info, outputs=[loading_overlay]).then(
391
+ fn=restart_episode_with_phase,
392
+ inputs=[uid_state],
393
+ outputs=[
394
+ uid_state,
395
+ main_interface,
396
+ img_display,
397
+ log_output,
398
+ options_radio,
399
+ goal_box,
400
+ coords_box,
401
+ video_display,
402
+ task_info_box,
403
+ progress_info_box,
404
+ restart_episode_btn,
405
+ next_task_btn,
406
+ exec_btn,
407
+ video_phase_group,
408
+ action_phase_group,
409
+ control_panel_group,
410
+ task_hint_display,
411
+ loading_overlay,
412
+ reference_action_btn,
413
+ ui_phase_state,
414
+ ],
415
+ ).then(
416
+ fn=sync_header_from_task,
417
+ inputs=[task_info_box, goal_box],
418
+ outputs=[header_task_box, header_goal_box],
419
+ )
420
+
421
+ video_display.end(
422
+ fn=on_video_end_transition,
423
+ inputs=[uid_state],
424
+ outputs=[video_phase_group, action_phase_group, control_panel_group, log_output],
425
+ queue=False,
426
+ show_progress="hidden",
427
+ ).then(
428
+ fn=lambda: PHASE_ACTION_KEYPOINT,
429
+ outputs=[ui_phase_state],
430
+ queue=False,
431
+ show_progress="hidden",
432
+ )
433
+ video_display.stop(
434
+ fn=on_video_end_transition,
435
+ inputs=[uid_state],
436
+ outputs=[video_phase_group, action_phase_group, control_panel_group, log_output],
437
+ queue=False,
438
+ show_progress="hidden",
439
+ ).then(
440
+ fn=lambda: PHASE_ACTION_KEYPOINT,
441
+ outputs=[ui_phase_state],
442
+ queue=False,
443
+ show_progress="hidden",
444
+ )
445
+
446
+ img_display.select(
447
+ fn=on_map_click,
448
+ inputs=[uid_state, options_radio],
449
+ outputs=[img_display, coords_box],
450
+ )
451
+
452
+ options_radio.change(
453
+ fn=on_option_select,
454
+ inputs=[uid_state, options_radio, coords_box],
455
+ outputs=[coords_box, img_display],
456
+ )
457
+
458
+ reference_action_btn.click(
459
+ fn=on_reference_action,
460
+ inputs=[uid_state],
461
+ outputs=[img_display, options_radio, coords_box, log_output],
462
+ )
463
+
464
+ exec_btn.click(
465
+ fn=precheck_execute_inputs,
466
+ inputs=[uid_state, options_radio, coords_box],
467
+ outputs=[],
468
+ show_progress="hidden",
469
+ ).then(
470
+ fn=switch_to_execute_phase,
471
+ inputs=[uid_state],
472
+ outputs=[
473
+ options_radio,
474
+ exec_btn,
475
+ restart_episode_btn,
476
+ next_task_btn,
477
+ img_display,
478
+ reference_action_btn,
479
+ ],
480
+ show_progress="hidden",
481
+ ).then(
482
+ fn=lambda: PHASE_EXECUTION_PLAYBACK,
483
+ outputs=[ui_phase_state],
484
+ show_progress="hidden",
485
+ ).then(
486
+ fn=execute_step,
487
+ inputs=[uid_state, options_radio, coords_box],
488
+ outputs=[img_display, log_output, task_info_box, progress_info_box, restart_episode_btn, next_task_btn, exec_btn],
489
+ show_progress="hidden",
490
+ ).then(
491
+ fn=switch_to_action_phase,
492
+ inputs=[uid_state],
493
+ outputs=[
494
+ options_radio,
495
+ exec_btn,
496
+ restart_episode_btn,
497
+ next_task_btn,
498
+ img_display,
499
+ reference_action_btn,
500
+ ],
501
+ show_progress="hidden",
502
+ ).then(
503
+ fn=lambda: PHASE_ACTION_KEYPOINT,
504
+ outputs=[ui_phase_state],
505
+ show_progress="hidden",
506
+ )
507
+
508
+ live_obs_timer.tick(
509
+ fn=refresh_live_obs,
510
+ inputs=[uid_state, ui_phase_state],
511
+ outputs=[img_display],
512
+ queue=False,
513
+ show_progress="hidden",
514
+ )
515
+
516
+ demo.load(
517
+ fn=init_app_with_phase,
518
+ inputs=[],
519
+ outputs=[
520
+ uid_state,
521
+ main_interface,
522
+ img_display,
523
+ log_output,
524
+ options_radio,
525
+ goal_box,
526
+ coords_box,
527
+ video_display,
528
+ task_info_box,
529
+ progress_info_box,
530
+ restart_episode_btn,
531
+ next_task_btn,
532
+ exec_btn,
533
+ video_phase_group,
534
+ action_phase_group,
535
+ control_panel_group,
536
+ task_hint_display,
537
+ loading_overlay,
538
+ reference_action_btn,
539
+ ui_phase_state,
540
+ ],
541
+ ).then(
542
+ fn=sync_header_from_task,
543
+ inputs=[task_info_box, goal_box],
544
+ outputs=[header_task_box, header_goal_box],
545
+ )
546
+
547
+ return demo
gradio-web/user_manager.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import random
4
+ import threading
5
+ from pathlib import Path
6
+
7
+ from state_manager import clear_task_start_time, get_task_start_time
8
+
9
+
10
+ METADATA_FILE_GLOB = "record_dataset_*_metadata.json"
11
+
12
+
13
class UserManager:
    """In-memory manager of per-session env/episode task assignment.

    The (env_id -> episode list) pool is loaded once from the metadata JSON
    files at construction time.  Per-session progress lives only in
    ``self.session_progress`` keyed by the Gradio session uid; nothing is
    persisted to disk.
    """

    def __init__(self):
        # Directory containing this file; anchor for the default metadata root.
        self.base_dir = Path(__file__).resolve().parent
        # Guards session_progress against concurrent Gradio callbacks.
        # NOTE(review): threading.Lock is NOT reentrant -- the private helpers
        # below are called with the lock already held and must not re-acquire
        # it, and lock-holding code must not call the public methods.
        self.lock = threading.Lock()

        # env_id -> sorted list of available episode indices.
        self.env_to_episodes = self._load_env_episode_pool()
        self.env_choices = sorted(self.env_to_episodes.keys())

        # Session-local progress only (no disk persistence)
        self.session_progress = {}

    def _resolve_metadata_root(self) -> Path:
        """Return the metadata directory, honoring $ROBOMME_METADATA_ROOT."""
        env_root = os.environ.get("ROBOMME_METADATA_ROOT")
        if env_root:
            return Path(env_root)
        # Default layout: <repo>/src/robomme/env_metadata/train next to gradio-web/.
        return self.base_dir.parent / "src" / "robomme" / "env_metadata" / "train"

    def _load_env_episode_pool(self):
        """Scan metadata files and build {env_id: sorted episode indices}.

        Unreadable files and malformed records are skipped with a warning so
        a single bad file cannot empty the whole pool.
        """
        env_to_episode_set = {}
        metadata_root = self._resolve_metadata_root()
        if not metadata_root.exists():
            print(f"Warning: metadata root not found: {metadata_root}")
            return {}

        for metadata_path in sorted(metadata_root.glob(METADATA_FILE_GLOB)):
            try:
                payload = json.loads(metadata_path.read_text(encoding="utf-8"))
            except Exception as exc:
                print(f"Warning: failed to read metadata file {metadata_path}: {exc}")
                continue

            # Records may omit "task"; fall back to the file-level env_id.
            fallback_env = str(payload.get("env_id") or "").strip()
            for record in payload.get("records", []):
                env_id = str(record.get("task") or fallback_env or "").strip()
                episode = record.get("episode")
                if not env_id or episode is None:
                    continue
                try:
                    episode_idx = int(episode)
                except (TypeError, ValueError):
                    continue
                env_to_episode_set.setdefault(env_id, set()).add(episode_idx)

        env_to_episodes = {
            env_id: sorted(episodes)
            for env_id, episodes in env_to_episode_set.items()
            if episodes
        }
        print(f"Loaded random env pool: {len(env_to_episodes)} envs from metadata root {metadata_root}")
        return env_to_episodes

    def _ensure_session_entry(self, uid):
        """Create an empty progress record for uid if absent (caller holds lock)."""
        if uid not in self.session_progress:
            self.session_progress[uid] = {
                "completed_count": 0,
                "current_env_id": None,
                "current_episode_idx": None,
            }

    def _set_current_random_task(self, uid, preferred_env=None):
        """Assign a random (env, episode) pair to uid (caller holds lock).

        preferred_env wins when it exists in the pool, otherwise an env is
        drawn uniformly.  Returns False when the pool (or the chosen env's
        episode list) is empty.
        """
        if not self.env_choices:
            return False
        self._ensure_session_entry(uid)

        env_id = preferred_env if preferred_env in self.env_to_episodes else random.choice(self.env_choices)
        episodes = self.env_to_episodes.get(env_id, [])
        if not episodes:
            return False

        episode_idx = int(random.choice(episodes))
        self.session_progress[uid]["current_env_id"] = env_id
        self.session_progress[uid]["current_episode_idx"] = episode_idx
        return True

    def init_session(self, uid):
        """Ensure uid has a session with an assigned task.

        Returns (ok, message, status_dict_or_None).
        """
        if not uid:
            return False, "Session uid cannot be empty", None
        if not self.env_choices:
            return False, "No available environments found in metadata.", None

        with self.lock:
            self._ensure_session_entry(uid)
            progress = self.session_progress[uid]
            if progress.get("current_env_id") is None or progress.get("current_episode_idx") is None:
                if not self._set_current_random_task(uid):
                    return False, "Failed to assign random task from metadata.", None

        # get_session_status re-acquires the (non-reentrant) lock, so it is
        # deliberately called only after the `with` block above has released it.
        return True, "Session initialized", self.get_session_status(uid)

    def get_session_status(self, uid):
        """Return a status snapshot dict for uid (None for an empty uid).

        Lazily assigns a random task when the session has none yet.  The keys
        total_tasks / current_index / tasks exist only for backward
        compatibility with the earlier fixed-task-list flow.
        """
        if not uid:
            return None

        with self.lock:
            self._ensure_session_entry(uid)
            progress = self.session_progress[uid]
            if (
                (progress.get("current_env_id") is None or progress.get("current_episode_idx") is None)
                and self.env_choices
            ):
                self._set_current_random_task(uid)
                progress = self.session_progress[uid]

            current_task = None
            if progress.get("current_env_id") is not None and progress.get("current_episode_idx") is not None:
                current_task = {
                    "env_id": progress["current_env_id"],
                    "episode_idx": int(progress["current_episode_idx"]),
                }

            completed_count = int(progress.get("completed_count", 0))

            return {
                "uid": uid,
                "total_tasks": len(self.env_choices),  # compatibility only
                "current_index": completed_count,  # compatibility only
                "completed_count": completed_count,
                "current_task": current_task,
                "is_done_all": False,
                "tasks": [],  # compatibility only
                "env_choices": list(self.env_choices),
            }

    def complete_current_task(self, uid, env_id=None, episode_idx=None, **_kwargs):
        """Bump the completion counter and clear the task's start-time record.

        Extra keyword arguments are accepted and ignored for caller
        compatibility.  Returns the refreshed status dict (None for empty uid).
        """
        if not uid:
            return None

        with self.lock:
            self._ensure_session_entry(uid)
            self.session_progress[uid]["completed_count"] = int(self.session_progress[uid]["completed_count"]) + 1

        if env_id is not None and episode_idx is not None:
            # Read the start time before clearing it; the value is discarded
            # here -- presumably the timing bookkeeping happens in
            # state_manager, TODO confirm against its implementation.
            _ = get_task_start_time(uid, env_id, episode_idx)
            clear_task_start_time(uid, env_id, episode_idx)

        return self.get_session_status(uid)

    def switch_env_and_random_episode(self, uid, env_id):
        """Switch uid to env_id with a freshly sampled episode; None on failure."""
        if not uid or env_id not in self.env_to_episodes:
            return None

        with self.lock:
            self._ensure_session_entry(uid)
            if not self._set_current_random_task(uid, preferred_env=env_id):
                return None

        return self.get_session_status(uid)

    def next_episode_same_env(self, uid):
        """Re-sample an episode, preferring the session's current env; None on failure."""
        if not uid:
            return None

        with self.lock:
            self._ensure_session_entry(uid)
            current_env = self.session_progress[uid].get("current_env_id")
            if current_env not in self.env_to_episodes:
                # Current env missing or stale: fall back to a fully random task.
                if not self._set_current_random_task(uid):
                    return None
            else:
                if not self._set_current_random_task(uid, preferred_env=current_env):
                    return None

        return self.get_session_status(uid)
176
+
177
+
178
+ user_manager = UserManager()
gradio-web/verify_video_names.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ 验证和修复视频文件名,确保与 env_id 正确对应
4
+
5
+ 注意:该脚本是离线校验工具,不参与当前 Gradio 运行时任务分配逻辑。
6
+ """
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+
11
def get_all_env_ids():
    """Return all unique env_ids from user_tasks_og.json, sorted.

    The JSON maps user ids to lists of task dicts, each carrying an
    'env_id' key.  The file is decoded explicitly as UTF-8 instead of the
    locale default, so non-ASCII content is read consistently everywhere.
    """
    tasks_file = 'user_tasks_og.json'
    with open(tasks_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Deduplicate across every user's task list.
    env_ids = {task['env_id'] for tasks in data.values() for task in tasks}
    return sorted(env_ids)
22
+
23
def verify_video_files(videos_dir='videos'):
    """Check that every env_id has a correctly named <env_id>.mp4 (lowercase).

    Prints a per-env report plus a summary and returns three lists:
    (correct, missing, case-mismatched).  Fix: the missing-directory path now
    returns the same 3-tuple shape ([], [], []) instead of None, so callers
    that unpack the result no longer crash.
    """
    env_ids = get_all_env_ids()
    videos_path = Path(videos_dir)

    if not videos_path.exists():
        print(f"错误: 目录 {videos_dir} 不存在")
        return [], [], []

    # Map lowercased names to paths for case-insensitive lookup.
    existing_files = {f.name.lower(): f for f in videos_path.glob('*.mp4')}

    print("=" * 80)
    print("视频文件名验证结果")
    print("=" * 80)
    print(f"{'Env ID':<25} {'期望文件名':<35} {'状态':<10}")
    print("-" * 80)

    correct_files = []
    missing_files = []
    incorrect_files = []

    for env_id in env_ids:
        expected_filename = env_id.lower() + '.mp4'
        expected_lower = expected_filename.lower()

        if expected_lower in existing_files:
            actual_file = existing_files[expected_lower]
            if actual_file.name == expected_filename:
                status = "✓ 正确"
                correct_files.append((env_id, expected_filename))
            else:
                # Same name ignoring case, but on-disk casing differs.
                status = f"⚠ 大小写不匹配: {actual_file.name}"
                incorrect_files.append((env_id, expected_filename, actual_file.name))
        else:
            status = "✗ 缺失"
            missing_files.append((env_id, expected_filename))

        print(f"{env_id:<25} {expected_filename:<35} {status:<10}")

    print("=" * 80)
    print(f"\n总结:")
    print(f"  ✓ 正确匹配: {len(correct_files)} 个")
    print(f"  ✗ 缺失文件: {len(missing_files)} 个")
    print(f"  ⚠ 需要修复: {len(incorrect_files)} 个")

    if incorrect_files:
        print(f"\n需要重命名的文件:")
        for env_id, expected, actual in incorrect_files:
            print(f"  {actual} -> {expected}")

    if missing_files:
        print(f"\n缺失的视频文件 (这些 env_id 没有对应的视频):")
        for env_id, expected in missing_files:
            print(f"  {env_id} -> {expected}")

    return correct_files, missing_files, incorrect_files
80
+
81
def fix_incorrect_names(videos_dir='videos', dry_run=True):
    """Rename case-mismatched video files to their expected lowercase names.

    Returns the list of (old_name, new_name) pairs that were renamed (or
    would be renamed in dry-run mode).  Fixes: failed renames are no longer
    counted in the result, and the missing-directory path returns [] instead
    of None so the return type is consistent.
    """
    env_ids = get_all_env_ids()
    videos_path = Path(videos_dir)

    if not videos_path.exists():
        print(f"错误: 目录 {videos_dir} 不存在")
        return []

    # Case-insensitive index of the existing .mp4 files.
    existing_files = {f.name.lower(): f for f in videos_path.glob('*.mp4')}

    fixed = []
    for env_id in env_ids:
        expected_filename = env_id.lower() + '.mp4'
        actual_file = existing_files.get(expected_filename.lower())
        # Only files whose on-disk casing differs need renaming.
        if actual_file is None or actual_file.name == expected_filename:
            continue

        new_path = actual_file.parent / expected_filename
        if dry_run:
            print(f"[DRY RUN] 将重命名: {actual_file.name} -> {expected_filename}")
            fixed.append((actual_file.name, expected_filename))
        else:
            try:
                actual_file.rename(new_path)
            except OSError as e:
                # A failed rename is reported but not counted as fixed.
                print(f"✗ 重命名失败 {actual_file.name}: {e}")
            else:
                print(f"✓ 已重命名: {actual_file.name} -> {expected_filename}")
                fixed.append((actual_file.name, expected_filename))

    if not fixed:
        print("没有需要修复的文件名")
    elif dry_run:
        print(f"\n[DRY RUN 模式] 共 {len(fixed)} 个文件需要重命名")
        print("运行时不加 --dry-run 参数以执行实际重命名")

    return fixed
120
+
121
if __name__ == '__main__':
    import sys

    if '--fix' in sys.argv:
        # --fix alone performs the real rename; add --dry-run to preview.
        # (The previous "or '--fix' not in sys.argv" clause was always False
        # inside this branch -- dead code removed.)
        fix_incorrect_names(dry_run='--dry-run' in sys.argv)
    else:
        verify_video_files()
pyproject.toml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "robomme"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "readme.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "mani-skill",
9
+ "opencv-python>=4.11.0.86",
10
+ "setuptools==80.9.0",
11
+ "torch==2.9.1",
12
+ "torchvision==0.24.1",
13
+ ]
14
+
15
+ [project.optional-dependencies]
16
+ dev = ["opencv-python", "pytest"]
17
+
18
+ [tool.uv.sources]
19
+ mani-skill = { git = "https://github.com/YinpeiDai/ManiSkill.git", rev = "07be6fbc66350ddca200abfb0a11b692f078f7fd" }
20
+
21
+ [build-system]
22
+ requires = ["hatchling"]
23
+ build-backend = "hatchling.build"
24
+
25
+ [tool.hatch.build.targets.wheel]
26
+ packages = ["src/robomme"]
27
+
28
+ [tool.pytest.ini_options]
29
+ markers = [
30
+ "slow: slow-running tests",
31
+ "gpu: tests requiring GPU/display/headless rendering stack",
32
+ "dataset: tests that generate/use temporary datasets",
33
+ "lightweight: tests that do not require generated dataset",
34
+ ]
readme.md ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RoboMME: A Robotic Benchmark for Memory-Augmented Manipulation
2
+
3
+ ![Robomme bench](assets/robomme_bench.jpg)
4
+
5
+ ## 📢 Announcements
6
+
7
+ [03/2026] We are thrilled to release RoboMME, the first large-scale robotic benchmark dedicated to memory-augmented manipulation! Spanning 4 cognitively motivated task suites with 16 carefully designed tasks, RoboMME pushes robots to remember, reason, and act.
8
+
9
+ ## 📦 Installation
10
+
11
+ After cloning the repo, install [uv](https://docs.astral.sh/uv/getting-started/installation/), then run:
12
+
13
+ ```bash
14
+ uv sync
15
+ uv pip install -e .
16
+ ```
17
+
18
+ ## 🚀 Quick Start
19
+
20
+ Start an environment with a specified setup:
21
+
22
+ ```bash
23
+ uv run scripts/run_example.py
24
+ ```
25
+
26
+ This generates a rollout video in the `sample_run_videos` directory.
27
+
28
+ We provide four action types: `joint_action`, `ee_pose`, `waypoint`, and `multi_choice`, e.g., predict continuous actions with `joint_action` or `ee_pose`, discrete waypoint actions with `waypoint`, or use `multi_choice` for VideoQA-style problems.
29
+
30
+ ## 📁 Benchmark
31
+
32
+ ### 🤖 Tasks
33
+
34
+ We have four task suites, each with 4 tasks:
35
+
36
+ | Suite | Focus | Task ID |
37
+ | ---------- | ----------------- | --------------------------------------------------------------------- |
38
+ | Counting | Temporal memory | BinFill, PickXtimes, SwingXtimes, StopCube |
39
+ | Permanence | Spatial memory | VideoUnmask, VideoUnmaskSwap, ButtonUnmask, ButtonUnmaskSwap |
40
+ | Reference | Object memory | PickHighlight, VideoRepick, VideoPlaceButton, VideoPlaceOrder |
41
+ | Imitation | Procedural memory | MoveCube, InsertPeg, PatternLock, RouteStick |
42
+
43
+ All tasks are defined in `src/robomme/robomme_env`. A detailed description can be found in our paper appendix.
44
+
45
+ ### 📥 Training Data
46
+
47
+ Training data can be downloaded [here](https://huggingface.co/datasets/Yinpei/robomme_data). There are 1,600 demonstrations in total (100 per task). The HDF5 format is described in [doc/h5_data_format.md](doc/h5_data_format.md).
48
+
49
+ After downloading, replay the dataset for a sanity check:
50
+
51
+ ```bash
52
+ uv run scripts/dataset_replay.py --h5-data-dir <your_downloaded_data_dir>
53
+ ```
54
+
55
+ ### 📊 Evaluation
56
+
57
+ To evaluate on the test set, set the `dataset` argument of `BenchmarkEnvBuilder`:
58
+
59
+ ```python
60
+ task_id = "PickXtimes"
61
+ episode_idx = 0
62
+ env_builder = BenchmarkEnvBuilder(
63
+ env_id=task_id,
64
+ dataset="test",
65
+ ...
66
+ )
67
+
68
+ env = env_builder.make_env_for_episode(episode_idx)
69
+ obs, info = env.reset() # initial step
70
+ ...
71
+ obs, _, terminated, truncated, info = env.step(action) # each step
72
+ ```
73
+ The train split has 100 episodes. The val/test splits each have 50 episodes. All seeds are fixed for benchmarking.
74
+
75
+ The environment input/output format is described in [doc/env_format.md](doc/env_format.md).
76
+
77
+ > Currently, environment spawning is set up only for imitation learning. We are working on extending it to support more general parallel environments for reinforcement learning in the future.
78
+
79
+ ### 🔧 Data Generation
80
+
81
You can also re-generate your own HDF5 data via parallel processing with the data-generation scripts under `scripts/dev/`:

```bash
uv run scripts/dev/<data_generation_script>.py
```
86
+
87
+
88
+ ## 🧠 Model Training
89
+
90
+ ### 🌟 MME-VLA-Suite
91
+
92
+ The [MME Policy Learning](https://github.com/RoboMME/robomme_policy_learning) repo provides MME-VLA model training and evaluation used in our paper. It contains a family of memory-augmented VLA models built on [pi05](https://github.com/Physical-Intelligence/openpi) backbone and our implementation of [MemER](https://jen-pan.github.io/memer/).
93
+
94
+ ### 📚 Prior Methods
95
+
96
+ **MemER**: The [MME Policy Learning](https://github.com/RoboMME/robomme_policy_learning) repo also provides our implementation of the [MemER](https://jen-pan.github.io/memer/), using the same GroundSG policy model as in MME-VLA.
97
+
98
+ **SAM2Act+**: The [RoboMME_SAM2Act](https://github.com/RoboMME/SAM2Act) repo provides our implementation adapted from the [SAM2Act](https://github.com/sam2act/sam2act) repo.
99
+
100
+ **MemoryVLA**: The [RoboMME_MemoryVLA](https://github.com/RoboMME/MemoryVLA) repo provides our implementation adapted from the [MemoryVLA](https://github.com/shihao1895/MemoryVLA) repo.
101
+
102
+ **Diffusion Policy**: The [RoboMME_DP](https://github.com/RoboMME/DP) repo provides our implementation adapted from the [diffusion_policy](https://github.com/real-stanford/diffusion_policy) repo.
103
+
104
+
105
+
106
+ ## 🏆 Submit Your Models
107
+ Want to add your model? Download the [dataset](https://huggingface.co/datasets/Yinpei/robomme_data) from Hugging Face, run evaluation using our [eval scripts](scripts/evaluation.py), then submit a PR with your results by adding `<your_model>.md` to the `doc/submission/` [directory](https://github.com/RoboMME/robomme_benchmark/tree/main/doc/submission). We will review it and update our leaderboard.
108
+
109
+
110
+ ## 🔧 Troubleshooting
111
+
112
+ **Q1: RuntimeError: Create window failed: Renderer does not support display.**
113
+
114
+ A1: Use a physical display or set up a virtual display for GUI rendering (e.g. install a VNC server and set the `DISPLAY` variable correctly).
115
+
116
+ **Q2: Failure related to Vulkan installation.**
117
+
118
+ A2: We recommend reinstalling the NVIDIA driver and Vulkan packages. We use NVIDIA driver 570.211.01 and Vulkan 1.3.275. If it still does not work, switch to CPU rendering:
119
+
120
+ ```python
121
+ os.environ['SAPIEN_RENDER_DEVICE'] = 'cpu'
122
+ os.environ['MUJOCO_GL'] = 'osmesa'
123
+ ```
124
+
125
+
126
+ ## 🙏 Acknowledgements
127
+
128
+ This work was supported in part by NSF SES-2128623, NSF CAREER #2337870, NSF NRI #2220876, NSF NAIRR250085. We would also like to thank the wonderful [OpenPi](https://github.com/Physical-Intelligence/openpi/tree/main) codebase from Physical-Intelligence.
129
+
130
+
131
+ ## 📄 Citation
132
+
133
+ ```
134
+ ...
135
+ ```
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ numpy
3
+ Pillow
4
+ opencv-python>=4.11.0.86
5
+ gymnasium
6
+ h5py
7
+ imageio
8
+ setuptools==80.9.0
9
+ torch==2.9.1
10
+ torchvision==0.24.1
11
+ mani-skill @ git+https://github.com/YinpeiDai/ManiSkill.git@07be6fbc66350ddca200abfb0a11b692f078f7fd
scripts/dataset_replay.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Replay episodes from HDF5 datasets and save rollout videos.
3
+ Loads recorded actions from record_dataset_<Task>.h5, steps the environment
4
+ """
5
+
6
+ import os
7
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
8
+
9
+ import json
10
+ from pathlib import Path
11
+ from typing import Any, Dict, Literal, Union
12
+
13
+ import cv2
14
+ import h5py
15
+ import imageio
16
+ import numpy as np
17
+ import torch
18
+
19
+ from robomme.env_record_wrapper import BenchmarkEnvBuilder
20
+
21
+ GUI_RENDER = False
22
+ REPLAY_VIDEO_DIR = "replay_videos"
23
+ VIDEO_FPS = 30
24
+ VIDEO_BORDER_COLOR = (255, 0, 0)
25
+ VIDEO_BORDER_THICKNESS = 10
26
+
27
+ TaskID = Literal[
28
+ # "BinFill",
29
+ # "PickXtimes",
30
+ # "SwingXtimes",
31
+ # "StopCube",
32
+ # "VideoUnmask",
33
+ "VideoUnmaskSwap",
34
+ # "ButtonUnmask",
35
+ # "ButtonUnmaskSwap",
36
+ # "PickHighlight",
37
+ # "VideoRepick",
38
+ # "VideoPlaceButton",
39
+ # "VideoPlaceOrder",
40
+ # "MoveCube",
41
+ # "InsertPeg",
42
+ # "PatternLock",
43
+ # "RouteStick",
44
+ ]
45
+
46
+
47
+ ActionSpaceType = Literal["joint_angle", "ee_pose", "waypoint", "multi_choice"]
48
+
49
+ def _to_numpy(t) -> np.ndarray:
50
+ return t.cpu().numpy() if isinstance(t, torch.Tensor) else np.asarray(t)
51
+
52
+
53
+ def _frame_from_obs(
54
+ front: np.ndarray | torch.Tensor,
55
+ wrist: np.ndarray | torch.Tensor,
56
+ is_video_demo: bool = False,
57
+ ) -> np.ndarray:
58
+ frame = np.hstack([_to_numpy(front), _to_numpy(wrist)]).astype(np.uint8)
59
+ if is_video_demo:
60
+ h, w = frame.shape[:2]
61
+ cv2.rectangle(frame, (0, 0), (w, h),
62
+ VIDEO_BORDER_COLOR, VIDEO_BORDER_THICKNESS)
63
+ return frame
64
+
65
+
66
+ def _extract_frames(obs: dict, is_video_demo_fn=None) -> list[np.ndarray]:
67
+ n = len(obs["front_rgb_list"])
68
+ return [
69
+ _frame_from_obs(
70
+ obs["front_rgb_list"][i],
71
+ obs["wrist_rgb_list"][i],
72
+ is_video_demo=(is_video_demo_fn(i) if is_video_demo_fn else False),
73
+ )
74
+ for i in range(n)
75
+ ]
76
+
77
+
78
def _is_video_demo(ts: h5py.Group) -> bool:
    """True when this timestep's info group flags it as a video-demo step."""
    info = ts.get("info")
    if info is None or "is_video_demo" not in info:
        return False
    flag = np.asarray(info["is_video_demo"][()]).reshape(-1)[0]
    return bool(flag)
83
+
84
+
85
def _is_subgoal_boundary(ts: h5py.Group) -> bool:
    """True when this timestep's info group marks a subgoal boundary."""
    info = ts.get("info")
    if info is None or "is_subgoal_boundary" not in info:
        return False
    flag = np.asarray(info["is_subgoal_boundary"][()]).reshape(-1)[0]
    return bool(flag)
90
+
91
+
92
+ def _decode_h5_str(raw) -> str:
93
+ """Uniformly decode bytes / numpy bytes / str from HDF5 to str."""
94
+ if isinstance(raw, np.ndarray):
95
+ raw = raw.flatten()[0]
96
+ if isinstance(raw, (bytes, np.bytes_)):
97
+ raw = raw.decode("utf-8")
98
+ return raw
99
+
100
+
101
def _build_action_sequence(
    episode_data: h5py.Group, action_space_type: str
) -> list[Union[np.ndarray, Dict[str, Any]]]:
    """
    Scan the entire episode and return the deduplicated action sequence:
    - joint_angle / ee_pose: actions of all non-video-demo steps (sequential, not deduplicated)
    - waypoint: remove adjacent duplicate waypoint_action (like EpisodeDatasetResolver)
    - multi_choice: choice_action (JSON dict) only for steps where is_subgoal_boundary=True

    Raises:
        ValueError: if action_space_type is not one of the four known types.
    """
    # Numeric sort so "timestep_10" comes after "timestep_9" (lexical sort would not).
    timestep_keys = sorted(
        (k for k in episode_data.keys() if k.startswith("timestep_")),
        key=lambda k: int(k.split("_")[1]),
    )

    actions: list[Union[np.ndarray, Dict[str, Any]]] = []
    # Last accepted waypoint; used to drop adjacent duplicates.
    prev_waypoint: np.ndarray | None = None

    for key in timestep_keys:
        ts = episode_data[key]
        # Demo-video frames carry no executable action.
        if _is_video_demo(ts):
            continue

        action_grp = ts.get("action")
        if action_grp is None:
            continue

        if action_space_type == "joint_angle":
            if "joint_action" not in action_grp:
                continue
            actions.append(np.asarray(action_grp["joint_action"][()], dtype=np.float32))

        elif action_space_type == "ee_pose":
            if "eef_action" not in action_grp:
                continue
            actions.append(np.asarray(action_grp["eef_action"][()], dtype=np.float32))

        elif action_space_type == "waypoint":
            if "waypoint_action" not in action_grp:
                continue
            wa = np.asarray(action_grp["waypoint_action"][()], dtype=np.float32).flatten()
            # Only well-formed 7-element, all-finite waypoints are kept.
            if wa.shape != (7,) or not np.all(np.isfinite(wa)):
                continue
            # Remove adjacent duplicates
            if prev_waypoint is None or not np.array_equal(wa, prev_waypoint):
                actions.append(wa)
                prev_waypoint = wa.copy()

        elif action_space_type == "multi_choice":
            if not _is_subgoal_boundary(ts):
                continue
            if "choice_action" not in action_grp:
                continue
            raw = _decode_h5_str(action_grp["choice_action"][()])
            try:
                payload = json.loads(raw)
            except (TypeError, ValueError, json.JSONDecodeError):
                # Malformed JSON payloads are silently skipped.
                continue
            if not isinstance(payload, dict):
                continue
            choice = payload.get("choice")
            if not isinstance(choice, str) or not choice.strip():
                continue
            # "point" must be present, but its value (even None) is passed through.
            if "point" not in payload:
                continue
            actions.append({"choice": choice, "point": payload.get("point")})

        else:
            raise ValueError(f"Unknown action space type: {action_space_type}")

    return actions
171
+
172
+
173
def _save_video(
    frames: list[np.ndarray],
    task_id: str,
    episode_idx: int,
    task_goal: str,
    outcome: str,
    action_space_type: str,
) -> Path:
    """Write frames as an MP4 under REPLAY_VIDEO_DIR/<action_space_type>.

    Args:
        frames: Sequence of uint8 RGB frames.
        task_id: Benchmark task id, embedded in the file name.
        episode_idx: Episode number, embedded in the file name.
        task_goal: Free-text goal string, embedded in the file name.
        outcome: Replay outcome label (e.g. "success"/"fail"/"unknown").
        action_space_type: Subdirectory name grouping videos by action space.

    Returns:
        The path of the written video file.
    """
    video_dir = Path(REPLAY_VIDEO_DIR) / action_space_type
    video_dir.mkdir(parents=True, exist_ok=True)
    # task_goal is free text; strip path separators so the name stays a single
    # file inside video_dir instead of pointing into (usually non-existent)
    # nested directories, which made imageio fail.
    safe_goal = task_goal.replace("/", "_").replace("\\", "_")
    name = f"{outcome}_{task_id}_ep{episode_idx}_{safe_goal}.mp4"
    path = video_dir / name
    imageio.mimsave(str(path), frames, fps=VIDEO_FPS)
    return path
187
+
188
+
189
def _get_episode_indices(data: h5py.File) -> list[int]:
    """Episode numbers present in the file, in ascending order."""
    indices = [
        int(key.split("_")[1])
        for key in data.keys()
        if key.startswith("episode_")
    ]
    indices.sort()
    return indices
195
+
196
+
197
def process_episode(
    env_data: h5py.File,
    episode_idx: int,
    task_id: str,
    action_space_type: ActionSpaceType,
) -> None:
    """Replay one episode from HDF5 data, record frames, and save a video.

    Args:
        env_data: Open HDF5 file containing episode_<n> groups.
        episode_idx: Episode number to replay.
        task_id: Benchmark task id (also used in the saved video name).
        action_space_type: Which recorded action stream to replay.
    """
    episode_data = env_data[f"episode_{episode_idx}"]
    task_goal = episode_data["setup"]["task_goal"][()][0].decode()
    action_sequence = _build_action_sequence(episode_data, action_space_type)

    env = BenchmarkEnvBuilder(
        env_id=task_id,
        dataset="train",
        action_space=action_space_type,
        gui_render=GUI_RENDER,
    ).make_env_for_episode(episode_idx)

    print(f"\nTask: {task_id}, Episode: {episode_idx}, ",
          f"Seed: {env.unwrapped.seed}, Difficulty: {env.unwrapped.difficulty}")
    print(f"Task goal: {task_goal}")
    print(f"Total actions after dedup: {len(action_sequence)}")

    obs, _ = env.reset()
    # Reset frames: all but the last are demo-video frames (get a border);
    # the last one is the live observation.
    frames = _extract_frames(
        obs, is_video_demo_fn=lambda i, n=len(obs["front_rgb_list"]): i < n - 1
    )

    outcome = "unknown"
    for seq_idx, action in enumerate(action_sequence):
        try:
            obs, _, terminated, truncated, info = env.step(action)
            frames.extend(_extract_frames(obs))
        except Exception as e:
            # Best-effort replay: keep the frames collected so far and stop.
            print(f"Error at seq_idx {seq_idx}: {e}")
            break

        if GUI_RENDER:
            env.render()
        # Only reachable when env.step succeeded, so terminated/truncated/info
        # are always bound here.
        if terminated or truncated:
            outcome = info.get("status", "unknown")
            print(f"Outcome: {outcome}")
            break

    env.close()
    path = _save_video(frames, task_id, episode_idx, task_goal, outcome, action_space_type)
    print(f"Saved video to {path}\n")
244
+
245
+
246
def replay(
    h5_data_dir: str = "/data/hongzefu/data_0226",
    action_space_type: ActionSpaceType = "ee_pose",
    replay_number: int = 10,
) -> None:
    """Replay episodes from HDF5 dataset files and save rollout videos."""
    #for task_id in BenchmarkEnvBuilder.get_task_list():
    for task_id in ["VideoUnmaskSwap"]:
        file_path = Path(h5_data_dir) / f"record_dataset_{task_id}.h5"
        if not file_path.exists():
            print(f"Skipping {task_id}: file not found: {file_path}")
            continue

        with h5py.File(file_path, "r") as data:
            # Slicing already caps at the list length, so no explicit min().
            for episode_idx in _get_episode_indices(data)[:replay_number]:
                process_episode(data, episode_idx, task_id, action_space_type)
264
+
265
+
266
if __name__ == "__main__":
    import tyro
    # tyro turns replay()'s keyword arguments into command-line flags.
    tyro.cli(replay)
scripts/dev/compare_multi_choice_readers.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Compare how v3 and v4 replay pipelines read multi_choice actions.
3
+
4
+ v3 source:
5
+ - EpisodeDatasetResolver.get_step("multi_choice", step)
6
+
7
+ v4-noresolver source:
8
+ - scripts.dataset_replay._build_action_sequence(..., "multi_choice")
9
+ - then _parse_oracle_command() in replay loop
10
+ """
11
+
12
+ import argparse
13
+ import importlib.util
14
+ import json
15
+ import re
16
+ import sys
17
+ from pathlib import Path
18
+ from typing import Any, Optional
19
+
20
+ import h5py
21
+ import numpy as np
22
+
23
# Make both the repository root and its src/ layout importable when this
# script is run directly (i.e. not installed as a package).
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
SRC_ROOT = REPO_ROOT / "src"
if str(SRC_ROOT) not in sys.path:
    sys.path.insert(0, str(SRC_ROOT))
29
+
30
+
31
def _load_episode_dataset_resolver_cls():
    """Load EpisodeDatasetResolver directly from its source file.

    Importing by file path keeps this comparison script from pulling in the
    full robomme package import chain.
    """
    resolver_path = SRC_ROOT / "robomme" / "env_record_wrapper" / "episode_dataset_resolver.py"
    spec = importlib.util.spec_from_file_location(
        "episode_dataset_resolver_direct",
        resolver_path,
    )
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to load resolver module from {resolver_path}")

    loaded_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded_module)

    loaded_cls = getattr(loaded_module, "EpisodeDatasetResolver", None)
    if loaded_cls is None:
        raise RuntimeError(f"EpisodeDatasetResolver not found in {resolver_path}")
    return loaded_cls
45
+
46
+
47
# Resolver class is loaded once at import time; used by _read_v3_commands.
EpisodeDatasetResolver = _load_episode_dataset_resolver_cls()

# Defaults for the CLI flags defined in main().
DEFAULT_ENV_ID = "PatternLock"
DEFAULT_DATASET_ROOT = "/data/hongzefu/data_0226-test"
51
+
52
+
53
+ def _parse_oracle_command_v4(choice_action: Optional[Any]) -> Optional[dict[str, Any]]:
54
+ """Exact validation logic used in evaluate_dataset_replay-parallelv4-noresolver.py."""
55
+ if not isinstance(choice_action, dict):
56
+ return None
57
+ choice = choice_action.get("choice")
58
+ if not isinstance(choice, str) or not choice.strip():
59
+ return None
60
+ point = choice_action.get("point")
61
+ if not isinstance(point, (list, tuple, np.ndarray)) or len(point) != 2:
62
+ return None
63
+ return choice_action
64
+
65
+
66
def _is_video_demo_v4(ts: h5py.Group) -> bool:
    """True when this timestep's info flags it as part of the demo video."""
    info = ts.get("info")
    if info is not None and "is_video_demo" in info:
        flat = np.asarray(info["is_video_demo"][()]).reshape(-1)
        return bool(flat[0])
    return False
71
+
72
+
73
def _is_subgoal_boundary_v4(ts: h5py.Group) -> bool:
    """True when this timestep's info flags it as a subgoal boundary."""
    info = ts.get("info")
    if info is not None and "is_subgoal_boundary" in info:
        flat = np.asarray(info["is_subgoal_boundary"][()]).reshape(-1)
        return bool(flat[0])
    return False
78
+
79
+
80
+ def _decode_h5_str_v4(raw: Any) -> str:
81
+ if isinstance(raw, np.ndarray):
82
+ raw = raw.flatten()[0]
83
+ if isinstance(raw, (bytes, np.bytes_)):
84
+ raw = raw.decode("utf-8")
85
+ return raw
86
+
87
+
88
def _build_multi_choice_sequence_v4(episode_data: h5py.Group) -> list[Any]:
    """
    Re-implementation of dataset_replay._build_action_sequence(..., "multi_choice")
    without importing cv2/imageio/torch dependencies.
    """
    # Numeric sort so "timestep_10" follows "timestep_9".
    step_keys = [k for k in episode_data.keys() if k.startswith("timestep_")]
    step_keys.sort(key=lambda k: int(k.split("_")[1]))

    commands: list[Any] = []
    for key in step_keys:
        ts = episode_data[key]
        if _is_video_demo_v4(ts):
            continue

        action_grp = ts.get("action")
        if action_grp is None:
            continue
        if not _is_subgoal_boundary_v4(ts):
            continue
        if "choice_action" not in action_grp:
            continue

        raw = _decode_h5_str_v4(action_grp["choice_action"][()])
        try:
            commands.append(json.loads(raw))
        except (TypeError, ValueError, json.JSONDecodeError):
            # Malformed payloads are skipped, matching the v4 replay script.
            continue
    return commands
118
+
119
+
120
+ def _resolve_h5_path(env_id: str, dataset_root: Optional[str], h5_path: Optional[str]) -> Path:
121
+ if h5_path:
122
+ return Path(h5_path)
123
+ if not dataset_root:
124
+ raise ValueError("Either --h5_path or --dataset_root must be provided")
125
+ return Path(dataset_root) / f"record_dataset_{env_id}.h5"
126
+
127
+
128
def _episode_indices(data: h5py.File) -> list[int]:
    """Numeric ids of keys shaped exactly like episode_<n>, ascending."""
    pattern = re.compile(r"episode_(\d+)$")
    found = []
    for key in data.keys():
        match = pattern.match(key)
        if match:
            found.append(int(match.group(1)))
    return sorted(found)
135
+
136
+
137
+ def _parse_episode_filter(raw: Optional[str], all_eps: list[int]) -> list[int]:
138
+ if not raw:
139
+ return all_eps
140
+
141
+ selected: set[int] = set()
142
+ for token in [x.strip() for x in raw.split(",") if x.strip()]:
143
+ if "-" in token:
144
+ lo_s, hi_s = token.split("-", 1)
145
+ lo = int(lo_s)
146
+ hi = int(hi_s)
147
+ if lo > hi:
148
+ lo, hi = hi, lo
149
+ selected.update(range(lo, hi + 1))
150
+ else:
151
+ selected.add(int(token))
152
+
153
+ return [ep for ep in all_eps if ep in selected]
154
+
155
+
156
+ def _canonical_command(cmd: Any) -> str:
157
+ """Stable string form for diffing and readable output."""
158
+ try:
159
+ return json.dumps(cmd, ensure_ascii=False, sort_keys=True)
160
+ except TypeError:
161
+ if isinstance(cmd, dict):
162
+ safe = {
163
+ str(k): (v.tolist() if isinstance(v, np.ndarray) else v)
164
+ for k, v in cmd.items()
165
+ }
166
+ return json.dumps(safe, ensure_ascii=False, sort_keys=True)
167
+ return repr(cmd)
168
+
169
+
170
def _read_v4_commands(episode_group: h5py.Group) -> tuple[list[Any], list[dict[str, Any]], int]:
    """Return (raw commands, commands surviving v4 validation, skip count)."""
    raw_list = _build_multi_choice_sequence_v4(episode_group)
    parsed_list: list[dict[str, Any]] = [
        parsed
        for parsed in (_parse_oracle_command_v4(item) for item in raw_list)
        if parsed is not None
    ]
    # Every raw item either parses or is skipped, so the skip count is the gap.
    skipped = len(raw_list) - len(parsed_list)
    return raw_list, parsed_list, skipped
183
+
184
+
185
def _read_v3_commands(env_id: str, episode: int, dataset_ref: str) -> list[dict[str, Any]]:
    """Pull multi_choice commands step by step via the v3 resolver."""
    commands: list[dict[str, Any]] = []
    with EpisodeDatasetResolver(
        env_id=env_id,
        episode=episode,
        dataset_directory=dataset_ref,
    ) as resolver:
        step = 0
        cmd = resolver.get_step("multi_choice", step)
        # A None step marks the end of the recorded sequence.
        while cmd is not None:
            if isinstance(cmd, dict):
                commands.append(cmd)
            step += 1
            cmd = resolver.get_step("multi_choice", step)
    return commands
201
+
202
+
203
def compare_episode(
    env_id: str,
    episode: int,
    episode_group: h5py.Group,
    dataset_ref: str,
    max_show: int,
) -> None:
    """Print a per-episode diff between v4 and v3 multi_choice command reads.

    Args:
        env_id: Task id, forwarded to the v3 resolver.
        episode: Episode number (for the resolver and the report header).
        episode_group: The episode_<n> HDF5 group read by the v4 path.
        dataset_ref: .h5 path or directory accepted by EpisodeDatasetResolver.
        max_show: Cap on printed diff rows and on each sample listing.
    """
    v4_raw, v4_effective, v4_skipped = _read_v4_commands(episode_group)
    v3_resolver = _read_v3_commands(env_id=env_id, episode=episode, dataset_ref=dataset_ref)

    print(f"\n=== episode_{episode} ===")
    print(
        "counts: "
        f"v4_raw={len(v4_raw)}, "
        f"v4_effective={len(v4_effective)} (skipped_by_parse={v4_skipped}), "
        f"v3_resolver={len(v3_resolver)}"
    )

    # Compare canonical JSON strings so dict key order / ndarray values
    # cannot cause spurious mismatches.
    v4_effective_c = [_canonical_command(x) for x in v4_effective]
    v3_c = [_canonical_command(x) for x in v3_resolver]

    if v4_effective_c == v3_c:
        print("effective sequence compare: SAME")
    else:
        print("effective sequence compare: DIFFERENT")
        max_len = max(len(v4_effective_c), len(v3_c))
        shown = 0
        for idx in range(max_len):
            # Positions past one side's length show as <MISSING>.
            left = v4_effective_c[idx] if idx < len(v4_effective_c) else "<MISSING>"
            right = v3_c[idx] if idx < len(v3_c) else "<MISSING>"
            if left == right:
                continue
            print(f" idx={idx}")
            print(f" v4_effective: {left}")
            print(f" v3_resolver : {right}")
            shown += 1
            if shown >= max_show:
                remaining = max_len - idx - 1
                if remaining > 0:
                    print(f" ... more differences omitted ({remaining} remaining positions)")
                break

    print(f"sample v4_raw (first {max_show}):")
    for i, item in enumerate(v4_raw[:max_show]):
        print(f" [{i}] {_canonical_command(item)}")

    print(f"sample v4_effective (first {max_show}):")
    for i, item in enumerate(v4_effective[:max_show]):
        print(f" [{i}] {_canonical_command(item)}")

    print(f"sample v3_resolver (first {max_show}):")
    for i, item in enumerate(v3_resolver[:max_show]):
        print(f" [{i}] {_canonical_command(item)}")
256
+
257
+
258
def main() -> None:
    """CLI entry: diff v3-resolver vs v4-noresolver multi_choice reads.

    Parses flags, resolves the .h5 file, selects episodes, and prints a
    per-episode comparison report via compare_episode().
    """
    parser = argparse.ArgumentParser(
        description=(
            "Compare multi_choice read results between "
            "evaluate_dataset_replay-parallelv3 and parallelv4-noresolver."
        )
    )
    parser.add_argument(
        "--env_id",
        type=str,
        default=DEFAULT_ENV_ID,
        help=f"Task/env id. Default: {DEFAULT_ENV_ID}",
    )
    parser.add_argument(
        "--dataset_root",
        type=str,
        default=DEFAULT_DATASET_ROOT,
        help=(
            "Directory that contains record_dataset_<env_id>.h5. "
            f"Default: {DEFAULT_DATASET_ROOT}"
        ),
    )
    parser.add_argument(
        "--h5_path",
        type=str,
        default=None,
        help="Direct path to .h5 file (overrides --dataset_root)",
    )
    parser.add_argument(
        "--episodes",
        type=str,
        # Fix: the default used to be the int 0 on a type=str option
        # (argparse does not convert defaults). It only behaved as
        # "all episodes" because 0 is falsy in _parse_episode_filter.
        # None is equally falsy and matches the declared type.
        default=None,
        help="Episode filter, e.g. '0,3,8-10'. Default: all episodes in h5",
    )
    parser.add_argument(
        "--max_show",
        type=int,
        default=50,
        help="Max number of diff/sample rows per episode",
    )
    args = parser.parse_args()

    h5_file = _resolve_h5_path(args.env_id, args.dataset_root, args.h5_path)
    if not h5_file.exists():
        raise FileNotFoundError(f"h5 file not found: {h5_file}")

    # The v3 resolver accepts either a direct .h5 path or its parent directory.
    dataset_ref = str(h5_file) if h5_file.suffix == ".h5" else str(h5_file.parent)

    print(f"env_id={args.env_id}")
    print(f"h5={h5_file}")

    with h5py.File(h5_file, "r") as data:
        all_eps = _episode_indices(data)
        selected_eps = _parse_episode_filter(args.episodes, all_eps)

        if not selected_eps:
            print("No episodes selected.")
            return

        print(f"episodes={selected_eps}")
        for ep in selected_eps:
            key = f"episode_{ep}"
            if key not in data:
                print(f"\n=== episode_{ep} ===")
                print("missing in h5, skip")
                continue
            compare_episode(
                env_id=args.env_id,
                episode=ep,
                episode_group=data[key],
                dataset_ref=dataset_ref,
                max_show=args.max_show,
            )
331
+
332
+
333
# Script entry point.
if __name__ == "__main__":
    main()
scripts/dev/dataset_replay_printType.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Script function: Unified dataset replay entry point, supporting 4 action spaces: joint_angle / ee_pose / waypoint / multi_choice.
3
+ # Consistent with subgoal_evaluate_func.py main loop; difference is actions come from EpisodeDatasetResolver.
4
+
5
+ import os
6
+ from typing import Any, Optional
7
+
8
+ import numpy as np
9
+ import torch
10
+
11
+ from robomme.robomme_env import *
12
+ from robomme.robomme_env.utils import *
13
+ from robomme.env_record_wrapper import (
14
+ BenchmarkEnvBuilder,
15
+ EpisodeDatasetResolver,
16
+ )
17
+
18
+ # Only enable one ACTION_SPACE; others are commented out for manual switching
19
+ #ACTION_SPACE = "joint_angle"
20
+ ACTION_SPACE = "waypoint"
21
+
22
+
23
+ GUI_RENDER = False
24
+
25
+ DATASET_ROOT = "/data/hongzefu/data_0225"
26
+
27
+ DEFAULT_ENV_IDS = [
28
+ # "PickXtimes",
29
+ # "StopCube",
30
+ # "SwingXtimes",
31
+ #"BinFill",
32
+ "VideoUnmaskSwap",
33
+ # "VideoUnmask",
34
+ # "ButtonUnmaskSwap",
35
+ # "ButtonUnmask",
36
+ #"VideoRepick",
37
+ # "VideoPlaceButton",
38
+ # "VideoPlaceOrder",
39
+ # "PickHighlight",
40
+ # "InsertPeg",
41
+ # "MoveCube",
42
+ # "PatternLock",
43
+ # "RouteStick",
44
+ ]
45
+
46
+ MAX_STEPS = 1000
47
+
48
+
49
+ def _describe(value: Any, indent: int = 0) -> str:
50
+ """Recursively describe a value's type, shape, and content summary."""
51
+ prefix = " " * indent
52
+ if isinstance(value, torch.Tensor):
53
+ return f"{prefix}Tensor dtype={value.dtype} shape={tuple(value.shape)} device={value.device}"
54
+ elif isinstance(value, np.ndarray):
55
+ return f"{prefix}ndarray dtype={value.dtype} shape={value.shape}"
56
+ elif isinstance(value, list):
57
+ if len(value) == 0:
58
+ return f"{prefix}list[] (empty)"
59
+ lines = [f"{prefix}list[{len(value)}]"]
60
+ for i, item in enumerate(value):
61
+ lines.append(f"{prefix} [{i}]: {_describe(item, 0)}")
62
+ if i >= 2:
63
+ lines.append(f"{prefix} ... (only first 3 shown)")
64
+ break
65
+ return "\n".join(lines)
66
+ elif isinstance(value, dict):
67
+ lines = [f"{prefix}dict keys={list(value.keys())}"]
68
+ for k, v in value.items():
69
+ lines.append(f"{prefix} '{k}': {_describe(v, 0)}")
70
+ return "\n".join(lines)
71
+ elif isinstance(value, (int, float, bool, str)):
72
+ return f"{prefix}{type(value).__name__} value={repr(value)}"
73
+ elif value is None:
74
+ return f"{prefix}None"
75
+ else:
76
+ return f"{prefix}{type(value).__name__} repr={repr(value)[:80]}"
77
+
78
+
79
def _print_obs(obs: dict, tag: str):
    """Print data formats of all fields in the obs dict.

    Args:
        obs: Columnar observation dict produced by env.reset()/env.step().
        tag: Label printed in the section header (env/episode/step id).

    Returns:
        The dict of fields that were printed (name -> value), excluding
        maniskill_obs.
    """
    print(f"\n{'='*60}")
    print(f"[{tag}] obs fields:")
    print(f"{'='*60}")
    # maniskill_obs not printed (data volume is large)
    _ = obs["maniskill_obs"]
    # Each obs[...] access below doubles as a presence check: a missing key
    # raises KeyError here rather than later.
    front_rgb_list = obs["front_rgb_list"]
    wrist_rgb_list = obs["wrist_rgb_list"]
    front_depth_list = obs["front_depth_list"]
    wrist_depth_list = obs["wrist_depth_list"]
    end_effector_pose_raw = obs["end_effector_pose_raw"]
    eef_state_list = obs["eef_state_list"]
    joint_state_list = obs["joint_state_list"]

    gripper_state_list = obs["gripper_state_list"]
    front_camera_extrinsic_list = obs["front_camera_extrinsic_list"]
    wrist_camera_extrinsic_list = obs["wrist_camera_extrinsic_list"]

    fields = {
        "front_rgb_list": front_rgb_list,
        "wrist_rgb_list": wrist_rgb_list,
        "front_depth_list": front_depth_list,
        "wrist_depth_list": wrist_depth_list,
        "end_effector_pose_raw": end_effector_pose_raw,
        "eef_state_list": eef_state_list,
        "joint_state_list": joint_state_list,

        "gripper_state_list": gripper_state_list,
        "front_camera_extrinsic_list": front_camera_extrinsic_list,
        "wrist_camera_extrinsic_list": wrist_camera_extrinsic_list,
    }
    for name, val in fields.items():
        print(f" obs['{name}']:")
        print(_describe(val, indent=2))
    return fields
115
+
116
+
117
def _print_info(info: dict, tag: str):
    """Print data formats of all fields in the info dict.

    Args:
        info: Flat info dict produced by env.reset()/env.step().
        tag: Label printed in the section header (env/episode/step id).

    Returns:
        The dict of fields that were printed (name -> value).
    """
    print(f"\n[{tag}] info fields:")
    print(f"{'-'*60}")
    # Direct [] access asserts these keys exist; .get() is used for the two
    # fields that may legitimately be absent.
    task_goal = info["task_goal"]
    simple_subgoal_online = info["simple_subgoal_online"]
    grounded_subgoal_online = info["grounded_subgoal_online"]
    available_multi_choices = info.get("available_multi_choices")
    front_camera_intrinsic = info["front_camera_intrinsic"]
    wrist_camera_intrinsic = info["wrist_camera_intrinsic"]
    status = info.get("status")

    fields = {
        "task_goal": task_goal,
        "simple_subgoal_online": simple_subgoal_online,
        "grounded_subgoal_online": grounded_subgoal_online,
        "available_multi_choices": available_multi_choices,
        "front_camera_intrinsic": front_camera_intrinsic,
        "wrist_camera_intrinsic": wrist_camera_intrinsic,
        "status": status,
    }
    for name, val in fields.items():
        print(f" info['{name}']:")
        print(_describe(val, indent=2))
    return fields
143
+
144
def _print_step_extras(reward, terminated, truncated, tag: str):
    """Print data formats of reward / terminated / truncated."""
    print(f"\n[{tag}] reward / terminated / truncated:")
    print(f"{'-'*60}")
    extras = (
        ("reward", reward),
        ("terminated", terminated),
        ("truncated", truncated),
    )
    for label, val in extras:
        print(f" {label}: {_describe(val, 0)}")
152
+
153
+ def _parse_oracle_command(choice_action: Optional[Any]) -> Optional[dict[str, Any]]:
154
+ if not isinstance(choice_action, dict):
155
+ return None
156
+ choice = choice_action.get("choice")
157
+ if not isinstance(choice, str) or not choice.strip():
158
+ return None
159
+ point = choice_action.get("point")
160
+ if not isinstance(point, (list, tuple, np.ndarray)) or len(point) != 2:
161
+ return None
162
+ return choice_action
163
+
164
+
165
def main():
    """Replay every episode of each selected env and print the data formats
    (dtype/shape/device) of obs, info, reward, terminated, truncated at reset
    and at every step. Actions come from EpisodeDatasetResolver.
    """
    env_id_list = BenchmarkEnvBuilder.get_task_list()
    print(f"Running envs: {env_id_list}")
    print(f"Using action_space: {ACTION_SPACE}")

    #for env_id in env_id_list:
    for env_id in DEFAULT_ENV_IDS:
        env_builder = BenchmarkEnvBuilder(
            env_id=env_id,
            dataset="train",
            action_space=ACTION_SPACE,
            gui_render=GUI_RENDER,
        )
        episode_count = env_builder.get_episode_num()
        print(f"[{env_id}] episode_count from metadata: {episode_count}")

        env = None
        for episode in range(episode_count):

            # Enable every optional observation/info field so _print_obs /
            # _print_info can report the full schema.
            env = env_builder.make_env_for_episode(
                episode,
                max_steps=MAX_STEPS,
                include_maniskill_obs=True,
                include_front_depth=True,
                include_wrist_depth=True,
                include_front_camera_extrinsic=True,
                include_wrist_camera_extrinsic=True,
                include_available_multi_choices=True,
                include_front_camera_intrinsic=True,
                include_wrist_camera_intrinsic=True,
            )
            dataset_resolver = EpisodeDatasetResolver(
                env_id=env_id,
                episode=episode,
                dataset_directory=DATASET_ROOT,
            )

            # obs: dict-of-lists (columnar batch, list length = number of demo frames)
            # info: flat dict (last frame values only)
            obs, info = env.reset()

            # --- Print all obs / info field types (reset) ---
            _print_obs(obs, tag=f"{env_id} ep{episode} RESET")
            _print_info(info, tag=f"{env_id} ep{episode} RESET")

            step = 0
            # NOTE(review): episode_success is set below but never read after
            # the loop — looks like leftover bookkeeping; confirm before removing.
            episode_success = False

            # ======== Step loop ========
            while True:
                replay_key = ACTION_SPACE
                # None marks the end of the recorded action sequence.
                action = dataset_resolver.get_step(replay_key, step)
                if ACTION_SPACE == "multi_choice":
                    action = _parse_oracle_command(action)
                if action is None:
                    break

                # step returns: obs (dict-of-lists), reward (scalar tensor),
                # terminated (scalar tensor), truncated (scalar tensor), info (flat dict)
                obs, reward, terminated, truncated, info = env.step(action)

                # --- Print all obs / info / reward / terminated / truncated field types (step) ---
                _print_obs(obs, tag=f"{env_id} ep{episode} STEP{step}")
                _print_info(info, tag=f"{env_id} ep{episode} STEP{step}")
                _print_step_extras(reward, terminated, truncated, tag=f"{env_id} ep{episode} STEP{step}")

                terminated_flag = bool(terminated.item())
                truncated_flag = bool(truncated.item())

                step += 1
                if GUI_RENDER:
                    env.render()
                if truncated_flag:
                    print(f"[{env_id}] episode {episode} steps exceeded, step {step}.")
                    break
                if terminated_flag:
                    status = info.get("status")
                    if status == "success":
                        print(f"[{env_id}] episode {episode} success.")
                        episode_success = True
                    elif status == "fail":
                        print(f"[{env_id}] episode {episode} failed.")
                    break

        # Only the last created env is closed; earlier per-episode envs are
        # assumed to be reusable/replaced by make_env_for_episode.
        if env is not None:
            env.close()
251
+
252
+
253
# Script entry point.
if __name__ == "__main__":
    main()
scripts/dev/deprecated/dataset_replay-FK-parallel.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Replay episodes from an HDF5 dataset and save videos.
3
+
4
+ Read recorded joint actions (joint_action) from record_dataset_<Task>.h5,
5
+ convert them to end-effector pose actions (EE pose actions) via forward kinematics (FK),
6
+ replay them in an environment wrapped by EE_POSE_ACTION_SPACE,
7
+ and finally save side-by-side front/wrist camera videos to disk.
8
+ """
9
+
10
+ import os
11
+ from typing import Optional, Tuple
12
+
13
+ import cv2
14
+ import h5py
15
+ import imageio
16
+ import numpy as np
17
+ import sapien
18
+ import torch
19
+
20
+ from mani_skill.examples.motionplanning.panda.motionplanner import (
21
+ PandaArmMotionPlanningSolver,
22
+ )
23
+
24
+ from robomme.robomme_env import *
25
+ from robomme.robomme_env.utils import *
26
+ from robomme.env_record_wrapper import BenchmarkEnvBuilder
27
+ from robomme.robomme_env.utils import EE_POSE_ACTION_SPACE
28
+ from robomme.robomme_env.utils.rpy_util import build_endeffector_pose_dict
29
+
30
+ # --- Configuration ---
31
+ GUI_RENDER = False
32
+ REPLAY_VIDEO_DIR = "replay_videos"
33
+ VIDEO_FPS = 30
34
+ MAX_STEPS = 1000
35
+
36
+
37
def _init_fk_planner(env) -> Tuple:
    """Create PandaArmMotionPlanningSolver and return helper objects needed for FK.

    Must be called after env.reset() (see the call site in process_episode),
    since it reads the robot pose from the live environment.

    Returns:
        (mplib_planner, ee_link_idx, robot_base_pose)
        - mplib_planner: mplib.Planner instance used for FK computation
        - ee_link_idx: end-effector link index in the pinocchio model
        - robot_base_pose: robot base pose in world coordinates
    """
    solver = PandaArmMotionPlanningSolver(
        env,
        debug=False,
        vis=False,
        base_pose=env.unwrapped.agent.robot.pose,
        visualize_target_grasp_pose=False,
        print_env_info=False,
    )
    mplib_planner = solver.planner
    # The move_group link is the FK target (the end-effector frame).
    ee_link_idx = mplib_planner.link_name_2_idx[mplib_planner.move_group]
    robot_base_pose = env.unwrapped.agent.robot.pose

    print(f"[FK] move_group: {mplib_planner.move_group}, "
          f"ee_link_idx: {ee_link_idx}, "
          f"link_names: {mplib_planner.user_link_names}")
    return mplib_planner, ee_link_idx, robot_base_pose
62
+
63
+
64
def _joint_action_to_ee_pose(
    mplib_planner,
    joint_action: np.ndarray,
    robot_base_pose: sapien.Pose,
    ee_link_idx: int,
    prev_ee_quat_wxyz: Optional[torch.Tensor] = None,
    prev_ee_rpy_xyz: Optional[torch.Tensor] = None,
) -> Tuple[np.ndarray, torch.Tensor, torch.Tensor]:
    """Convert 8D joint action to 7D end-effector pose action via forward kinematics (FK).

    Args:
        mplib_planner: mplib.Planner instance (from PandaArmMotionPlanningSolver).
        joint_action: 8D array [q1..q7, gripper].
        robot_base_pose: robot base pose as a Sapien Pose.
        ee_link_idx: end-effector link index in the pinocchio model.
        prev_ee_quat_wxyz: previous-frame quaternion cache (for sign alignment).
        prev_ee_rpy_xyz: previous-frame RPY cache (for continuity unwrapping).

    Returns:
        ee_action: 7D [x, y, z, roll, pitch, yaw, gripper].
        new_prev_quat: updated quaternion cache.
        new_prev_rpy: updated RPY cache.
    """
    action = np.asarray(joint_action, dtype=np.float64).flatten()
    arm_qpos = action[:7]
    # Missing gripper entry defaults to -1.0 (treated as "open" below).
    gripper = float(action[7]) if action.size > 7 else -1.0

    # Build full qpos: 7 arm joints + 2 gripper finger joints
    # NOTE(review): assumes a 9-DoF Panda layout (7 arm + 2 mirrored fingers);
    # 0.04 appears to be the fully-open finger position — confirm against the
    # robot model.
    finger_pos = max(gripper, 0.0) if gripper >= 0 else 0.04
    full_qpos = np.concatenate([arm_qpos, [finger_pos, finger_pos]])

    # Compute forward kinematics in the robot-base coordinate frame
    pmodel = mplib_planner.pinocchio_model
    pmodel.compute_forward_kinematics(full_qpos)
    fk_result = pmodel.get_link_pose(ee_link_idx)  # 7D [x,y,z, qw,qx,qy,qz]

    p_base = fk_result[:3]
    q_base_wxyz = fk_result[3:]  # wxyz quaternion format

    # base frame -> world frame transform
    pose_in_base = sapien.Pose(p_base, q_base_wxyz)
    world_pose = robot_base_pose * pose_in_base

    # Use shared utilities to build continuous RPY (quaternion normalization, sign alignment, RPY unwrapping)
    position_t = torch.as_tensor(
        np.asarray(world_pose.p, dtype=np.float64), dtype=torch.float64
    )
    quat_wxyz_t = torch.as_tensor(
        np.asarray(world_pose.q, dtype=np.float64), dtype=torch.float64
    )
    pose_dict, new_prev_quat, new_prev_rpy = build_endeffector_pose_dict(
        position_t, quat_wxyz_t,
        prev_ee_quat_wxyz, prev_ee_rpy_xyz,
    )

    # Concatenate into 7D EE pose action: [position(3), RPY(3), gripper(1)]
    pos_np = pose_dict["pose"].detach().cpu().numpy().flatten()[:3]
    rpy_np = pose_dict["rpy"].detach().cpu().numpy().flatten()[:3]
    ee_action = np.concatenate([pos_np, rpy_np, [gripper]]).astype(np.float64)

    return ee_action, new_prev_quat, new_prev_rpy
125
+
126
+
127
def _frame_from_obs(obs: dict, is_video_frame: bool = False) -> np.ndarray:
    """Build one side-by-side frame from front and wrist camera observations."""
    views = [obs[name][0].cpu().numpy() for name in ("front_camera", "wrist_camera")]
    frame = np.concatenate(views, axis=1).astype(np.uint8)
    if not is_video_frame:
        return frame
    # Mark video-demo frames with a red border
    return cv2.rectangle(
        frame, (0, 0), (frame.shape[1], frame.shape[0]), (255, 0, 0), 10
    )
138
+
139
+
140
+ def _first_execution_step(episode_data) -> int:
141
+ """Return the first non-video-demo step index (actual execution start step)."""
142
+ step_idx = 0
143
+ while episode_data[f"timestep_{step_idx}"]["info"]["is_video_demo"][()]:
144
+ step_idx += 1
145
+ return step_idx
146
+
147
+
148
def process_episode(
    h5_file_path: str, episode_idx: int, env_id: str, gui_render: bool = False,
) -> None:
    """Replay one episode in HDF5: read joint actions, run FK conversion, execute the environment, and save video.

    Each worker process opens the HDF5 file independently to avoid cross-process shared file handles.

    Args:
        h5_file_path: Path to the record_dataset_<Task>.h5 file.
        episode_idx: Index of the episode_<idx> group to replay.
        env_id: Benchmark task identifier used to build the environment.
        gui_render: Whether to call env.render() every step.
    """

    def _last_flag(value) -> bool:
        """Reduce a possibly nested / batched truth value to one bool.

        Replaces ``info.get(key, False)[-1][-1]``, which raised TypeError
        whenever the key was absent (it indexed the ``False`` default).
        """
        if isinstance(value, torch.Tensor):
            value = value.detach().cpu().numpy()
        try:
            flat = np.asarray(value).reshape(-1)
        except (TypeError, ValueError):
            # Ragged nesting or odd types: follow the last elements, as before.
            while isinstance(value, (list, tuple)) and value:
                value = value[-1]
            return bool(value)
        return bool(flat[-1]) if flat.size else False

    with h5py.File(h5_file_path, "r") as env_data:
        episode_data = env_data[f"episode_{episode_idx}"]
        task_goal = episode_data["setup"]["task_goal"][()].decode()
        total_steps = sum(1 for k in episode_data.keys() if k.startswith("timestep_"))

        step_idx = _first_execution_step(episode_data)
        # Hoisted loop invariant: previously _first_execution_step() was
        # re-scanned on every loop iteration just for the debug print below.
        first_step = step_idx
        print(f"[ep{episode_idx}] execution start step index: {step_idx}")

        # Create environment with EE_POSE_ACTION_SPACE (wrapped by EndeffectorDemonstrationWrapper)
        env_builder = BenchmarkEnvBuilder(
            env_id=env_id,
            dataset="train",
            action_space=EE_POSE_ACTION_SPACE,
            gui_render=gui_render,
        )
        env = env_builder.make_env_for_episode(
            episode_idx,
            max_steps=MAX_STEPS,
            include_maniskill_obs=True,
            include_front_depth=True,
            include_wrist_depth=True,
            include_front_camera_extrinsic=True,
            include_wrist_camera_extrinsic=True,
            include_available_multi_choices=True,
            include_front_camera_intrinsic=True,
            include_wrist_camera_intrinsic=True,
        )
        print(f"[ep{episode_idx}] task: {env_id}, goal: {task_goal}")

        obs, info = env.reset()

        # Initialize FK planner (must be called after env.reset())
        mplib_planner, ee_link_idx, robot_base_pose = _init_fk_planner(env)

        # Observation list: length 1 means no demo video, length >1 means
        # includes demo video; last element is the current frame
        frames = []
        n_obs = len(obs["front_camera"])
        for i in range(n_obs):
            single_obs = {k: [v[i]] for k, v in obs.items()}
            frames.append(_frame_from_obs(single_obs, is_video_frame=(i < n_obs - 1)))
        print(f"[ep{episode_idx}] initial frame count (demo video + current frame): {len(frames)}")

        outcome = "unknown"
        prev_quat: Optional[torch.Tensor] = None
        prev_rpy: Optional[torch.Tensor] = None
        try:
            while step_idx < total_steps:
                # Read joint action from HDF5
                joint_action = np.asarray(
                    episode_data[f"timestep_{step_idx}"]["action"]["joint_action"][()],
                    dtype=np.float64,
                )

                # Forward kinematics: joint_action -> ee_pose action
                ee_action, prev_quat, prev_rpy = _joint_action_to_ee_pose(
                    mplib_planner, joint_action, robot_base_pose, ee_link_idx,
                    prev_ee_quat_wxyz=prev_quat,
                    prev_ee_rpy_xyz=prev_rpy,
                )

                # Print debug info on the first step to verify FK conversion
                if step_idx == first_step:
                    print(f"[ep{episode_idx}][FK] first step joint_action: {joint_action}")
                    print(f"[ep{episode_idx}][FK] first step ee_action: {ee_action}")

                # Execute EE pose action in the environment
                obs, _, terminated, _, info = env.step(ee_action)
                frames.append(_frame_from_obs(obs))

                if gui_render:
                    env.render()

                if terminated:
                    # Robust success/fail extraction — fixes the old
                    # info.get(...)[-1][-1] pattern (see _last_flag above).
                    if _last_flag(info.get("success", False)):
                        outcome = "success"
                    if _last_flag(info.get("fail", False)):
                        outcome = "fail"
                    break
                step_idx += 1
        finally:
            # Always release simulator resources, even on mid-episode failure.
            env.close()

    # Save replay video
    safe_goal = task_goal.replace(" ", "_").replace("/", "_")
    os.makedirs(REPLAY_VIDEO_DIR, exist_ok=True)
    video_name = f"{outcome}_{env_id}_ep{episode_idx}_{safe_goal}_step-{len(frames)}.mp4"
    video_path = os.path.join(REPLAY_VIDEO_DIR, video_name)
    imageio.mimsave(video_path, frames, fps=VIDEO_FPS)
    print(f"[ep{episode_idx}] Video saved to {video_path}")
245
+
246
+
247
+ def _worker_init(gpu_id_queue) -> None:
248
+ """Pool worker initializer that binds a GPU before CUDA initialization.
249
+
250
+ When each worker starts, it takes one GPU ID from the queue and sets env vars,
251
+ ensuring all later CUDA ops in that process run on the assigned GPU.
252
+ """
253
+ gpu_id = gpu_id_queue.get()
254
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
255
+ print(f"[Worker PID {os.getpid()}] bind GPU {gpu_id}")
256
+
257
+
258
def _process_episode_worker(args: Tuple[str, int, str, bool]) -> str:
    """multiprocessing worker entrypoint: unpack one job tuple and replay it.

    Returns a one-line status string ("OK: ..." / "FAIL: ...") so the parent
    pool can summarize results without exceptions crossing process boundaries.
    """
    h5_file_path, episode_idx, env_id, gui_render = args
    gpu_id = os.environ.get("CUDA_VISIBLE_DEVICES", "?")
    try:
        process_episode(h5_file_path, episode_idx, env_id, gui_render=gui_render)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"FAIL: {env_id} ep{episode_idx} (GPU {gpu_id}): {e}"
    return f"OK: {env_id} ep{episode_idx} (GPU {gpu_id})"
269
+
270
+
271
def replay(
    h5_data_dir: str = "/data/hongzefu/data_0214",
    num_workers: int = 20,
    gui_render: bool = False,
    gpu_ids: str = "0,1",
) -> None:
    """Iterate through all task HDF5 files in the given directory and replay multiple episodes per env in parallel.

    Args:
        h5_data_dir: Directory containing HDF5 datasets.
        num_workers: Number of parallel workers per env.
        gui_render: Whether to enable GUI rendering (recommended off in multiprocessing).
        gpu_ids: Comma-separated GPU ID list; workers use them in round-robin order.
            For example, "0,1" alternates assignment between GPU 0 and GPU 1.
    """
    # "spawn" gives each worker a fresh interpreter so CUDA is initialized
    # only after _worker_init has set CUDA_VISIBLE_DEVICES.
    import multiprocessing as mp
    ctx = mp.get_context("spawn")

    gpu_id_list = [int(g.strip()) for g in gpu_ids.split(",")]
    print(f"Using GPUs: {gpu_id_list}, workers: {num_workers}")

    env_id_list = BenchmarkEnvBuilder.get_task_list()
    for env_id in env_id_list:
        file_name = f"record_dataset_{env_id}.h5"
        file_path = os.path.join(h5_data_dir, file_name)
        if not os.path.exists(file_path):
            print(f"Skip {env_id}: file does not exist: {file_path}")
            continue

        # Quickly read episode list and close file
        # (workers re-open the file themselves; the handle is not shared).
        with h5py.File(file_path, "r") as data:
            episode_indices = sorted(
                int(k.split("_")[1])
                for k in data.keys()
                if k.startswith("episode_")
            )
        print(f"task: {env_id}, total {len(episode_indices)} episodes, "
              f"workers: {num_workers}, GPUs: {gpu_id_list}")

        # Build worker argument list (one tuple per episode)
        worker_args = [
            (file_path, ep_idx, env_id, gui_render)
            for ep_idx in episode_indices
        ]

        # Create a new GPU assignment queue for each round; each worker grabs
        # one GPU ID at startup (round-robin over gpu_id_list).
        gpu_id_queue = ctx.Queue()
        for i in range(num_workers):
            gpu_id_queue.put(gpu_id_list[i % len(gpu_id_list)])

        # Parallel replay (initializer binds GPU when each worker starts)
        with ctx.Pool(
            processes=num_workers,
            initializer=_worker_init,
            initargs=(gpu_id_queue,),
        ) as pool:
            results = pool.map(_process_episode_worker, worker_args)

        # Print the per-episode status strings returned by the workers.
        for r in results:
            print(r)
331
+
332
+
333
if __name__ == "__main__":
    import tyro
    # Expose replay()'s keyword arguments as CLI flags via tyro.
    tyro.cli(replay)
scripts/dev/deprecated/dataset_replay-FK.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Replay episodes from an HDF5 dataset and save videos.
3
+
4
+ Read recorded joint actions (joint_action) from record_dataset_<Task>.h5,
5
+ convert them to end-effector pose actions (EE pose actions) via forward kinematics (FK),
6
+ replay them in an environment wrapped by EE_POSE_ACTION_SPACE,
7
+ and finally save side-by-side front/wrist camera videos to disk.
8
+ """
9
+
10
+ import os
11
+ from typing import Optional, Tuple
12
+
13
+ import cv2
14
+ import h5py
15
+ import imageio
16
+ import numpy as np
17
+ import sapien
18
+ import torch
19
+
20
+ from mani_skill.examples.motionplanning.panda.motionplanner import (
21
+ PandaArmMotionPlanningSolver,
22
+ )
23
+
24
+ from robomme.robomme_env import *
25
+ from robomme.robomme_env.utils import *
26
+ from robomme.env_record_wrapper import BenchmarkEnvBuilder
27
+ from robomme.robomme_env.utils import EE_POSE_ACTION_SPACE
28
+ from robomme.robomme_env.utils.rpy_util import build_endeffector_pose_dict
29
+
30
# --- Configuration ---
GUI_RENDER = True  # open the interactive viewer while replaying
REPLAY_VIDEO_DIR = "replay_videos"  # output directory for saved replay videos
VIDEO_FPS = 30  # frame rate of the saved mp4
MAX_STEPS = 1000  # per-episode environment step budget
35
+
36
+
37
def _init_fk_planner(env) -> Tuple:
    """Create PandaArmMotionPlanningSolver and extract the pieces needed for FK.

    The solver is instantiated only to borrow its mplib planner; no motion
    planning is performed here.

    Returns:
        (mplib_planner, ee_link_idx, robot_base_pose)
        - mplib_planner: mplib.Planner instance used for FK computation
        - ee_link_idx: end-effector link index in the pinocchio model
        - robot_base_pose: robot base pose in world coordinates
    """
    solver = PandaArmMotionPlanningSolver(
        env,
        debug=False,
        vis=False,
        base_pose=env.unwrapped.agent.robot.pose,
        visualize_target_grasp_pose=False,
        print_env_info=False,
    )
    planner = solver.planner
    ee_idx = planner.link_name_2_idx[planner.move_group]
    base_pose = env.unwrapped.agent.robot.pose

    print(f"[FK] move_group: {planner.move_group}, "
          f"ee_link_idx: {ee_idx}, "
          f"link_names: {planner.user_link_names}")
    return planner, ee_idx, base_pose
62
+
63
+
64
def _joint_action_to_ee_pose(
    mplib_planner,
    joint_action: np.ndarray,
    robot_base_pose: sapien.Pose,
    ee_link_idx: int,
    prev_ee_quat_wxyz: Optional[torch.Tensor] = None,
    prev_ee_rpy_xyz: Optional[torch.Tensor] = None,
) -> Tuple[np.ndarray, torch.Tensor, torch.Tensor]:
    """Convert an 8D joint action into a 7D end-effector pose action via FK.

    Args:
        mplib_planner: mplib.Planner instance (from PandaArmMotionPlanningSolver).
        joint_action: 8D array [q1..q7, gripper].
        robot_base_pose: robot base pose as a Sapien Pose.
        ee_link_idx: end-effector link index in the pinocchio model.
        prev_ee_quat_wxyz: previous-frame quaternion cache (for sign alignment).
        prev_ee_rpy_xyz: previous-frame RPY cache (for continuity unwrapping).

    Returns:
        ee_action: 7D [x, y, z, roll, pitch, yaw, gripper].
        new_prev_quat: updated quaternion cache.
        new_prev_rpy: updated RPY cache.
    """
    flat = np.asarray(joint_action, dtype=np.float64).flatten()
    arm_qpos = flat[:7]
    gripper = float(flat[7]) if flat.size > 7 else -1.0

    # Full qpos = 7 arm joints + 2 mirrored finger joints. A negative gripper
    # command (the "missing" default) maps to finger position 0.04.
    finger = gripper if gripper >= 0 else 0.04
    full_qpos = np.concatenate([arm_qpos, [finger, finger]])

    # FK in the robot-base frame via the planner's pinocchio model.
    pin_model = mplib_planner.pinocchio_model
    pin_model.compute_forward_kinematics(full_qpos)
    link_pose = pin_model.get_link_pose(ee_link_idx)  # 7D [x,y,z, qw,qx,qy,qz]

    # Lift the base-frame pose into the world frame.
    world_pose = robot_base_pose * sapien.Pose(link_pose[:3], link_pose[3:])

    # Shared utility handles quaternion normalization, sign alignment and RPY
    # unwrapping so consecutive actions stay continuous.
    pos_tensor = torch.as_tensor(
        np.asarray(world_pose.p, dtype=np.float64), dtype=torch.float64
    )
    quat_tensor = torch.as_tensor(
        np.asarray(world_pose.q, dtype=np.float64), dtype=torch.float64
    )
    pose_dict, new_prev_quat, new_prev_rpy = build_endeffector_pose_dict(
        pos_tensor, quat_tensor,
        prev_ee_quat_wxyz, prev_ee_rpy_xyz,
    )

    # Assemble the 7D action: position(3) + RPY(3) + gripper(1).
    xyz = pose_dict["pose"].detach().cpu().numpy().flatten()[:3]
    rpy = pose_dict["rpy"].detach().cpu().numpy().flatten()[:3]
    ee_action = np.concatenate([xyz, rpy, [gripper]]).astype(np.float64)

    return ee_action, new_prev_quat, new_prev_rpy
125
+
126
+
127
+ def _frame_from_obs(obs: dict, is_video_frame: bool = False) -> np.ndarray:
128
+ """Build one side-by-side frame from front and wrist camera observations."""
129
+ front = obs["front_camera"][0].cpu().numpy()
130
+ wrist = obs["wrist_camera"][0].cpu().numpy()
131
+ frame = np.concatenate([front, wrist], axis=1).astype(np.uint8)
132
+ if is_video_frame:
133
+ # Mark video-demo frames with a red border
134
+ frame = cv2.rectangle(
135
+ frame, (0, 0), (frame.shape[1], frame.shape[0]), (255, 0, 0), 10
136
+ )
137
+ return frame
138
+
139
+
140
+ def _first_execution_step(episode_data) -> int:
141
+ """Return the first non-video-demo step index (actual execution start step)."""
142
+ step_idx = 0
143
+ while episode_data[f"timestep_{step_idx}"]["info"]["is_video_demo"][()]:
144
+ step_idx += 1
145
+ return step_idx
146
+
147
+
148
def process_episode(env_data: h5py.File, episode_idx: int, env_id: str) -> None:
    """Replay one episode in HDF5: read joint actions, run FK conversion, execute the environment, and save video.

    Args:
        env_data: Open HDF5 file containing ``episode_<idx>`` groups.
        episode_idx: Index of the episode group to replay.
        env_id: Benchmark task identifier used to build the environment.
    """
    episode_data = env_data[f"episode_{episode_idx}"]
    task_goal = episode_data["setup"]["task_goal"][()].decode()
    total_steps = sum(1 for k in episode_data.keys() if k.startswith("timestep_"))

    step_idx = _first_execution_step(episode_data)
    print(f"execution start step index: {step_idx}")

    # Create environment with EE_POSE_ACTION_SPACE (wrapped by EndeffectorDemonstrationWrapper)
    env_builder = BenchmarkEnvBuilder(
        env_id=env_id,
        dataset="train",
        action_space=EE_POSE_ACTION_SPACE,
        gui_render=GUI_RENDER,
    )
    env = env_builder.make_env_for_episode(
        episode_idx,
        max_steps=MAX_STEPS,
        include_maniskill_obs=True,
        include_front_depth=True,
        include_wrist_depth=True,
        include_front_camera_extrinsic=True,
        include_wrist_camera_extrinsic=True,
        include_available_multi_choices=True,
        include_front_camera_intrinsic=True,
        include_wrist_camera_intrinsic=True,
    )
    print(f"task: {env_id}, episode: {episode_idx}, goal: {task_goal}")

    obs, info = env.reset()

    # Initialize FK planner (must be called after env.reset())
    mplib_planner, ee_link_idx, robot_base_pose = _init_fk_planner(env)

    # Observation list: length 1 means no demo video, length >1 means includes demo video; last element is current frame
    frames = []
    n_obs = len(obs["front_camera"])
    for i in range(n_obs):
        single_obs = {k: [v[i]] for k, v in obs.items()}
        frames.append(_frame_from_obs(single_obs, is_video_frame=(i < n_obs - 1)))
    print(f"initial frame count (demo video + current frame): {len(frames)}")

    outcome = "unknown"
    prev_quat: Optional[torch.Tensor] = None
    prev_rpy: Optional[torch.Tensor] = None
    try:
        while step_idx < total_steps:
            # Read joint action from HDF5
            joint_action = np.asarray(
                episode_data[f"timestep_{step_idx}"]["action"]["joint_action"][()],
                dtype=np.float64,
            )

            # Forward kinematics: joint_action -> ee_pose action
            ee_action, prev_quat, prev_rpy = _joint_action_to_ee_pose(
                mplib_planner, joint_action, robot_base_pose, ee_link_idx,
                prev_ee_quat_wxyz=prev_quat,
                prev_ee_rpy_xyz=prev_rpy,
            )

            # Print debug info on the first step to verify FK conversion
            # NOTE(review): _first_execution_step() is re-scanned on every
            # iteration; its result equals the initial step_idx computed above.
            if step_idx == _first_execution_step(episode_data):
                print(f"[FK] first step joint_action: {joint_action}")
                print(f"[FK] first step ee_action: {ee_action}")

            # Execute EE pose action in the environment
            obs, _, terminated, _, info = env.step(ee_action)
            frames.append(_frame_from_obs(obs))

            if GUI_RENDER:
                env.render()

            # TODO: hongze fix nested-list handling
            # NOTE(review): indexing [-1][-1] on the False default raises
            # TypeError when the key is absent; assumes nested [[...]] values
            # when present — confirm against env.step()'s info contract.
            if terminated:
                if info.get("success", False)[-1][-1]:
                    outcome = "success"
                if info.get("fail", False)[-1][-1]:
                    outcome = "fail"
                break
            step_idx += 1
    finally:
        # Always release simulator resources, even on mid-episode failure.
        env.close()

    # Save replay video
    safe_goal = task_goal.replace(" ", "_").replace("/", "_")
    os.makedirs(REPLAY_VIDEO_DIR, exist_ok=True)
    video_name = f"{outcome}_{env_id}_ep{episode_idx}_{safe_goal}_step-{len(frames)}.mp4"
    video_path = os.path.join(REPLAY_VIDEO_DIR, video_name)
    imageio.mimsave(video_path, frames, fps=VIDEO_FPS)
    print(f"Video saved to {video_path}")
239
+
240
+
241
def replay(h5_data_dir: str = "/data/hongzefu/data_0214") -> None:
    """Iterate through all task HDF5 files in the given directory and replay episodes one by one."""
    for env_id in BenchmarkEnvBuilder.get_task_list():
        file_path = os.path.join(h5_data_dir, f"record_dataset_{env_id}.h5")
        if not os.path.exists(file_path):
            print(f"Skip {env_id}: file does not exist: {file_path}")
            continue

        with h5py.File(file_path, "r") as data:
            # Collect numeric episode indices from the episode_<n> group names.
            episode_keys = [k for k in data.keys() if k.startswith("episode_")]
            episode_indices = sorted(int(k.split("_")[1]) for k in episode_keys)
            print(f"task: {env_id}, total {len(episode_indices)} episodes")
            for episode_idx in episode_indices:
                process_episode(data, episode_idx, env_id)
260
+
261
+
262
if __name__ == "__main__":
    import tyro
    # Expose replay()'s keyword arguments as CLI flags via tyro.
    tyro.cli(replay)
scripts/dev/deprecated/dataset_replay-ee-parallel.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Replay episodes from an HDF5 dataset and save videos.
3
+
4
+ Read recorded end-effector pose actions (eef_action) from record_dataset_<Task>.h5,
5
+ replay them in an environment wrapped by EE_POSE_ACTION_SPACE,
6
+ and finally save side-by-side front/wrist camera videos to disk.
7
+ """
8
+
9
+ import os
10
+ from typing import Tuple
11
+
12
+ import cv2
13
+ import h5py
14
+ import imageio
15
+ import numpy as np
16
+
17
+ from robomme.robomme_env import *
18
+ from robomme.robomme_env.utils import *
19
+ from robomme.env_record_wrapper import BenchmarkEnvBuilder
20
+ from robomme.robomme_env.utils import EE_POSE_ACTION_SPACE
21
+
22
# --- Config ---
GUI_RENDER = False  # GUI viewer disabled by default (parallel replay)
REPLAY_VIDEO_DIR = "replay_videos"  # output directory for saved replay videos
VIDEO_FPS = 30  # frame rate of the saved mp4
MAX_STEPS = 1000  # per-episode environment step budget
27
+
28
+
29
+ def _frame_from_obs(obs: dict, is_video_frame: bool = False) -> np.ndarray:
30
+ """Build a single side-by-side frame from front and wrist camera obs."""
31
+ front = obs["front_camera"][0].cpu().numpy()
32
+ wrist = obs["wrist_camera"][0].cpu().numpy()
33
+ frame = np.concatenate([front, wrist], axis=1).astype(np.uint8)
34
+ if is_video_frame:
35
+ frame = cv2.rectangle(
36
+ frame, (0, 0), (frame.shape[1], frame.shape[0]), (255, 0, 0), 10
37
+ )
38
+ return frame
39
+
40
+
41
+ def _first_execution_step(episode_data) -> int:
42
+ """Return the first step index that is not a video-demo step."""
43
+ step_idx = 0
44
+ while episode_data[f"timestep_{step_idx}"]["info"]["is_video_demo"][()]:
45
+ step_idx += 1
46
+ return step_idx
47
+
48
+
49
def process_episode(
    h5_file_path: str, episode_idx: int, env_id: str, gui_render: bool = False,
) -> None:
    """Replay one episode in HDF5: read EE pose actions, run the environment, and save video.

    Each worker process opens the HDF5 file independently to avoid cross-process shared file handles.

    Args:
        h5_file_path: Path to the record_dataset_<Task>.h5 file.
        episode_idx: Index of the episode_<idx> group to replay.
        env_id: Benchmark task identifier used to build the environment.
        gui_render: Whether to call env.render() every step.
    """
    with h5py.File(h5_file_path, "r") as env_data:
        episode_data = env_data[f"episode_{episode_idx}"]
        task_goal = episode_data["setup"]["task_goal"][()].decode()
        total_steps = sum(1 for k in episode_data.keys() if k.startswith("timestep_"))

        step_idx = _first_execution_step(episode_data)
        print(f"[ep{episode_idx}] execution start step index: {step_idx}")

        env_builder = BenchmarkEnvBuilder(
            env_id=env_id,
            dataset="train",
            action_space=EE_POSE_ACTION_SPACE,
            gui_render=gui_render,
        )
        env = env_builder.make_env_for_episode(
            episode_idx,
            max_steps=MAX_STEPS,
            include_maniskill_obs=True,
            include_front_depth=True,
            include_wrist_depth=True,
            include_front_camera_extrinsic=True,
            include_wrist_camera_extrinsic=True,
            include_available_multi_choices=True,
            include_front_camera_intrinsic=True,
            include_wrist_camera_intrinsic=True,
        )
        print(f"[ep{episode_idx}] task: {env_id}, goal: {task_goal}")

        obs, info = env.reset()
        # Observation lists: length 1 means no demo video; otherwise the last
        # element is the current frame and earlier ones are demo-video frames.
        frames = []
        n_obs = len(obs["front_camera"])
        for i in range(n_obs):
            single_obs = {k: [v[i]] for k, v in obs.items()}
            frames.append(_frame_from_obs(single_obs, is_video_frame=(i < n_obs - 1)))
        print(f"[ep{episode_idx}] initial frame count (demo video + current frame): {len(frames)}")

        outcome = "unknown"
        try:
            while step_idx < total_steps:
                # Read the recorded EE pose action for this step from HDF5.
                action = np.asarray(
                    episode_data[f"timestep_{step_idx}"]["action"]["eef_action"][()],
                    dtype=np.float32,
                )
                obs, _, terminated, _, info = env.step(action)
                frames.append(_frame_from_obs(obs))

                if gui_render:
                    env.render()

                # TODO: hongze makes this correct
                # there are too many nested lists here, need to flatten them
                # NOTE(review): indexing [-1][-1] on the False default raises
                # TypeError when the key is absent; assumes nested [[...]]
                # values when present — confirm against env.step()'s info.
                if terminated:
                    if info.get("success", False)[-1][-1]:
                        outcome = "success"
                    if info.get("fail", False)[-1][-1]:
                        outcome = "fail"
                    break
                step_idx += 1
        finally:
            # Always release simulator resources, even on mid-episode failure.
            env.close()

    # Save replay video
    safe_goal = task_goal.replace(" ", "_").replace("/", "_")
    os.makedirs(REPLAY_VIDEO_DIR, exist_ok=True)
    video_name = f"{outcome}_{env_id}_ep{episode_idx}_{safe_goal}_step-{len(frames)}.mp4"
    video_path = os.path.join(REPLAY_VIDEO_DIR, video_name)
    imageio.mimsave(video_path, frames, fps=VIDEO_FPS)
    print(f"[ep{episode_idx}] Video saved to {video_path}")
+
125
+
126
+ def _worker_init(gpu_id_queue) -> None:
127
+ """Pool worker initializer that binds a GPU before CUDA initialization.
128
+
129
+ When each worker starts, it takes one GPU ID from the queue and sets env vars,
130
+ ensuring all later CUDA ops in that process run on the assigned GPU.
131
+ """
132
+ gpu_id = gpu_id_queue.get()
133
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
134
+ print(f"[Worker PID {os.getpid()}] bind GPU {gpu_id}")
135
+
136
+
137
def _process_episode_worker(args: Tuple[str, int, str, bool]) -> str:
    """multiprocessing worker entrypoint: unpack one job tuple and replay it.

    Returns a one-line status string ("OK: ..." / "FAIL: ...") so the parent
    pool can summarize results without exceptions crossing process boundaries.
    """
    h5_file_path, episode_idx, env_id, gui_render = args
    gpu_id = os.environ.get("CUDA_VISIBLE_DEVICES", "?")
    try:
        process_episode(h5_file_path, episode_idx, env_id, gui_render=gui_render)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"FAIL: {env_id} ep{episode_idx} (GPU {gpu_id}): {e}"
    return f"OK: {env_id} ep{episode_idx} (GPU {gpu_id})"
148
+
149
+
150
def replay(
    h5_data_dir: str = "/data/hongzefu/dataset_generate",
    num_workers: int = 20,
    gui_render: bool = False,
    gpu_ids: str = "0,1",
) -> None:
    """Iterate through all task HDF5 files in the given directory and replay multiple episodes per env in parallel.

    Args:
        h5_data_dir: Directory containing HDF5 datasets.
        num_workers: Number of parallel workers per env.
        gui_render: Whether to enable GUI rendering (recommended off in multiprocessing).
        gpu_ids: Comma-separated GPU ID list; workers use them in round-robin order.
            For example, "0,1" alternates assignment between GPU 0 and GPU 1.
    """
    # "spawn" gives each worker a fresh interpreter so CUDA is initialized
    # only after _worker_init has set CUDA_VISIBLE_DEVICES.
    import multiprocessing as mp
    ctx = mp.get_context("spawn")

    gpu_id_list = [int(g.strip()) for g in gpu_ids.split(",")]
    print(f"Using GPUs: {gpu_id_list}, workers: {num_workers}")

    env_id_list = BenchmarkEnvBuilder.get_task_list()
    for env_id in env_id_list:
        file_name = f"record_dataset_{env_id}.h5"
        file_path = os.path.join(h5_data_dir, file_name)
        if not os.path.exists(file_path):
            print(f"Skip {env_id}: file does not exist: {file_path}")
            continue

        # Quickly read episode list and close file
        # (workers re-open the file themselves; the handle is not shared).
        with h5py.File(file_path, "r") as data:
            episode_indices = sorted(
                int(k.split("_")[1])
                for k in data.keys()
                if k.startswith("episode_")
            )
        print(f"task: {env_id}, total {len(episode_indices)} episodes, "
              f"workers: {num_workers}, GPUs: {gpu_id_list}")

        # Build worker argument list (one tuple per episode)
        worker_args = [
            (file_path, ep_idx, env_id, gui_render)
            for ep_idx in episode_indices
        ]

        # Create a new GPU assignment queue for each round; each worker grabs
        # one GPU ID at startup (round-robin over gpu_id_list).
        gpu_id_queue = ctx.Queue()
        for i in range(num_workers):
            gpu_id_queue.put(gpu_id_list[i % len(gpu_id_list)])

        # Parallel replay (initializer binds GPU when each worker starts)
        with ctx.Pool(
            processes=num_workers,
            initializer=_worker_init,
            initargs=(gpu_id_queue,),
        ) as pool:
            results = pool.map(_process_episode_worker, worker_args)

        # Print the per-episode status strings returned by the workers.
        for r in results:
            print(r)
210
+
211
+
212
if __name__ == "__main__":
    import tyro
    # Expose replay()'s keyword arguments as CLI flags via tyro.
    tyro.cli(replay)
scripts/dev/deprecated/dataset_replay-ee.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Replay episodes from HDF5 datasets and save rollout videos.
3
+
4
+ Loads recorded joint actions from record_dataset_<Task>.h5, steps the environment,
5
+ and writes side-by-side front/wrist camera videos to disk.
6
+ """
7
+
8
+ import os
9
+
10
+ import cv2
11
+ import h5py
12
+ import imageio
13
+ import numpy as np
14
+
15
+ from robomme.robomme_env import *
16
+ from robomme.robomme_env.utils import *
17
+ from robomme.env_record_wrapper import BenchmarkEnvBuilder
18
+ from robomme.robomme_env.utils import EE_POSE_ACTION_SPACE
19
+
20
# --- Config ---
GUI_RENDER = False  # GUI viewer disabled by default
REPLAY_VIDEO_DIR = "replay_videos"  # output directory for saved replay videos
VIDEO_FPS = 30  # frame rate of the saved mp4
MAX_STEPS = 1000  # per-episode environment step budget
25
+
26
+
27
+ def _frame_from_obs(obs: dict, is_video_frame: bool = False) -> np.ndarray:
28
+ """Build a single side-by-side frame from front and wrist camera obs."""
29
+ front = obs["front_camera"][0].cpu().numpy()
30
+ wrist = obs["wrist_camera"][0].cpu().numpy()
31
+ frame = np.concatenate([front, wrist], axis=1).astype(np.uint8)
32
+ if is_video_frame:
33
+ frame = cv2.rectangle(
34
+ frame, (0, 0), (frame.shape[1], frame.shape[0]), (255, 0, 0), 10
35
+ )
36
+ return frame
37
+
38
+
39
+ def _first_execution_step(episode_data) -> int:
40
+ """Return the first step index that is not a video-demo step."""
41
+ step_idx = 0
42
+ while episode_data[f"timestep_{step_idx}"]["info"]["is_video_demo"][()]:
43
+ step_idx += 1
44
+ return step_idx
45
+
46
+
47
def process_episode(env_data: h5py.File, episode_idx: int, env_id: str) -> None:
    """Replay one episode from HDF5 data, record frames, and save a video.

    Args:
        env_data: Open HDF5 file containing ``episode_<idx>`` groups.
        episode_idx: Index of the episode group to replay.
        env_id: Benchmark task identifier used to build the environment.
    """
    episode_data = env_data[f"episode_{episode_idx}"]
    task_goal = episode_data["setup"]["task_goal"][()].decode()
    total_steps = sum(1 for k in episode_data.keys() if k.startswith("timestep_"))

    step_idx = _first_execution_step(episode_data)
    print(f"Execution start step index: {step_idx}")

    env_builder = BenchmarkEnvBuilder(
        env_id=env_id,
        dataset="test",
        action_space=EE_POSE_ACTION_SPACE,
        gui_render=GUI_RENDER,
    )
    env = env_builder.make_env_for_episode(
        episode_idx,
        max_steps=MAX_STEPS,
        include_maniskill_obs=True,
        include_front_depth=True,
        include_wrist_depth=True,
        include_front_camera_extrinsic=True,
        include_wrist_camera_extrinsic=True,
        include_available_multi_choices=True,
        include_front_camera_intrinsic=True,
        include_wrist_camera_intrinsic=True,
    )
    print(f"task_name: {env_id}, episode_idx: {episode_idx}, task_goal: {task_goal}")

    obs, info = env.reset()
    # Obs lists: length 1 = no video, length > 1 = video; last element is current.
    frames = []
    n_obs = len(obs["front_camera"])
    for i in range(n_obs):
        single_obs = {k: [v[i]] for k, v in obs.items()}
        frames.append(_frame_from_obs(single_obs, is_video_frame=(i < n_obs - 1)))
    print(f"Initial frames (video + current): {len(frames)}")

    outcome = "unknown"
    try:
        while step_idx < total_steps:
            # Read the recorded EE pose action for this step from HDF5.
            action = np.asarray(
                episode_data[f"timestep_{step_idx}"]["action"]["eef_action"][()],
                dtype=np.float32,
            )
            obs, _, terminated, _, info = env.step(action)
            frames.append(_frame_from_obs(obs))

            if GUI_RENDER:
                env.render()

            # TODO: hongze makes this correct
            # there are too many nested lists here, need to flatten them
            # NOTE(review): indexing [-1][-1] on the False default raises
            # TypeError when the key is absent; assumes nested [[...]] values
            # when present — confirm against env.step()'s info contract.
            if terminated:
                if info.get("success", False)[-1][-1]:
                    outcome = "success"
                if info.get("fail", False)[-1][-1]:
                    outcome = "fail"
                break
            step_idx += 1
    finally:
        # Always release simulator resources, even on mid-episode failure.
        env.close()

    safe_goal = task_goal.replace(" ", "_").replace("/", "_")
    os.makedirs(REPLAY_VIDEO_DIR, exist_ok=True)
    video_name = f"{outcome}_{env_id}_ep{episode_idx}_{safe_goal}_step-{len(frames)}.mp4"
    video_path = os.path.join(REPLAY_VIDEO_DIR, video_name)
    imageio.mimsave(video_path, frames, fps=VIDEO_FPS)
    print(f"Saved video to {video_path}")
+
117
+
118
def replay(h5_data_dir: str = "/data/hongzefu/dataset_generate") -> None:
    """Replay all episodes from all task HDF5 files in the given directory.

    Args:
        h5_data_dir: Directory containing record_dataset_<Task>.h5 files.
    """
    # NOTE(review): the get_task_list() result is immediately overwritten by
    # the hard-coded debug list below — the call is effectively dead code.
    env_id_list = BenchmarkEnvBuilder.get_task_list()
    # Debug filter: uncomment entries to replay additional tasks.
    env_id_list =[
        "PickXtimes",
        # "StopCube",
        # "SwingXtimes",
        # "BinFill",

        # "VideoUnmaskSwap",
        # "VideoUnmask",
        # "ButtonUnmaskSwap",
        # "ButtonUnmask",

        # "VideoRepick",
        # "VideoPlaceButton",
        # "VideoPlaceOrder",
        # "PickHighlight",

        # "InsertPeg",
        # 'MoveCube',
        # "PatternLock",
        # "RouteStick"
    ]

    for env_id in env_id_list:
        file_name = f"record_dataset_{env_id}.h5"
        file_path = os.path.join(h5_data_dir, file_name)
        if not os.path.exists(file_path):
            print(f"Skipping {env_id}: file not found: {file_path}")
            continue

        with h5py.File(file_path, "r") as data:
            episode_indices = sorted(
                int(k.split("_")[1])
                for k in data.keys()
                if k.startswith("episode_")
            )
            print(f"Task: {env_id}, has {len(episode_indices)} episodes")
            # NOTE(review): [:1] deliberately limits the run to the first
            # episode of each task (dev/debug script).
            for episode_idx in episode_indices[:1]:
                process_episode(data, episode_idx, env_id)
+
160
+
161
if __name__ == "__main__":
    import tyro
    # Expose replay()'s keyword arguments as CLI flags via tyro.
    tyro.cli(replay)
scripts/dev/eval-dataset-offline-rpy.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import json
4
+ import h5py
5
+ import numpy as np
6
+ import argparse
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from robomme.robomme_env.utils.rpy_util import summarize_and_print_rpy_sequence
11
+
12
+ def _write_split_rpy_summaries_json(
13
+ path: str,
14
+ demo_summaries: list[dict[str, Any]],
15
+ non_demo_summaries: list[dict[str, Any]],
16
+ ) -> None:
17
+ """
18
+ Summarize both demo and non-demo parts and write to JSON.
19
+ """
20
+ if os.path.dirname(path):
21
+ os.makedirs(os.path.dirname(path), exist_ok=True)
22
+ payload = {
23
+ "demo_summaries": demo_summaries,
24
+ "non_demo_summaries": non_demo_summaries,
25
+ }
26
+ with open(path, "w", encoding="utf-8") as f:
27
+ json.dump(payload, f, ensure_ascii=False, indent=2)
28
+
29
+
30
def _read_is_video_demo(ts_group: h5py.Group) -> bool:
    """Return the info/is_video_demo flag of a timestep group (False if absent)."""
    info_grp = ts_group.get("info")
    if info_grp is None or "is_video_demo" not in info_grp:
        return False
    val = info_grp["is_video_demo"][()]
    # The flag may be stored as a byte string (e.g. b"True") or a bool/int scalar.
    if isinstance(val, (bytes, np.bytes_)):
        return val in (b"True", b"true", b"1")
    return bool(val)
39
+
40
+
41
def _extract_rpy_from_timestep(ts_group: h5py.Group) -> list[np.ndarray]:
    """Return the timestep's action RPY rows as a list of float64 3-vectors.

    Reads action/eef_action_raw/rpy; returns [] when any level of the path is
    missing or when the trailing dimension is not 3.
    """
    if "action" not in ts_group:
        return []
    action_grp = ts_group["action"]
    if "eef_action_raw" not in action_grp or "rpy" not in action_grp["eef_action_raw"]:
        return []
    raw = np.asarray(action_grp["eef_action_raw"]["rpy"][()], dtype=np.float64)
    # Normalize to shape (n, dim) regardless of stored rank.
    rows = raw.reshape(1, -1) if raw.ndim == 1 else raw.reshape(-1, raw.shape[-1])
    if rows.shape[-1] != 3:
        return []
    return [row.copy() for row in rows]
57
+
58
+
59
def main():
    """Scan generated HDF5 datasets, split each episode's RPY sequence into
    demo / non-demo parts (per the info/is_video_demo flag), print summaries,
    and dump one ``*_rpy_summary.json`` per dataset file."""
    # Hardcoded dataset directory as requested
    DATASET_DIR = Path("/data/hongzefu/dataset_generate")

    parser = argparse.ArgumentParser(description="Read generated HDF5 dataset and verify RPY consistency.")
    parser.add_argument("--dataset_path", type=str, default=str(DATASET_DIR), help="Path to the HDF5 file or directory to verify.")
    args = parser.parse_args()

    input_path = Path(args.dataset_path).resolve()

    if not input_path.exists():
        print(f"Error: Path not found: {input_path}")
        sys.exit(1)

    # Determine files to process: a single .h5/.hdf5 file, or every such file
    # in a directory (non-recursive, sorted for deterministic order).
    files_to_process = []
    if input_path.is_file():
        if input_path.suffix in ['.h5', '.hdf5']:
            files_to_process.append(input_path)
    elif input_path.is_dir():
        files_to_process.extend(sorted(input_path.glob("*.h5")))
        files_to_process.extend(sorted(input_path.glob("*.hdf5")))

    if not files_to_process:
        print(f"No HDF5 files found in {input_path}")
        sys.exit(0)

    print(f"Found {len(files_to_process)} files to process in {input_path}")

    for dataset_path in files_to_process:
        print(f"\n{'='*50}")
        print(f"Processing dataset: {dataset_path}")
        print(f"{'='*50}")

        # Generate output JSON path
        # NOTE(review): output directory is hardcoded and unrelated to
        # --dataset_path; consider parameterizing.
        output_json_path = Path("/data/hongzefu/dataset_replay") / f"{dataset_path.stem}_rpy_summary.json"

        demo_summaries: list[dict[str, Any]] = []
        non_demo_summaries: list[dict[str, Any]] = []

        try:
            with h5py.File(dataset_path, "r") as f:
                # Iterate through environments (e.g., env_PickXtimes...)
                env_groups = [key for key in f.keys() if key.startswith("env_")]
                env_groups.sort()

                if not env_groups:
                    print(f"Warning: No 'env_*' groups found in {dataset_path.name}")

                for env_group_name in env_groups:
                    env_group = f[env_group_name]
                    print(f"Processing environment group: {env_group_name}")

                    # Extract env_id from group name (remove 'env_' prefix)
                    env_id = env_group_name[4:]

                    # Iterate through episodes
                    episode_keys = [key for key in env_group.keys() if key.startswith("episode_")]
                    # Sort numerically by episode ID (falls back to lexical key
                    # for malformed names)
                    episode_keys.sort(key=lambda x: int(x.split('_')[1]) if '_' in x and x.split('_')[1].isdigit() else x)

                    for episode_key in episode_keys:
                        print(f"  Processing {episode_key}...")
                        episode_group = env_group[episode_key]
                        try:
                            episode_idx = int(episode_key.split('_')[1])
                        except (IndexError, ValueError):
                            episode_idx = -1

                        # Iterate through timesteps to reconstruct sequence
                        timestep_keys = [key for key in episode_group.keys() if key.startswith("record_timestep_")]

                        # Sort keys like "record_timestep_<n>" numerically;
                        # malformed keys sort first via -1.
                        def get_timestep_idx(key):
                            parts = key.split('_')
                            try:
                                return int(parts[2])
                            except (IndexError, ValueError):
                                return -1

                        timestep_keys.sort(key=get_timestep_idx)

                        # Separate RPY sequences by is_video_demo flag
                        demo_rpy_seq: list[np.ndarray] = []
                        non_demo_rpy_seq: list[np.ndarray] = []

                        for ts_key in timestep_keys:
                            ts_group = episode_group[ts_key]
                            rpy_rows = _extract_rpy_from_timestep(ts_group)
                            if rpy_rows:
                                if _read_is_video_demo(ts_group):
                                    demo_rpy_seq.extend(rpy_rows)
                                else:
                                    non_demo_rpy_seq.extend(rpy_rows)

                        # Summarize demo portion
                        if demo_rpy_seq:
                            demo_summary = summarize_and_print_rpy_sequence(
                                demo_rpy_seq,
                                label=f"[{env_id}] episode {episode_idx} (demo)",
                            )
                            demo_summaries.append({
                                "order_index": len(demo_summaries),
                                "env_id": env_id,
                                "episode": episode_idx,
                                "action_space": "eef_pose",
                                "summary": demo_summary,
                            })

                        # Summarize non-demo portion
                        if non_demo_rpy_seq:
                            non_demo_summary = summarize_and_print_rpy_sequence(
                                non_demo_rpy_seq,
                                label=f"[{env_id}] episode {episode_idx} (non-demo)",
                            )
                            non_demo_summaries.append({
                                "order_index": len(non_demo_summaries),
                                "env_id": env_id,
                                "episode": episode_idx,
                                "action_space": "eef_pose",
                                "summary": non_demo_summary,
                            })

        # Best-effort per file: log the failure and still write whatever
        # summaries were collected before the error.
        except Exception as e:
            print(f"An error occurred while reading {dataset_path.name}: {e}")
            import traceback
            traceback.print_exc()

        # Write summary to JSON
        if demo_summaries or non_demo_summaries:
            _write_split_rpy_summaries_json(str(output_json_path), demo_summaries, non_demo_summaries)
            print(f"Saved split RPY summaries to: {output_json_path}")
            print(f"  demo entries: {len(demo_summaries)}, non-demo entries: {len(non_demo_summaries)}")
        else:
            print(f"No summaries generated for {dataset_path.name}")

if __name__ == "__main__":
    main()
scripts/dev/eval_dataset_replay.py ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Script function: Unified dataset replay entry point, supporting 4 action spaces: joint_angle / ee_pose / waypoint / multi_choice.
3
+ # Consistent with subgoal_evaluate_func.py main loop; difference is actions come from EpisodeDatasetResolver.
4
+
5
+ import os
6
+ from typing import Any, Optional
7
+
8
+
9
+
10
+ import os
11
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
12
+
13
+
14
+
15
+ import cv2
16
+ import numpy as np
17
+ import torch
18
+
19
+ from robomme.robomme_env import *
20
+ from robomme.robomme_env.utils import *
21
+ from robomme.env_record_wrapper import (
22
+ BenchmarkEnvBuilder,
23
+ EpisodeDatasetResolver,
24
+ )
25
+ from robomme.env_record_wrapper.OraclePlannerDemonstrationWrapper import (
26
+ OraclePlannerDemonstrationWrapper,
27
+ )
28
+ from robomme.robomme_env.utils.choice_action_mapping import (
29
+ _unique_candidates,
30
+ extract_actor_position_xyz,
31
+ project_world_to_pixel,
32
+ select_target_with_pixel,
33
+ )
34
+ from robomme.robomme_env.utils.save_reset_video import save_robomme_video
35
+
36
# Only enable one ACTION_SPACE; others are commented out for manual switching
ACTION_SPACE = "joint_angle"


# Whether to open the simulator's GUI viewer each step.
GUI_RENDER = False

# Root directory of the recorded HDF5 dataset consumed by EpisodeDatasetResolver.
DATASET_ROOT = "/data/hongzefu/data_0226"

# Tasks to replay; uncomment the ids you want active.
DEFAULT_ENV_IDS = [
    #"PickXtimes",
    #"StopCube",
    #"SwingXtimes",
    "BinFill",
    # "VideoUnmaskSwap",
    # "VideoUnmask",
    # "ButtonUnmaskSwap",
    # "ButtonUnmask",
    # "VideoRepick",
    # "VideoPlaceButton",
    # "VideoPlaceOrder",
    #"PickHighlight",
    #"InsertPeg",
    #"MoveCube",
    #"PatternLock",
    #"RouteStick",
]

# Output directory for replay videos.
OUT_VIDEO_DIR = "/data/hongzefu/dataset_replay"
# Per-episode step budget passed to make_env_for_episode.
MAX_STEPS = 1000
65
+
66
+
67
+ def _parse_oracle_command(choice_action: Optional[Any]) -> Optional[dict[str, Any]]:
68
+ if not isinstance(choice_action, dict):
69
+ return None
70
+ choice = choice_action.get("choice")
71
+ if not isinstance(choice, str) or not choice.strip():
72
+ return None
73
+ point = choice_action.get("point")
74
+ if not isinstance(point, (list, tuple, np.ndarray)) or len(point) != 2:
75
+ return None
76
+ return choice_action
77
+
78
+
79
+ def _to_numpy_copy(value: Any) -> np.ndarray:
80
+ if isinstance(value, torch.Tensor):
81
+ value = value.detach().cpu().numpy()
82
+ else:
83
+ value = np.asarray(value)
84
+ return np.array(value, copy=True)
85
+
86
+
87
+ def _to_frame_list(frames_like: Any) -> list[np.ndarray]:
88
+ if frames_like is None:
89
+ return []
90
+ if isinstance(frames_like, torch.Tensor):
91
+ arr = frames_like.detach().cpu().numpy()
92
+ if arr.ndim == 3:
93
+ return [np.array(arr, copy=True)]
94
+ if arr.ndim == 4:
95
+ return [np.array(x, copy=True) for x in arr]
96
+ return []
97
+ if isinstance(frames_like, np.ndarray):
98
+ if frames_like.ndim == 3:
99
+ return [np.array(frames_like, copy=True)]
100
+ if frames_like.ndim == 4:
101
+ return [np.array(x, copy=True) for x in frames_like]
102
+ return []
103
+ if isinstance(frames_like, (list, tuple)):
104
+ out = []
105
+ for frame in frames_like:
106
+ if frame is None:
107
+ continue
108
+ out.append(_to_numpy_copy(frame))
109
+ return out
110
+ try:
111
+ arr = np.asarray(frames_like)
112
+ except Exception:
113
+ return []
114
+ if arr.ndim == 3:
115
+ return [np.array(arr, copy=True)]
116
+ if arr.ndim == 4:
117
+ return [np.array(x, copy=True) for x in arr]
118
+ return []
119
+
120
+
121
+ def _normalize_pixel_xy(pixel_like: Any) -> Optional[list[int]]:
122
+ if not isinstance(pixel_like, (list, tuple, np.ndarray)):
123
+ return None
124
+ if len(pixel_like) < 2:
125
+ return None
126
+ try:
127
+ x = float(pixel_like[0])
128
+ y = float(pixel_like[1])
129
+ except (TypeError, ValueError):
130
+ return None
131
+ if not np.isfinite(x) or not np.isfinite(y):
132
+ return None
133
+ return [int(np.rint(x)), int(np.rint(y))]
134
+
135
+
136
+ def _normalize_point_yx_to_pixel_xy(point_like: Any) -> Optional[list[int]]:
137
+ if not isinstance(point_like, (list, tuple, np.ndarray)):
138
+ return None
139
+ if len(point_like) < 2:
140
+ return None
141
+ try:
142
+ y = float(point_like[0])
143
+ x = float(point_like[1])
144
+ except (TypeError, ValueError):
145
+ return None
146
+ if not np.isfinite(x) or not np.isfinite(y):
147
+ return None
148
+ return [int(np.rint(x)), int(np.rint(y))]
149
+
150
+
151
def _find_oracle_wrapper(env_like: Any) -> Optional[OraclePlannerDemonstrationWrapper]:
    """Walk the ``.env`` wrapper chain (max 16 hops, cycle-safe) looking for the
    oracle demonstration wrapper; return it, or None if not found."""
    seen: set[int] = set()
    node = env_like
    hops = 0
    while hops < 16:
        if node is None:
            return None
        if isinstance(node, OraclePlannerDemonstrationWrapper):
            return node
        marker = id(node)
        if marker in seen:
            # Cycle in the wrapper chain — give up.
            return None
        seen.add(marker)
        node = getattr(node, "env", None)
        hops += 1
    return None
165
+
166
+
167
def _collect_multi_choice_visualization(
    env_like: Any,
    command: dict[str, Any],
) -> tuple[list[list[int]], Optional[list[int]], Optional[list[int]]]:
    """Gather debug pixels for a multi-choice command.

    Returns ``(candidate_pixels, clicked_pixel, matched_pixel)``:
    - candidate_pixels: [x, y] front-camera projections of every unique
      candidate actor of the option resolved from *command* (empty on failure).
    - clicked_pixel: the command's (y, x) "point" converted to [x, y], or None.
    - matched_pixel: projected pixel of the candidate the click matched, or None.

    Best-effort: any failure inside the oracle wrapper degrades to
    empty/None values instead of raising.
    """
    # The command stores its point in (y, x) order; convert to (x, y) pixels.
    clicked_pixel = _normalize_point_yx_to_pixel_xy(command.get("point"))
    oracle_wrapper = _find_oracle_wrapper(env_like)
    if oracle_wrapper is None:
        return [], clicked_pixel, None

    try:
        # NOTE(review): relies on private wrapper internals
        # (_build_step_options / _resolve_command); fails silently to
        # empty results if that API changes.
        _selected_target, solve_options = oracle_wrapper._build_step_options()
        found_idx, _ = oracle_wrapper._resolve_command(command, solve_options)
    except Exception:
        return [], clicked_pixel, None

    if found_idx is None or found_idx < 0 or found_idx >= len(solve_options):
        return [], clicked_pixel, None

    option = solve_options[found_idx]
    available = option.get("available")
    # Camera parameters cached on the wrapper; the projection helpers
    # tolerate None values.
    intrinsic_cv = getattr(oracle_wrapper, "_front_camera_intrinsic_cv", None)
    extrinsic_cv = getattr(oracle_wrapper, "_front_camera_extrinsic_cv", None)
    image_shape = getattr(oracle_wrapper, "_front_rgb_shape", None)

    # Project every unique candidate actor into front-camera pixel space.
    candidate_pixels: list[list[int]] = []
    if available is not None:
        for actor in _unique_candidates(available):
            actor_pos = extract_actor_position_xyz(actor)
            if actor_pos is None:
                continue
            projected = project_world_to_pixel(
                actor_pos,
                intrinsic_cv=intrinsic_cv,
                extrinsic_cv=extrinsic_cv,
                image_shape=image_shape,
            )
            if projected is None:
                continue
            candidate_pixels.append([int(projected[0]), int(projected[1])])

    # Re-run the pixel-based target selection to find which candidate the
    # clicked pixel actually matched.
    matched_pixel: Optional[list[int]] = None
    if available is not None and clicked_pixel is not None:
        matched = select_target_with_pixel(
            available=available,
            pixel_like=clicked_pixel,
            intrinsic_cv=intrinsic_cv,
            extrinsic_cv=extrinsic_cv,
            image_shape=image_shape,
        )
        if isinstance(matched, dict):
            matched_pixel = _normalize_pixel_xy(matched.get("projected_pixel"))

    return candidate_pixels, clicked_pixel, matched_pixel
220
+
221
+
222
def _make_blackboard(frame_like: Any) -> np.ndarray:
    """Return an all-black uint8 canvas matching *frame_like*'s height/width.

    Falls back to a 1x1 canvas when the frame has fewer than two dimensions
    or a degenerate size.
    """
    frame = _to_numpy_copy(frame_like)
    shape_ok = frame.ndim >= 2 and int(frame.shape[0]) > 0 and int(frame.shape[1]) > 0
    target = (int(frame.shape[0]), int(frame.shape[1]), 3) if shape_ok else (1, 1, 3)
    return np.zeros(target, dtype=np.uint8)
230
+
231
+
232
def _draw_candidate_blackboard(
    frame_like: Any,
    candidate_pixels: list[list[int]],
) -> np.ndarray:
    """Render yellow candidate circles on a black canvas sized like the frame."""
    board = _make_blackboard(frame_like)
    for px in candidate_pixels:
        if len(px) >= 2:
            cv2.circle(board, (int(px[0]), int(px[1])), 4, (0, 255, 255), 1)
    return board
242
+
243
+
244
def _draw_selection_blackboard(
    frame_like: Any,
    clicked_pixel: Optional[list[int]],
    matched_pixel: Optional[list[int]],
) -> np.ndarray:
    """Draw the clicked pixel (cyan cross) and matched candidate (blue circle)
    on a black canvas sized like the frame; either marker may be omitted."""
    board = _make_blackboard(frame_like)
    if clicked_pixel is not None:
        click_xy = (int(clicked_pixel[0]), int(clicked_pixel[1]))
        cv2.drawMarker(
            board,
            click_xy,
            (255, 255, 0),
            markerType=cv2.MARKER_TILTED_CROSS,
            markerSize=10,
            thickness=1,
        )
    if matched_pixel is not None:
        match_xy = (int(matched_pixel[0]), int(matched_pixel[1]))
        cv2.circle(board, match_xy, 5, (255, 0, 0), 2)
    return board
262
+
263
+
264
+
265
+
266
def main():
    """Replay recorded actions for each configured task/episode through the
    benchmark env and save side-by-side debug videos of the rollout."""
    from robomme.logging_utils import setup_logging
    setup_logging(level="DEBUG")
    env_id_list = BenchmarkEnvBuilder.get_task_list()
    print(f"Running envs: {env_id_list}")
    print(f"Using action_space: {ACTION_SPACE}")

    #for env_id in env_id_list:
    for env_id in DEFAULT_ENV_IDS:
        env_builder = BenchmarkEnvBuilder(
            env_id=env_id,
            dataset="train",
            action_space=ACTION_SPACE,
            gui_render=GUI_RENDER,
        )
        episode_count = env_builder.get_episode_num()
        print(f"[{env_id}] episode_count from metadata: {episode_count}")

        env = None
        for episode in range(episode_count):
            # NOTE(review): debug leftover — only episode 15 is replayed.
            if episode !=15:
                continue

            # NOTE(review): a new env is created per episode but only the last
            # one is closed (see the close at the end of the env_id loop) —
            # earlier envs appear to leak; confirm.
            env = env_builder.make_env_for_episode(
                episode,
                max_steps=MAX_STEPS,
                include_maniskill_obs=True,
                include_front_depth=True,
                include_wrist_depth=True,
                include_front_camera_extrinsic=True,
                include_wrist_camera_extrinsic=True,
                include_available_multi_choices=True,
                include_front_camera_intrinsic=True,
                include_wrist_camera_intrinsic=True,
            )
            try:
                dataset_resolver = EpisodeDatasetResolver(
                    env_id=env_id,
                    episode=episode,
                    dataset_directory=DATASET_ROOT,
                )
            except KeyError as e:
                print(f"[{env_id}] Episode {episode} missing in H5, skipping. ({e})")
                if env is not None:
                    env.close()
                continue

            # ======== Reset ========
            # obs: dict-of-lists (columnar batch, list length = number of demo frames)
            # info: flat dict (last frame values only)
            obs, info = env.reset()

            # --- Explicitly read all obs fields (each is a list) ---
            maniskill_obs = obs["maniskill_obs"]
            front_rgb_list = _to_frame_list(obs["front_rgb_list"])
            wrist_rgb_list = _to_frame_list(obs["wrist_rgb_list"])
            front_depth_list = obs["front_depth_list"]
            wrist_depth_list = obs["wrist_depth_list"]
            end_effector_pose_raw = obs["end_effector_pose_raw"]
            eef_state_list = obs["eef_state_list"]
            joint_state_list = obs["joint_state_list"]
            # velocity = obs["velocity"]
            gripper_state_list = obs["gripper_state_list"]
            front_camera_extrinsic_list = obs["front_camera_extrinsic_list"]
            wrist_camera_extrinsic_list = obs["wrist_camera_extrinsic_list"]

            # --- Explicitly read all info fields (flat dict, last frame values) ---
            task_goal = info["task_goal"]
            simple_subgoal_online = info["simple_subgoal_online"]
            grounded_subgoal_online = info["grounded_subgoal_online"]
            available_multi_choices = info.get("available_multi_choices")
            front_camera_intrinsic = info["front_camera_intrinsic"]
            wrist_camera_intrinsic = info["wrist_camera_intrinsic"]
            status = info.get("status")


            # --- Video saving variable preparation (reset phase) ---
            reset_base_frames = [_to_numpy_copy(f) for f in front_rgb_list]
            reset_wrist_frames = [_to_numpy_copy(f) for f in wrist_rgb_list]
            # Right-hand debug panels only exist for multi_choice replays.
            reset_right_frames = (
                [_make_blackboard(f) for f in reset_base_frames]
                if ACTION_SPACE == "multi_choice"
                else None
            )
            reset_far_right_frames = (
                [_make_blackboard(f) for f in reset_base_frames]
                if ACTION_SPACE == "multi_choice"
                else None
            )
            reset_subgoal_grounded = [grounded_subgoal_online] * len(front_rgb_list)

            step = 0
            episode_success = False
            rollout_base_frames: list[np.ndarray] = []
            rollout_wrist_frames: list[np.ndarray] = []
            rollout_right_frames: list[np.ndarray] = []
            rollout_far_right_frames: list[np.ndarray] = []
            rollout_subgoal_grounded: list[Any] = []

            # ======== Step loop ========
            while True:
                replay_key = ACTION_SPACE
                action = dataset_resolver.get_step(replay_key, step)
                if ACTION_SPACE == "multi_choice":
                    action = _parse_oracle_command(action)
                # Stop when the dataset has no more recorded actions (or the
                # recorded multi-choice command is malformed).
                if action is None:
                    break

                candidate_pixels: list[list[int]] = []
                clicked_pixel: Optional[list[int]] = None
                matched_pixel: Optional[list[int]] = None
                if ACTION_SPACE == "multi_choice":
                    candidate_pixels, clicked_pixel, matched_pixel = _collect_multi_choice_visualization(
                        env, action
                    )

                # step returns: obs (dict-of-lists), reward (scalar tensor),
                # terminated (scalar tensor), truncated (scalar tensor), info (flat dict)
                obs, reward, terminated, truncated, info = env.step(action)

                # --- Explicitly read all obs fields (dict-of-lists, typically 1 element per list) ---
                # NOTE(review): the assignment `maniskill_obs = obs["maniskill_obs"]`
                # appears accidentally fused into this comment in the original,
                # so maniskill_obs is never refreshed after step — confirm.
                front_rgb_list = _to_frame_list(obs["front_rgb_list"])
                wrist_rgb_list = _to_frame_list(obs["wrist_rgb_list"])
                front_depth_list = obs["front_depth_list"]
                wrist_depth_list = obs["wrist_depth_list"]
                end_effector_pose_raw = obs["end_effector_pose_raw"]
                eef_state_list = obs["eef_state_list"]
                joint_state_list = obs["joint_state_list"]
                gripper_state_list = obs["gripper_state_list"]
                front_camera_extrinsic_list = obs["front_camera_extrinsic_list"]
                wrist_camera_extrinsic_list = obs["wrist_camera_extrinsic_list"]

                # --- Explicitly read all info fields (flat dict) ---
                task_goal = info["task_goal"]
                simple_subgoal_online = info["simple_subgoal_online"]
                grounded_subgoal_online = info["grounded_subgoal_online"]
                available_multi_choices = info.get("available_multi_choices")
                front_camera_intrinsic = info["front_camera_intrinsic"]
                wrist_camera_intrinsic = info["wrist_camera_intrinsic"]
                status = info.get("status")

                # --- Video saving variable preparation (replay phase) ---
                rollout_base_frames.extend(
                    _to_numpy_copy(f) for f in front_rgb_list
                )
                rollout_wrist_frames.extend(
                    _to_numpy_copy(f) for f in wrist_rgb_list
                )
                if ACTION_SPACE == "multi_choice":
                    for base_frame in front_rgb_list:
                        rollout_right_frames.append(
                            _draw_candidate_blackboard(
                                base_frame,
                                candidate_pixels=candidate_pixels,
                            )
                        )
                        rollout_far_right_frames.append(
                            _draw_selection_blackboard(
                                base_frame,
                                clicked_pixel=clicked_pixel,
                                matched_pixel=matched_pixel,
                            )
                        )
                rollout_subgoal_grounded.extend([grounded_subgoal_online] * len(front_rgb_list))

                terminated_flag = bool(terminated.item())
                truncated_flag = bool(truncated.item())

                step += 1
                if GUI_RENDER:
                    env.render()
                if truncated_flag:
                    print(f"[{env_id}] episode {episode} steps exceeded, step {step}.")
                    break
                if terminated_flag:
                    if status == "success":
                        print(f"[{env_id}] episode {episode} success.")
                        episode_success = True
                    elif status == "fail":
                        print(f"[{env_id}] episode {episode} failed.")
                    break

            # ======== Video saving ========
            save_robomme_video(
                reset_base_frames=reset_base_frames,
                reset_wrist_frames=reset_wrist_frames,
                rollout_base_frames=rollout_base_frames,
                rollout_wrist_frames=rollout_wrist_frames,
                reset_subgoal_grounded=reset_subgoal_grounded,
                rollout_subgoal_grounded=rollout_subgoal_grounded,
                out_video_dir=OUT_VIDEO_DIR,
                action_space=ACTION_SPACE,
                env_id=env_id,
                episode=episode,
                episode_success=episode_success,
                reset_right_frames=reset_right_frames if ACTION_SPACE == "multi_choice" else None,
                rollout_right_frames=rollout_right_frames if ACTION_SPACE == "multi_choice" else None,
                reset_far_right_frames=(
                    reset_far_right_frames if ACTION_SPACE == "multi_choice" else None
                ),
                rollout_far_right_frames=(
                    rollout_far_right_frames if ACTION_SPACE == "multi_choice" else None
                ),
            )

        if env is not None:
            env.close()


if __name__ == "__main__":
    main()
scripts/dev/evaluate_dataset_replay-parallelv3.py ADDED
@@ -0,0 +1,669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Script function: Unified dataset replay entry point, supports four action_spaces: joint_angle / ee_pose / waypoint / multi_choice.
3
+ # Consistent with subgoal_evaluate_func.py's main loop and debug fields; the difference is that actions come from EpisodeDatasetResolver.
4
+ # [New] Support parallel multi-process replay and alternate task assignment between two GPUs.
5
+
6
+ import os
7
+ import sys
8
+ import argparse
9
+ import concurrent.futures
10
+ import multiprocessing as mp
11
+ from typing import Any, Optional
12
+
13
+ import cv2
14
+ import numpy as np
15
+ import torch
16
+
17
+ from robomme.robomme_env import *
18
+ from robomme.robomme_env.utils import *
19
+ from robomme.env_record_wrapper import (
20
+ BenchmarkEnvBuilder,
21
+ EpisodeDatasetResolver,
22
+ )
23
+ from robomme.env_record_wrapper.OraclePlannerDemonstrationWrapper import (
24
+ OraclePlannerDemonstrationWrapper,
25
+ )
26
+ from robomme.robomme_env.utils.choice_action_mapping import (
27
+ _unique_candidates,
28
+ extract_actor_position_xyz,
29
+ project_world_to_pixel,
30
+ select_target_with_pixel,
31
+ )
32
+ from robomme.robomme_env.utils.save_reset_video import save_robomme_video
33
+
34
# The four action spaces the replay entry point supports.
AVAILABLE_ACTION_SPACES = [
    "joint_angle",
    "ee_pose",
    "waypoint",
    "multi_choice",
]

# Whether to open the simulator's GUI viewer each step.
GUI_RENDER = False

# Root directory of the recorded HDF5 dataset consumed by EpisodeDatasetResolver.
DATASET_ROOT = "/data/hongzefu/data_0226-test"
OVERRIDE_METADATA_PATH = "/data/hongzefu/data_0226-test"

# ######## Video saving variables (output directory) start ########
# Video output directory: Independently hardcoded, not aligned with h5 path or env_id
OUT_VIDEO_DIR = "/data/hongzefu/dataset_replay-0226-test"
# ######## Video saving variables (output directory) end ########
# Per-episode step budget passed to the env builder.
MAX_STEPS = 2000

# Tasks to replay; uncomment the ids you want active.
DEFAULT_ENV_IDS = [
    # "PickXtimes",
    # "StopCube",
    # "SwingXtimes",
    # "BinFill",
    # "VideoUnmaskSwap",
    # "VideoUnmask",
    # "ButtonUnmaskSwap",
    # "ButtonUnmask",
    # "VideoRepick",
    # "VideoPlaceButton",
    # "VideoPlaceOrder",
    # "PickHighlight",
    # "InsertPeg",
    # "MoveCube",
    "PatternLock",
    # "RouteStick",
]
70
+
71
+ def _parse_oracle_command(choice_action: Optional[Any]) -> Optional[dict[str, Any]]:
72
+ if not isinstance(choice_action, dict):
73
+ return None
74
+ choice = choice_action.get("choice")
75
+ if not isinstance(choice, str) or not choice.strip():
76
+ return None
77
+ if "point" not in choice_action:
78
+ return None
79
+ return {
80
+ "choice": choice_action.get("choice"),
81
+ "point": choice_action.get("point"),
82
+ }
83
+
84
+
85
+ def _to_numpy_copy(value: Any) -> np.ndarray:
86
+ if isinstance(value, torch.Tensor):
87
+ value = value.detach().cpu().numpy()
88
+ else:
89
+ value = np.asarray(value)
90
+ return np.array(value, copy=True)
91
+
92
+
93
+ def _to_frame_list(frames_like: Any) -> list[np.ndarray]:
94
+ if frames_like is None:
95
+ return []
96
+ if isinstance(frames_like, torch.Tensor):
97
+ arr = frames_like.detach().cpu().numpy()
98
+ if arr.ndim == 3:
99
+ return [np.array(arr, copy=True)]
100
+ if arr.ndim == 4:
101
+ return [np.array(x, copy=True) for x in arr]
102
+ return []
103
+ if isinstance(frames_like, np.ndarray):
104
+ if frames_like.ndim == 3:
105
+ return [np.array(frames_like, copy=True)]
106
+ if frames_like.ndim == 4:
107
+ return [np.array(x, copy=True) for x in frames_like]
108
+ return []
109
+ if isinstance(frames_like, (list, tuple)):
110
+ out = []
111
+ for frame in frames_like:
112
+ if frame is None:
113
+ continue
114
+ out.append(_to_numpy_copy(frame))
115
+ return out
116
+ try:
117
+ arr = np.asarray(frames_like)
118
+ except Exception:
119
+ return []
120
+ if arr.ndim == 3:
121
+ return [np.array(arr, copy=True)]
122
+ if arr.ndim == 4:
123
+ return [np.array(x, copy=True) for x in arr]
124
+ return []
125
+
126
+
127
+ def _normalize_pixel_xy(pixel_like: Any) -> Optional[list[int]]:
128
+ if not isinstance(pixel_like, (list, tuple, np.ndarray)):
129
+ return None
130
+ if len(pixel_like) < 2:
131
+ return None
132
+ try:
133
+ x = float(pixel_like[0])
134
+ y = float(pixel_like[1])
135
+ except (TypeError, ValueError):
136
+ return None
137
+ if not np.isfinite(x) or not np.isfinite(y):
138
+ return None
139
+ return [int(np.rint(x)), int(np.rint(y))]
140
+
141
+
142
+ def _normalize_point_yx_to_pixel_xy(point_like: Any) -> Optional[list[int]]:
143
+ if not isinstance(point_like, (list, tuple, np.ndarray)):
144
+ return None
145
+ if len(point_like) < 2:
146
+ return None
147
+ try:
148
+ y = float(point_like[0])
149
+ x = float(point_like[1])
150
+ except (TypeError, ValueError):
151
+ return None
152
+ if not np.isfinite(x) or not np.isfinite(y):
153
+ return None
154
+ return [int(np.rint(x)), int(np.rint(y))]
155
+
156
+
157
def _find_oracle_wrapper(env_like: Any) -> Optional[OraclePlannerDemonstrationWrapper]:
    """Search up to 16 levels of ``.env`` nesting for the oracle planner wrapper.

    Returns the wrapper instance, or None when absent or a cycle is detected.
    """
    seen_ids: set[int] = set()
    candidate = env_like
    for _depth in range(16):
        if candidate is None:
            return None
        if isinstance(candidate, OraclePlannerDemonstrationWrapper):
            return candidate
        marker = id(candidate)
        if marker in seen_ids:
            return None
        seen_ids.add(marker)
        candidate = getattr(candidate, "env", None)
    return None
171
+
172
+
173
def _collect_multi_choice_visualization(
    env_like: Any,
    command: dict[str, Any],
) -> tuple[list[list[int]], Optional[list[int]], Optional[list[int]]]:
    """Collect debug pixels for a multi-choice command (parallel-v3 variant).

    Returns ``(candidate_pixels, clicked_pixel, matched_pixel)`` where
    candidate_pixels are [x, y] front-camera projections of each unique
    candidate actor of the resolved option, clicked_pixel is the command's
    (y, x) "point" converted to [x, y] (or None), and matched_pixel is the
    projected pixel of the candidate matched to the click (or None).
    Degrades to empty/None results on any oracle-wrapper failure.
    """
    # Command points are stored (y, x); convert to (x, y) pixel order.
    clicked_pixel = _normalize_point_yx_to_pixel_xy(command.get("point"))
    oracle_wrapper = _find_oracle_wrapper(env_like)
    if oracle_wrapper is None:
        return [], clicked_pixel, None

    try:
        # NOTE(review): depends on private wrapper methods
        # (_build_step_options / _resolve_command).
        _selected_target, solve_options = oracle_wrapper._build_step_options()
        found_idx, _ = oracle_wrapper._resolve_command(command, solve_options)
    except Exception:
        return [], clicked_pixel, None

    if found_idx is None or found_idx < 0 or found_idx >= len(solve_options):
        return [], clicked_pixel, None

    option = solve_options[found_idx]
    available = option.get("available")
    # Cached camera parameters; projection helpers tolerate None.
    intrinsic_cv = getattr(oracle_wrapper, "_front_camera_intrinsic_cv", None)
    extrinsic_cv = getattr(oracle_wrapper, "_front_camera_extrinsic_cv", None)
    image_shape = getattr(oracle_wrapper, "_front_rgb_shape", None)

    # Project each unique candidate actor into pixel space.
    candidate_pixels: list[list[int]] = []
    if available is not None:
        for actor in _unique_candidates(available):
            actor_pos = extract_actor_position_xyz(actor)
            if actor_pos is None:
                continue
            projected = project_world_to_pixel(
                actor_pos,
                intrinsic_cv=intrinsic_cv,
                extrinsic_cv=extrinsic_cv,
                image_shape=image_shape,
            )
            if projected is None:
                continue
            candidate_pixels.append([int(projected[0]), int(projected[1])])

    # Determine which candidate the clicked pixel matched, if any.
    matched_pixel: Optional[list[int]] = None
    if available is not None and clicked_pixel is not None:
        matched = select_target_with_pixel(
            available=available,
            pixel_like=clicked_pixel,
            intrinsic_cv=intrinsic_cv,
            extrinsic_cv=extrinsic_cv,
            image_shape=image_shape,
        )
        if isinstance(matched, dict):
            matched_pixel = _normalize_pixel_xy(matched.get("projected_pixel"))

    return candidate_pixels, clicked_pixel, matched_pixel
226
+
227
+
228
def _make_blackboard(frame_like: Any) -> np.ndarray:
    """Return an all-black uint8 canvas (H, W, 3) matching the input frame's size."""
    frame = _to_numpy_copy(frame_like)
    if frame.ndim >= 2:
        height, width = int(frame.shape[0]), int(frame.shape[1])
        if height > 0 and width > 0:
            return np.zeros((height, width, 3), dtype=np.uint8)
    # Degenerate input: fall back to a minimal 1x1 canvas.
    return np.zeros((1, 1, 3), dtype=np.uint8)
236
+
237
+
238
def _draw_candidate_blackboard(
    frame_like: Any,
    candidate_pixels: list[list[int]],
) -> np.ndarray:
    """Render every candidate pixel as a small circle on a black canvas sized like the frame."""
    canvas = _make_blackboard(frame_like)
    for px in candidate_pixels:
        if len(px) >= 2:
            cv2.circle(canvas, (int(px[0]), int(px[1])), 4, (0, 255, 255), 1)
    return canvas
248
+
249
+
250
def _draw_selection_blackboard(
    frame_like: Any,
    clicked_pixel: Optional[list[int]],
    matched_pixel: Optional[list[int]],
) -> np.ndarray:
    """Draw the raw click (tilted cross) and the matched target (circle) on a black canvas."""
    canvas = _make_blackboard(frame_like)
    if clicked_pixel is not None:
        click_xy = (int(clicked_pixel[0]), int(clicked_pixel[1]))
        cv2.drawMarker(
            canvas,
            click_xy,
            (255, 255, 0),
            markerType=cv2.MARKER_TILTED_CROSS,
            markerSize=10,
            thickness=1,
        )
    if matched_pixel is not None:
        match_xy = (int(matched_pixel[0]), int(matched_pixel[1]))
        cv2.circle(canvas, match_xy, 5, (255, 0, 0), 2)
    return canvas
268
+
269
+
270
def init_worker(gpu_id: int):
    """
    Worker process initialization function, sets CUDA_VISIBLE_DEVICES.
    """
    # Configure logging inside the freshly spawned worker process.
    from robomme.logging_utils import setup_logging
    setup_logging(level="DEBUG")
    # Pin this worker to a single GPU. NOTE(review): assumes CUDA has not been
    # initialized in this process before this point — confirm under the spawn
    # start method.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    # print(f"[Worker] Initialized on GPU {gpu_id} (PID: {os.getpid()})")
278
+
279
def evaluate_episode(
    env_id: str,
    episode: int,
    dataset_root: str,
    override_metadata_path: str,
    action_space: str,
    out_video_dir: str,
    gui_render: bool
) -> str:
    """
    Evaluation logic for a single Episode.

    Rebuilds the env and dataset resolver inside the worker process, replays the
    recorded actions for `action_space` step by step, collects frames for the
    debug video, saves the video, and returns a one-line status string.
    Replay-phase errors are caught and converted into the returned message.
    """
    # Reconstruct Envs and Resolver (avoid passing complex objects across processes)
    env_builder = BenchmarkEnvBuilder(
        env_id=env_id,
        dataset="train",
        action_space=action_space,
        gui_render=gui_render,
        override_metadata_path=override_metadata_path,
    )

    env = None
    dataset_resolver = None

    try:
        env = env_builder.make_env_for_episode(
            episode,
            max_steps=MAX_STEPS,
            include_maniskill_obs=True,
            include_front_depth=True,
            include_wrist_depth=True,
            include_front_camera_extrinsic=True,
            include_wrist_camera_extrinsic=True,
            include_available_multi_choices=True,
            include_front_camera_intrinsic=True,
            include_wrist_camera_intrinsic=True,
        )
        dataset_resolver = EpisodeDatasetResolver(
            env_id=env_id,
            episode=episode,
            dataset_directory=dataset_root,
        )

        # obs_batch, reward_batch, terminated_batch, truncated_batch, info_batch = env.reset()
        obs_batch, info_batch = env.reset()

        # Maintain debug variable semantics from subgoal_evaluate_func.py
        # Note: These local variables in multi-processing can be simplified if printing is not needed, but unpacking logic is retained for consistency.
        maniskill_obs = obs_batch["maniskill_obs"]
        front_camera = _to_frame_list(obs_batch["front_rgb_list"])
        wrist_camera = _to_frame_list(obs_batch["wrist_rgb_list"])
        # Other variables unpacking skipped unless used downstream

        task_goal_list = info_batch["task_goal"]
        # task_goal = task_goal_list[0] if task_goal_list else None

        # Keep only the latest entry of each (possibly batched) info field.
        info = {k: v[-1] if isinstance(v, list) and v else v for k, v in info_batch.items()}
        # terminated = bool(terminated_batch[-1].item())
        # truncated = bool(truncated_batch[-1].item())

        # ######## Video saving variable preparation (reset phase) start ########
        reset_base_frames = [_to_numpy_copy(f) for f in front_camera]
        reset_wrist_frames = [_to_numpy_copy(f) for f in wrist_camera]
        # For multi_choice the right-hand panels start as empty blackboards
        # sized like the front-camera frames; other action spaces skip them.
        reset_right_frames = (
            [_make_blackboard(f) for f in reset_base_frames]
            if action_space == "multi_choice"
            else None
        )
        reset_far_right_frames = (
            [_make_blackboard(f) for f in reset_base_frames]
            if action_space == "multi_choice"
            else None
        )
        _subgoal = info_batch.get("grounded_subgoal_online", "")
        # One subgoal string per reset frame so captions align with frames.
        reset_subgoal_grounded = _subgoal if isinstance(_subgoal, list) else [_subgoal] * len(reset_base_frames)
        # ######## Video saving variable preparation (reset phase) end ########

        # ######## Video saving variable initialization start ########
        step = 0
        read_step = 0          # index into the recorded action stream
        episode_success = False
        rollout_base_frames: list[np.ndarray] = []
        rollout_wrist_frames: list[np.ndarray] = []
        rollout_right_frames: list[np.ndarray] = []
        rollout_far_right_frames: list[np.ndarray] = []
        rollout_subgoal_grounded: list[Any] = []
        # ######## Video saving variable initialization end ########

        while True:
            replay_key = action_space
            # A None action marks the end of the recorded sequence.
            action = dataset_resolver.get_step(replay_key, read_step)
            read_step += 1
            if action is None:
                break
            if action_space == "multi_choice":
                # Invalid/malformed recorded commands are skipped, not replayed.
                action = _parse_oracle_command(action)
                if action is None:
                    continue

            candidate_pixels: list[list[int]] = []
            clicked_pixel: Optional[list[int]] = None
            matched_pixel: Optional[list[int]] = None
            if action_space == "multi_choice":
                # Visualization must be computed BEFORE env.step mutates state.
                candidate_pixels, clicked_pixel, matched_pixel = _collect_multi_choice_visualization(
                    env, action
                )

            obs_batch, reward_batch, terminated_batch, truncated_batch, info_batch = env.step(action)

            # Maintain debug variable semantics from subgoal_evaluate_func.py
            front_camera = _to_frame_list(obs_batch["front_rgb_list"])
            wrist_camera = _to_frame_list(obs_batch["wrist_rgb_list"])

            subgoal_grounded = info_batch["grounded_subgoal_online"]

            # ######## Video saving variable preparation (replay phase) start ########
            rollout_base_frames.extend(_to_numpy_copy(f) for f in front_camera)
            rollout_wrist_frames.extend(_to_numpy_copy(f) for f in wrist_camera)
            if action_space == "multi_choice":
                # One candidate board + one selection board per new front frame,
                # keeping panel counts aligned with the base frames.
                for base_frame in front_camera:
                    rollout_right_frames.append(
                        _draw_candidate_blackboard(
                            base_frame,
                            candidate_pixels=candidate_pixels,
                        )
                    )
                    rollout_far_right_frames.append(
                        _draw_selection_blackboard(
                            base_frame,
                            clicked_pixel=clicked_pixel,
                            matched_pixel=matched_pixel,
                        )
                    )
            if isinstance(subgoal_grounded, list):
                rollout_subgoal_grounded.extend(subgoal_grounded)
            else:
                rollout_subgoal_grounded.extend([subgoal_grounded] * len(front_camera))
            # ######## Video saving variable preparation (replay phase) end ########

            info = {k: v[-1] if isinstance(v, list) and v else v for k, v in info_batch.items()}
            terminated = bool(terminated_batch.item())
            truncated = bool(truncated_batch.item())

            step += 1
            if gui_render:
                env.render()

            if truncated:
                # print(f"[{env_id}] episode {episode} step limit exceeded, step {step}.")
                break
            if terminated:
                succ = info.get("success")
                # NOTE(review): `succ == torch.tensor([True])` is an element-wise
                # tensor comparison; the second clause already covers tensor
                # success values — confirm the first clause is intentional.
                if succ == torch.tensor([True]) or (
                    isinstance(succ, torch.Tensor) and succ.item()
                ):
                    # print(f"[{env_id}] episode {episode} success.")
                    episode_success = True
                elif info.get("fail", False):
                    # print(f"[{env_id}] episode {episode} failed.")
                    pass
                break

        # ######## Video saving section start ########
        save_robomme_video(
            reset_base_frames=reset_base_frames,
            reset_wrist_frames=reset_wrist_frames,
            rollout_base_frames=rollout_base_frames,
            rollout_wrist_frames=rollout_wrist_frames,
            reset_subgoal_grounded=reset_subgoal_grounded,
            rollout_subgoal_grounded=rollout_subgoal_grounded,
            out_video_dir=out_video_dir,
            action_space=action_space,
            env_id=env_id,
            episode=episode,
            episode_success=episode_success,
            reset_right_frames=reset_right_frames if action_space == "multi_choice" else None,
            rollout_right_frames=rollout_right_frames if action_space == "multi_choice" else None,
            reset_far_right_frames=(
                reset_far_right_frames if action_space == "multi_choice" else None
            ),
            rollout_far_right_frames=(
                rollout_far_right_frames if action_space == "multi_choice" else None
            ),
        )
        # ######## Video saving section end ########

        status = "Success" if episode_success else "Ended"
        if not episode_success and info.get("fail", False):
            status = "Failed"
        return f"[{env_id}] episode {episode} {status} (step {step})"

    except (FileNotFoundError, KeyError) as exc:
        return f"[{env_id}] episode {episode} data missing, skip. {exc}"
    except Exception as exc:
        # import traceback
        # traceback.print_exc()
        return f"[{env_id}] episode {episode} replay exception, skip. {exc}"
    finally:
        # Always release the HDF5 resolver and the simulator, even on failure.
        if dataset_resolver is not None:
            dataset_resolver.close()
        if env is not None:
            env.close()
481
+
482
+ def _parse_gpus(s: str) -> list[int]:
483
+ """Parse --gpus: '0' -> [0], '1' -> [1], '0,1' -> [0, 1]."""
484
+ allowed = {"0", "1", "0,1", "1,0"}
485
+ v = s.strip()
486
+ if v not in allowed:
487
+ raise argparse.ArgumentTypeError(
488
+ f"--gpus must be one of: 0, 1, 0,1 (got {s!r})"
489
+ )
490
+ if "," in v:
491
+ return [int(x) for x in v.split(",")]
492
+ return [int(v)]
493
+
494
def _parse_action_spaces(s: str) -> list[str]:
    """Parse a comma-separated --action_spaces value into an ordered, de-duplicated list."""
    tokens = [piece.strip() for piece in s.split(",") if piece.strip()]
    if not tokens:
        raise argparse.ArgumentTypeError(
            "--action_spaces cannot be empty. "
            f"Allowed action spaces: {AVAILABLE_ACTION_SPACES}"
        )

    invalid = [token for token in tokens if token not in AVAILABLE_ACTION_SPACES]
    if invalid:
        raise argparse.ArgumentTypeError(
            f"Invalid action space(s): {invalid}. "
            f"Allowed action spaces: {AVAILABLE_ACTION_SPACES}"
        )

    # De-duplicate while preserving first-seen order.
    selected: list[str] = []
    for token in tokens:
        if token not in selected:
            selected.append(token)

    if not selected:
        raise argparse.ArgumentTypeError(
            "--action_spaces has no valid value after parsing. "
            f"Allowed action spaces: {AVAILABLE_ACTION_SPACES}"
        )
    return selected
526
+
527
def _parse_args() -> argparse.Namespace:
    """Build and parse the CLI arguments for the parallel replay driver.

    Returns a namespace with:
      envid: optional single environment id (falls back to the default env list
          downstream when omitted),
      max_workers: total worker budget, split across GPUs when two are used,
      gpus: list of GPU ids (validated by _parse_gpus),
      action_spaces: ordered, de-duplicated action spaces (validated by
          _parse_action_spaces).
    """
    parser = argparse.ArgumentParser(description="Replay dataset for one env_id in parallel.")
    parser.add_argument(
        "--envid",
        required=False,
        type=str,
        default=None,
        help="Single environment id to replay.",
    )
    parser.add_argument(
        "--max_workers",
        type=int,
        default=20,
        help="Total max workers (split across GPUs when using 2 GPUs).",
    )
    parser.add_argument(
        "--gpus",
        type=_parse_gpus,
        default=[1],
        # Help text kept in sync with the actual default above (was wrongly
        # documented as "Default: 0.").
        help="GPUs to use: '0' (GPU 0 only), '1' (GPU 1 only), '0,1' (both). Default: 1.",
    )
    parser.add_argument(
        "--action_spaces",
        type=_parse_action_spaces,
        default=["multi_choice"],
        # Help text kept in sync with the actual default above (was wrongly
        # documented as defaulting to all four spaces).
        help=(
            "Comma-separated action spaces to replay in order. "
            "Available: joint_angle,ee_pose,waypoint,multi_choice. "
            "Default: multi_choice."
        ),
    )
    return parser.parse_args()
560
+
561
def process_env_id(
    env_id: str,
    max_workers_total: int,
    gpu_ids: list[int],
    action_spaces: list[str],
):
    """Replay every episode of `env_id` for each requested action space.

    Work is fanned out over one process pool per GPU; with two GPUs, episodes
    alternate between the pools (even -> pool 0, odd -> pool 1). Results are
    printed as futures complete.
    """
    # Simple calculation of episode count (do not instantiate env_builder to avoid overhead, or lightweight instantiation)
    # To get episode_count, we need to instantiate env_builder once
    # But we only need the metadata parsing part
    temp_builder = BenchmarkEnvBuilder(
        env_id=env_id,
        dataset="train",
        action_space=action_spaces[0],
        gui_render=False,  # Just to read metadata
        override_metadata_path=OVERRIDE_METADATA_PATH,
    )
    episode_count = temp_builder.get_episode_num()
    print(f"[{env_id}] episodes={episode_count}")
    print(f"Parallel execution with max_workers={max_workers_total} on GPU(s) {gpu_ids}")

    if episode_count == 0:
        print(f"[{env_id}] No episodes to replay, skip.")
        return

    # Split the worker budget across the GPUs (pool 0 gets the remainder).
    n_gpus = len(gpu_ids)
    if n_gpus == 1:
        mw0 = max(max_workers_total, 1)
        mw1 = 0
        print(f"Pool (GPU {gpu_ids[0]}): {mw0} workers")
    else:
        mw0 = (max_workers_total + 1) // 2
        mw1 = max_workers_total // 2
        if mw0 == 0:
            mw0 = 1
        if mw1 == 0 and max_workers_total > 1:
            mw1 = 1
        # NOTE(review): with two GPUs and max_workers_total <= 1, mw1 stays 0
        # and ProcessPoolExecutor(max_workers=0) below raises ValueError —
        # the `if mw1 == 0` reroute in the loop never gets a chance to help.
        print(f"Pool 0 (GPU {gpu_ids[0]}): {mw0} workers")
        print(f"Pool 1 (GPU {gpu_ids[1]}): {mw1} workers")

    # Action spaces are replayed sequentially; episodes within one are parallel.
    for action_space in action_spaces:
        print(f"[{env_id}] >>> action_space={action_space}")
        futures = []

        if n_gpus == 1:
            g0 = gpu_ids[0]
            with concurrent.futures.ProcessPoolExecutor(max_workers=mw0, initializer=init_worker, initargs=(g0,)) as executor0:
                for episode in range(episode_count):
                    future = executor0.submit(
                        evaluate_episode,
                        env_id=env_id,
                        episode=episode,
                        dataset_root=DATASET_ROOT,
                        override_metadata_path=OVERRIDE_METADATA_PATH,
                        action_space=action_space,
                        out_video_dir=OUT_VIDEO_DIR,
                        gui_render=GUI_RENDER
                    )
                    futures.append(future)
                for future in concurrent.futures.as_completed(futures):
                    res = future.result()
                    print(res)
        else:
            g0, g1 = gpu_ids[0], gpu_ids[1]
            with concurrent.futures.ProcessPoolExecutor(max_workers=mw0, initializer=init_worker, initargs=(g0,)) as executor0, \
                 concurrent.futures.ProcessPoolExecutor(max_workers=mw1, initializer=init_worker, initargs=(g1,)) as executor1:
                for episode in range(episode_count):
                    # Alternate episodes between the two pools...
                    if episode % 2 == 0:
                        executor = executor0
                    else:
                        executor = executor1
                    # ...unless pool 1 has no workers, then everything goes to pool 0.
                    if mw1 == 0:
                        executor = executor0
                    future = executor.submit(
                        evaluate_episode,
                        env_id=env_id,
                        episode=episode,
                        dataset_root=DATASET_ROOT,
                        override_metadata_path=OVERRIDE_METADATA_PATH,
                        action_space=action_space,
                        out_video_dir=OUT_VIDEO_DIR,
                        gui_render=GUI_RENDER
                    )
                    futures.append(future)
                for future in concurrent.futures.as_completed(futures):
                    res = future.result()
                    print(res)
        print(f"[{env_id}] <<< action_space={action_space} done")
648
+
649
def main():
    """CLI entry point: configure logging/multiprocessing, then replay each env."""
    from robomme.logging_utils import setup_logging
    setup_logging(level="DEBUG")
    # Force use of spawn to avoid PyTorch/CUDA fork issues
    mp.set_start_method("spawn", force=True)

    parsed = _parse_args()
    env_ids = [parsed.envid] if parsed.envid else DEFAULT_ENV_IDS
    worker_budget = parsed.max_workers
    gpu_ids = parsed.gpus
    action_spaces = parsed.action_spaces

    print(f"Plan to replay envs: {env_ids} (gpus={gpu_ids})")
    print(f"Available action spaces: {AVAILABLE_ACTION_SPACES}")
    print(f"Selected action spaces: {action_spaces}")
    for env_id in env_ids:
        print(f"=== Processing {env_id} ===")
        process_env_id(env_id, worker_budget, gpu_ids, action_spaces)


if __name__ == "__main__":
    main()
scripts/dev/evaluate_dataset_replay-parallelv4-noresolver.py ADDED
@@ -0,0 +1,676 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Script function: Unified dataset replay entry point, supports four action_spaces: joint_angle / ee_pose / waypoint / multi_choice.
3
+ # Consistent with subgoal_evaluate_func.py's main loop and debug fields; actions are read directly from HDF5 dataset files.
4
+ # [New] Support parallel multi-process replay and alternate task assignment between two GPUs.
5
+
6
+ import os
7
+ import sys
8
+ import argparse
9
+ import concurrent.futures
10
+ import multiprocessing as mp
11
+ from pathlib import Path
12
+ from typing import Any, Optional
13
+
14
+ import cv2
15
+ import h5py
16
+ import numpy as np
17
+ import torch
18
+
19
+ # Support running this file directly: python scripts/dev/<script>.py
20
+ REPO_ROOT = Path(__file__).resolve().parents[2]
21
+ if str(REPO_ROOT) not in sys.path:
22
+ sys.path.insert(0, str(REPO_ROOT))
23
+
24
+ from scripts.dataset_replay import _build_action_sequence
25
+ from robomme.robomme_env import *
26
+ from robomme.robomme_env.utils import *
27
+ from robomme.env_record_wrapper import (
28
+ BenchmarkEnvBuilder,
29
+ )
30
+ from robomme.env_record_wrapper.OraclePlannerDemonstrationWrapper import (
31
+ OraclePlannerDemonstrationWrapper,
32
+ )
33
+ from robomme.robomme_env.utils.choice_action_mapping import (
34
+ _unique_candidates,
35
+ extract_actor_position_xyz,
36
+ project_world_to_pixel,
37
+ select_target_with_pixel,
38
+ )
39
+ from robomme.robomme_env.utils.save_reset_video import save_robomme_video
40
+
41
# Action spaces this replay driver knows how to feed back into the env.
AVAILABLE_ACTION_SPACES = [
    "joint_angle",
    "ee_pose",
    "waypoint",
    "multi_choice",
]

# When True, env.render() is called after every replayed step.
GUI_RENDER = False

# Root directory holding the recorded HDF5 datasets (record_dataset_<env_id>.h5).
DATASET_ROOT = "/data/hongzefu/data_0226-test"
# Metadata override path handed to BenchmarkEnvBuilder.
OVERRIDE_METADATA_PATH = "/data/hongzefu/data_0226-test"

# ######## Video saving variables (output directory) start ########
# Video output directory: Independently hardcoded, not aligned with h5 path or env_id
OUT_VIDEO_DIR = "/data/hongzefu/dataset_replay-0226-test"
# ######## Video saving variables (output directory) end ########
# Hard cap on env steps per episode (passed as max_steps to the env builder).
MAX_STEPS = 2000

# Fallback env ids replayed when no single env id is specified.
DEFAULT_ENV_IDS = [
    "PickXtimes",
    "StopCube",
    "SwingXtimes",
    "BinFill",
    "VideoUnmaskSwap",
    "VideoUnmask",
    "ButtonUnmaskSwap",
    "ButtonUnmask",
    "VideoRepick",
    "VideoPlaceButton",
    "VideoPlaceOrder",
    "PickHighlight",
    "InsertPeg",
    "MoveCube",
    "PatternLock",
    "RouteStick",
]
77
+
78
+ def _parse_oracle_command(choice_action: Optional[Any]) -> Optional[dict[str, Any]]:
79
+ if not isinstance(choice_action, dict):
80
+ return None
81
+ choice = choice_action.get("choice")
82
+ if not isinstance(choice, str) or not choice.strip():
83
+ return None
84
+ if "point" not in choice_action:
85
+ return None
86
+ return {
87
+ "choice": choice_action.get("choice"),
88
+ "point": choice_action.get("point"),
89
+ }
90
+
91
+
92
+ def _to_numpy_copy(value: Any) -> np.ndarray:
93
+ if isinstance(value, torch.Tensor):
94
+ value = value.detach().cpu().numpy()
95
+ else:
96
+ value = np.asarray(value)
97
+ return np.array(value, copy=True)
98
+
99
+
100
+ def _to_frame_list(frames_like: Any) -> list[np.ndarray]:
101
+ if frames_like is None:
102
+ return []
103
+ if isinstance(frames_like, torch.Tensor):
104
+ arr = frames_like.detach().cpu().numpy()
105
+ if arr.ndim == 3:
106
+ return [np.array(arr, copy=True)]
107
+ if arr.ndim == 4:
108
+ return [np.array(x, copy=True) for x in arr]
109
+ return []
110
+ if isinstance(frames_like, np.ndarray):
111
+ if frames_like.ndim == 3:
112
+ return [np.array(frames_like, copy=True)]
113
+ if frames_like.ndim == 4:
114
+ return [np.array(x, copy=True) for x in frames_like]
115
+ return []
116
+ if isinstance(frames_like, (list, tuple)):
117
+ out = []
118
+ for frame in frames_like:
119
+ if frame is None:
120
+ continue
121
+ out.append(_to_numpy_copy(frame))
122
+ return out
123
+ try:
124
+ arr = np.asarray(frames_like)
125
+ except Exception:
126
+ return []
127
+ if arr.ndim == 3:
128
+ return [np.array(arr, copy=True)]
129
+ if arr.ndim == 4:
130
+ return [np.array(x, copy=True) for x in arr]
131
+ return []
132
+
133
+
134
+ def _normalize_pixel_xy(pixel_like: Any) -> Optional[list[int]]:
135
+ if not isinstance(pixel_like, (list, tuple, np.ndarray)):
136
+ return None
137
+ if len(pixel_like) < 2:
138
+ return None
139
+ try:
140
+ x = float(pixel_like[0])
141
+ y = float(pixel_like[1])
142
+ except (TypeError, ValueError):
143
+ return None
144
+ if not np.isfinite(x) or not np.isfinite(y):
145
+ return None
146
+ return [int(np.rint(x)), int(np.rint(y))]
147
+
148
+
149
+ def _normalize_point_yx_to_pixel_xy(point_like: Any) -> Optional[list[int]]:
150
+ if not isinstance(point_like, (list, tuple, np.ndarray)):
151
+ return None
152
+ if len(point_like) < 2:
153
+ return None
154
+ try:
155
+ y = float(point_like[0])
156
+ x = float(point_like[1])
157
+ except (TypeError, ValueError):
158
+ return None
159
+ if not np.isfinite(x) or not np.isfinite(y):
160
+ return None
161
+ return [int(np.rint(x)), int(np.rint(y))]
162
+
163
+
164
def _find_oracle_wrapper(env_like: Any) -> Optional[OraclePlannerDemonstrationWrapper]:
    """Search the `.env` wrapper chain (max 16 hops, cycle-safe) for the oracle wrapper."""
    visited_ids: set[int] = set()
    node = env_like
    for _hop in range(16):
        if node is None:
            return None
        if isinstance(node, OraclePlannerDemonstrationWrapper):
            return node
        marker = id(node)
        if marker in visited_ids:
            # Cycle detected — give up rather than loop forever.
            return None
        visited_ids.add(marker)
        node = getattr(node, "env", None)
    return None
178
+
179
+
180
def _collect_multi_choice_visualization(
    env_like: Any,
    command: dict[str, Any],
) -> tuple[list[list[int]], Optional[list[int]], Optional[list[int]]]:
    """Gather debug pixels for one multi_choice step.

    Returns (candidate_pixels, clicked_pixel, matched_pixel):
      candidate_pixels: [x, y] image projections of every unique candidate actor
          of the option the command resolves to.
      clicked_pixel: the command's (y, x) "point" converted to [x, y], or None.
      matched_pixel: projected pixel of the candidate the click selects, or None.
    Best-effort: any failure degrades to ([], clicked_pixel, None) so the
    visualization never breaks the replay itself.
    """
    # Stored "point" is (y, x); convert to [x, y] pixel order for drawing.
    clicked_pixel = _normalize_point_yx_to_pixel_xy(command.get("point"))
    oracle_wrapper = _find_oracle_wrapper(env_like)
    if oracle_wrapper is None:
        return [], clicked_pixel, None

    # NOTE(review): relies on private wrapper APIs (_build_step_options /
    # _resolve_command); guarded by a broad except on purpose.
    try:
        _selected_target, solve_options = oracle_wrapper._build_step_options()
        found_idx, _ = oracle_wrapper._resolve_command(command, solve_options)
    except Exception:
        return [], clicked_pixel, None

    if found_idx is None or found_idx < 0 or found_idx >= len(solve_options):
        return [], clicked_pixel, None

    option = solve_options[found_idx]
    available = option.get("available")
    # Front-camera calibration cached on the wrapper; may be None — presumably
    # the projection helpers tolerate that (TODO confirm).
    intrinsic_cv = getattr(oracle_wrapper, "_front_camera_intrinsic_cv", None)
    extrinsic_cv = getattr(oracle_wrapper, "_front_camera_extrinsic_cv", None)
    image_shape = getattr(oracle_wrapper, "_front_rgb_shape", None)

    # Project every unique candidate actor of the resolved option into the image.
    candidate_pixels: list[list[int]] = []
    if available is not None:
        for actor in _unique_candidates(available):
            actor_pos = extract_actor_position_xyz(actor)
            if actor_pos is None:
                continue
            projected = project_world_to_pixel(
                actor_pos,
                intrinsic_cv=intrinsic_cv,
                extrinsic_cv=extrinsic_cv,
                image_shape=image_shape,
            )
            if projected is None:
                continue
            candidate_pixels.append([int(projected[0]), int(projected[1])])

    # Re-run click-to-candidate matching to learn which actor was selected.
    matched_pixel: Optional[list[int]] = None
    if available is not None and clicked_pixel is not None:
        matched = select_target_with_pixel(
            available=available,
            pixel_like=clicked_pixel,
            intrinsic_cv=intrinsic_cv,
            extrinsic_cv=extrinsic_cv,
            image_shape=image_shape,
        )
        if isinstance(matched, dict):
            matched_pixel = _normalize_pixel_xy(matched.get("projected_pixel"))

    return candidate_pixels, clicked_pixel, matched_pixel
233
+
234
+
235
def _make_blackboard(frame_like: Any) -> np.ndarray:
    """Return an all-black uint8 canvas (H, W, 3) matching the input frame's size."""
    frame = _to_numpy_copy(frame_like)
    if frame.ndim >= 2:
        h, w = int(frame.shape[0]), int(frame.shape[1])
        if h > 0 and w > 0:
            return np.zeros((h, w, 3), dtype=np.uint8)
    # Degenerate input: fall back to a minimal 1x1 canvas.
    return np.zeros((1, 1, 3), dtype=np.uint8)
243
+
244
+
245
def _draw_candidate_blackboard(
    frame_like: Any,
    candidate_pixels: list[list[int]],
) -> np.ndarray:
    """Render every candidate pixel as a small circle on a black canvas sized like the frame."""
    canvas = _make_blackboard(frame_like)
    for px in candidate_pixels:
        if len(px) >= 2:
            cv2.circle(canvas, (int(px[0]), int(px[1])), 4, (0, 255, 255), 1)
    return canvas
255
+
256
+
257
def _draw_selection_blackboard(
    frame_like: Any,
    clicked_pixel: Optional[list[int]],
    matched_pixel: Optional[list[int]],
) -> np.ndarray:
    """Draw the raw click (tilted cross) and the matched target (circle) on a black canvas."""
    canvas = _make_blackboard(frame_like)
    if clicked_pixel is not None:
        cv2.drawMarker(
            canvas,
            (int(clicked_pixel[0]), int(clicked_pixel[1])),
            (255, 255, 0),
            markerType=cv2.MARKER_TILTED_CROSS,
            markerSize=10,
            thickness=1,
        )
    if matched_pixel is not None:
        cv2.circle(
            canvas,
            (int(matched_pixel[0]), int(matched_pixel[1])),
            5,
            (255, 0, 0),
            2,
        )
    return canvas
275
+
276
+
277
def init_worker(gpu_id: int):
    """
    Worker process initialization function, sets CUDA_VISIBLE_DEVICES.
    """
    # Configure logging inside the freshly spawned worker process.
    from robomme.logging_utils import setup_logging
    setup_logging(level="DEBUG")
    # Pin this worker to a single GPU. NOTE(review): assumes CUDA has not been
    # initialized in this process before this point — confirm.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    # print(f"[Worker] Initialized on GPU {gpu_id} (PID: {os.getpid()})")
285
+
286
+ def evaluate_episode(
287
+ env_id: str,
288
+ episode: int,
289
+ dataset_root: str,
290
+ override_metadata_path: str,
291
+ action_space: str,
292
+ out_video_dir: str,
293
+ gui_render: bool
294
+ ) -> str:
295
+ """
296
+ Evaluation logic for a single Episode.
297
+ """
298
+ # Reconstruct envs in worker process (avoid passing complex objects across processes)
299
+ env_builder = BenchmarkEnvBuilder(
300
+ env_id=env_id,
301
+ dataset="train",
302
+ action_space=action_space,
303
+ gui_render=gui_render,
304
+ override_metadata_path=override_metadata_path,
305
+ )
306
+
307
+ env = None
308
+
309
+ try:
310
+ env = env_builder.make_env_for_episode(
311
+ episode,
312
+ max_steps=MAX_STEPS,
313
+ include_maniskill_obs=True,
314
+ include_front_depth=True,
315
+ include_wrist_depth=True,
316
+ include_front_camera_extrinsic=True,
317
+ include_wrist_camera_extrinsic=True,
318
+ include_available_multi_choices=True,
319
+ include_front_camera_intrinsic=True,
320
+ include_wrist_camera_intrinsic=True,
321
+ )
322
+
323
+ file_path = Path(dataset_root) / f"record_dataset_{env_id}.h5"
324
+ if not file_path.exists():
325
+ raise FileNotFoundError(f"dataset file not found: {file_path}")
326
+ episode_key = f"episode_{episode}"
327
+ with h5py.File(file_path, "r") as data:
328
+ if episode_key not in data:
329
+ raise KeyError(f"missing key '{episode_key}' in {file_path}")
330
+ action_sequence = _build_action_sequence(data[episode_key], action_space)
331
+ print(
332
+ f"[{env_id}] episode={episode} h5={file_path} "
333
+ f"episode_key={episode_key} action_space={action_space} "
334
+ f"action_count={len(action_sequence)}"
335
+ )
336
+
337
+ # obs_batch, reward_batch, terminated_batch, truncated_batch, info_batch = env.reset()
338
+ obs_batch, info_batch = env.reset()
339
+
340
+ # Maintain debug variable semantics from subgoal_evaluate_func.py
341
+ # Note: These local variables in multi-processing can be simplified if printing is not needed, but unpacking logic is retained for consistency.
342
+ maniskill_obs = obs_batch["maniskill_obs"]
343
+ front_camera = _to_frame_list(obs_batch["front_rgb_list"])
344
+ wrist_camera = _to_frame_list(obs_batch["wrist_rgb_list"])
345
+ # Other variables unpacking skipped unless used downstream
346
+
347
+ task_goal_list = info_batch["task_goal"]
348
+ # task_goal = task_goal_list[0] if task_goal_list else None
349
+
350
+ info = {k: v[-1] if isinstance(v, list) and v else v for k, v in info_batch.items()}
351
+ # terminated = bool(terminated_batch[-1].item())
352
+ # truncated = bool(truncated_batch[-1].item())
353
+
354
+ # ######## Video saving variable preparation (reset phase) start ########
355
+ reset_base_frames = [_to_numpy_copy(f) for f in front_camera]
356
+ reset_wrist_frames = [_to_numpy_copy(f) for f in wrist_camera]
357
+ reset_right_frames = (
358
+ [_make_blackboard(f) for f in reset_base_frames]
359
+ if action_space == "multi_choice"
360
+ else None
361
+ )
362
+ reset_far_right_frames = (
363
+ [_make_blackboard(f) for f in reset_base_frames]
364
+ if action_space == "multi_choice"
365
+ else None
366
+ )
367
+ _subgoal = info_batch.get("grounded_subgoal_online", "")
368
+ reset_subgoal_grounded = _subgoal if isinstance(_subgoal, list) else [_subgoal] * len(reset_base_frames)
369
+ # ######## Video saving variable preparation (reset phase) end ########
370
+
371
+ # ######## Video saving variable initialization start ########
372
+ step = 0
373
+ episode_success = False
374
+ rollout_base_frames: list[np.ndarray] = []
375
+ rollout_wrist_frames: list[np.ndarray] = []
376
+ rollout_right_frames: list[np.ndarray] = []
377
+ rollout_far_right_frames: list[np.ndarray] = []
378
+ rollout_subgoal_grounded: list[Any] = []
379
+ # ######## Video saving variable initialization end ########
380
+
381
+ for _, action in enumerate(action_sequence):
382
+ if action_space == "multi_choice":
383
+ action = _parse_oracle_command(action)
384
+ if action is None:
385
+ continue
386
+
387
+ candidate_pixels: list[list[int]] = []
388
+ clicked_pixel: Optional[list[int]] = None
389
+ matched_pixel: Optional[list[int]] = None
390
+ if action_space == "multi_choice":
391
+ candidate_pixels, clicked_pixel, matched_pixel = _collect_multi_choice_visualization(
392
+ env, action
393
+ )
394
+
395
+ obs_batch, reward_batch, terminated_batch, truncated_batch, info_batch = env.step(action)
396
+
397
+ # Maintain debug variable semantics from subgoal_evaluate_func.py
398
+ front_camera = _to_frame_list(obs_batch["front_rgb_list"])
399
+ wrist_camera = _to_frame_list(obs_batch["wrist_rgb_list"])
400
+
401
+ subgoal_grounded = info_batch["grounded_subgoal_online"]
402
+
403
+ # ######## Video saving variable preparation (replay phase) start ########
404
+ rollout_base_frames.extend(_to_numpy_copy(f) for f in front_camera)
405
+ rollout_wrist_frames.extend(_to_numpy_copy(f) for f in wrist_camera)
406
+ if action_space == "multi_choice":
407
+ for base_frame in front_camera:
408
+ rollout_right_frames.append(
409
+ _draw_candidate_blackboard(
410
+ base_frame,
411
+ candidate_pixels=candidate_pixels,
412
+ )
413
+ )
414
+ rollout_far_right_frames.append(
415
+ _draw_selection_blackboard(
416
+ base_frame,
417
+ clicked_pixel=clicked_pixel,
418
+ matched_pixel=matched_pixel,
419
+ )
420
+ )
421
+ if isinstance(subgoal_grounded, list):
422
+ rollout_subgoal_grounded.extend(subgoal_grounded)
423
+ else:
424
+ rollout_subgoal_grounded.extend([subgoal_grounded] * len(front_camera))
425
+ # ######## Video saving variable preparation (replay phase) end ########
426
+
427
+ info = {k: v[-1] if isinstance(v, list) and v else v for k, v in info_batch.items()}
428
+ terminated = bool(terminated_batch.item())
429
+ truncated = bool(truncated_batch.item())
430
+
431
+ step += 1
432
+ if gui_render:
433
+ env.render()
434
+
435
+ if truncated:
436
+ # print(f"[{env_id}] episode {episode} step limit exceeded, step {step}.")
437
+ break
438
+ if terminated:
439
+ succ = info.get("success")
440
+ if succ == torch.tensor([True]) or (
441
+ isinstance(succ, torch.Tensor) and succ.item()
442
+ ):
443
+ # print(f"[{env_id}] episode {episode} success.")
444
+ episode_success = True
445
+ elif info.get("fail", False):
446
+ # print(f"[{env_id}] episode {episode} failed.")
447
+ pass
448
+ break
449
+
450
+ # ######## Video saving section start ########
451
+ save_robomme_video(
452
+ reset_base_frames=reset_base_frames,
453
+ reset_wrist_frames=reset_wrist_frames,
454
+ rollout_base_frames=rollout_base_frames,
455
+ rollout_wrist_frames=rollout_wrist_frames,
456
+ reset_subgoal_grounded=reset_subgoal_grounded,
457
+ rollout_subgoal_grounded=rollout_subgoal_grounded,
458
+ out_video_dir=out_video_dir,
459
+ action_space=action_space,
460
+ env_id=env_id,
461
+ episode=episode,
462
+ episode_success=episode_success,
463
+ reset_right_frames=reset_right_frames if action_space == "multi_choice" else None,
464
+ rollout_right_frames=rollout_right_frames if action_space == "multi_choice" else None,
465
+ reset_far_right_frames=(
466
+ reset_far_right_frames if action_space == "multi_choice" else None
467
+ ),
468
+ rollout_far_right_frames=(
469
+ rollout_far_right_frames if action_space == "multi_choice" else None
470
+ ),
471
+ )
472
+ # ######## Video saving section end ########
473
+
474
+ status = "Success" if episode_success else "Ended"
475
+ if not episode_success and info.get("fail", False):
476
+ status = "Failed"
477
+ return f"[{env_id}] episode {episode} {status} (step {step})"
478
+
479
+ except (FileNotFoundError, KeyError) as exc:
480
+ return f"[{env_id}] episode {episode} data missing, skip. {exc}"
481
+ except Exception as exc:
482
+ # import traceback
483
+ # traceback.print_exc()
484
+ return f"[{env_id}] episode {episode} replay exception, skip. {exc}"
485
+ finally:
486
+ if env is not None:
487
+ env.close()
488
+
489
+ def _parse_gpus(s: str) -> list[int]:
490
+ """Parse --gpus: '0' -> [0], '1' -> [1], '0,1' -> [0, 1]."""
491
+ allowed = {"0", "1", "0,1", "1,0"}
492
+ v = s.strip()
493
+ if v not in allowed:
494
+ raise argparse.ArgumentTypeError(
495
+ f"--gpus must be one of: 0, 1, 0,1 (got {s!r})"
496
+ )
497
+ if "," in v:
498
+ return [int(x) for x in v.split(",")]
499
+ return [int(v)]
500
+
501
def _parse_action_spaces(s: str) -> list[str]:
    """Parse the comma-separated --action_spaces value.

    Returns the requested action spaces, de-duplicated while preserving the
    first occurrence order. Raises argparse.ArgumentTypeError on an empty
    value or any token not present in AVAILABLE_ACTION_SPACES.
    """
    tokens = [part.strip() for part in s.split(",") if part.strip()]
    if not tokens:
        raise argparse.ArgumentTypeError(
            "--action_spaces cannot be empty. "
            f"Allowed action spaces: {AVAILABLE_ACTION_SPACES}"
        )

    # Report every invalid token (duplicates included) in one error.
    invalid = [token for token in tokens if token not in AVAILABLE_ACTION_SPACES]
    if invalid:
        raise argparse.ArgumentTypeError(
            f"Invalid action space(s): {invalid}. "
            f"Allowed action spaces: {AVAILABLE_ACTION_SPACES}"
        )

    # dict.fromkeys de-duplicates while keeping first-seen order.
    selected = list(dict.fromkeys(tokens))
    if not selected:
        # Defensive guard; unreachable because tokens is non-empty and all valid here.
        raise argparse.ArgumentTypeError(
            "--action_spaces has no valid value after parsing. "
            f"Allowed action spaces: {AVAILABLE_ACTION_SPACES}"
        )
    return selected
533
+
534
def _parse_args() -> argparse.Namespace:
    """Build and parse the CLI arguments for the replay script.

    Returns:
        argparse.Namespace with fields: envid, max_workers, gpus, action_spaces.
    """
    parser = argparse.ArgumentParser(description="Replay dataset for one env_id in parallel.")
    parser.add_argument(
        "--envid",
        required=False,
        type=str,
        default=None,
        help="Single environment id to replay.",
    )
    parser.add_argument(
        "--max_workers",
        type=int,
        default=20,
        help="Total max workers (split across GPUs when using 2 GPUs).",
    )
    parser.add_argument(
        "--gpus",
        type=_parse_gpus,
        default=[1],
        # Fix: help text previously claimed "Default: 0" while the actual default is GPU 1.
        help="GPUs to use: '0' (GPU 0 only), '1' (GPU 1 only), '0,1' (both). Default: 1.",
    )
    parser.add_argument(
        "--action_spaces",
        type=_parse_action_spaces,
        default=["multi_choice"],
        # Fix: help text previously claimed all four spaces as default, but the
        # actual default is only 'multi_choice'.
        help=(
            "Comma-separated action spaces to replay in order. "
            "Available: joint_angle,ee_pose,waypoint,multi_choice. "
            "Default: multi_choice."
        ),
    )
    return parser.parse_args()
567
+
568
def process_env_id(
    env_id: str,
    max_workers_total: int,
    gpu_ids: list[int],
    action_spaces: list[str],
):
    """Replay every episode of one environment for each requested action space.

    Episodes are fanned out to one or two process pools (one per GPU). The
    worker budget is split roughly in half when two GPUs are used.

    Args:
        env_id: Environment identifier to replay.
        max_workers_total: Total worker budget across all pools.
        gpu_ids: GPUs to use; the code supports exactly one or two entries.
        action_spaces: Action spaces to replay, in order.
    """
    # Lightweight builder instantiation just to read episode metadata.
    temp_builder = BenchmarkEnvBuilder(
        env_id=env_id,
        dataset="train",
        action_space=action_spaces[0],
        gui_render=False,  # Just to read metadata
        override_metadata_path=OVERRIDE_METADATA_PATH,
    )
    episode_count = temp_builder.get_episode_num()
    print(f"[{env_id}] episodes={episode_count}")
    print(f"Parallel execution with max_workers={max_workers_total} on GPU(s) {gpu_ids}")

    if episode_count == 0:
        print(f"[{env_id}] No episodes to replay, skip.")
        return

    n_gpus = len(gpu_ids)
    if n_gpus == 1:
        mw0 = max(max_workers_total, 1)
        mw1 = 0
        print(f"Pool (GPU {gpu_ids[0]}): {mw0} workers")
    else:
        # Split the budget; the first pool gets the extra worker on odd totals.
        mw0 = (max_workers_total + 1) // 2
        mw1 = max_workers_total // 2
        if mw0 == 0:
            mw0 = 1
        if mw1 == 0 and max_workers_total > 1:
            mw1 = 1
        print(f"Pool 0 (GPU {gpu_ids[0]}): {mw0} workers")
        print(f"Pool 1 (GPU {gpu_ids[1]}): {mw1} workers")

    def _submit(executor, action_space: str, episode: int):
        """Submit one episode replay task; shared by both pool layouts."""
        return executor.submit(
            evaluate_episode,
            env_id=env_id,
            episode=episode,
            dataset_root=DATASET_ROOT,
            override_metadata_path=OVERRIDE_METADATA_PATH,
            action_space=action_space,
            out_video_dir=OUT_VIDEO_DIR,
            gui_render=GUI_RENDER,
        )

    for action_space in action_spaces:
        print(f"[{env_id}] >>> action_space={action_space}")
        futures = []

        # Bug fix: with two GPUs but a worker budget that leaves the second
        # pool empty (mw1 == 0, i.e. max_workers_total <= 1), the original code
        # created ProcessPoolExecutor(max_workers=0), which raises ValueError.
        # Fall back to a single pool on GPU 0 in that case.
        if n_gpus == 1 or mw1 == 0:
            g0 = gpu_ids[0]
            with concurrent.futures.ProcessPoolExecutor(
                max_workers=mw0, initializer=init_worker, initargs=(g0,)
            ) as executor0:
                for episode in range(episode_count):
                    futures.append(_submit(executor0, action_space, episode))
                for future in concurrent.futures.as_completed(futures):
                    print(future.result())
        else:
            g0, g1 = gpu_ids[0], gpu_ids[1]
            with concurrent.futures.ProcessPoolExecutor(
                max_workers=mw0, initializer=init_worker, initargs=(g0,)
            ) as executor0, concurrent.futures.ProcessPoolExecutor(
                max_workers=mw1, initializer=init_worker, initargs=(g1,)
            ) as executor1:
                for episode in range(episode_count):
                    # Round-robin episodes across the two GPU pools.
                    executor = executor0 if episode % 2 == 0 else executor1
                    futures.append(_submit(executor, action_space, episode))
                for future in concurrent.futures.as_completed(futures):
                    print(future.result())
        print(f"[{env_id}] <<< action_space={action_space} done")
655
+
656
def main():
    """CLI entry point: set up logging and multiprocessing, then replay each env."""
    from robomme.logging_utils import setup_logging
    setup_logging(level="DEBUG")
    # Force use of spawn to avoid PyTorch/CUDA fork issues
    mp.set_start_method("spawn", force=True)

    args = _parse_args()
    env_ids = [args.envid] if args.envid else DEFAULT_ENV_IDS

    print(f"Plan to replay envs: {env_ids} (gpus={args.gpus})")
    print(f"Available action spaces: {AVAILABLE_ACTION_SPACES}")
    print(f"Selected action spaces: {args.action_spaces}")
    for env_id in env_ids:
        print(f"=== Processing {env_id} ===")
        process_env_id(env_id, args.max_workers, args.gpus, args.action_spaces)
674
+
675
# Script entry point: run the replay pipeline when executed directly.
if __name__ == "__main__":
    main()
scripts/dev/generate-dataset-control-seed-readJson-advanceV3.py ADDED
@@ -0,0 +1,878 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import argparse
4
+ import json
5
+ import shutil
6
+ from concurrent.futures import ProcessPoolExecutor, as_completed
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ from typing import Any, Dict, Iterable, List, Optional, Set
11
+ import h5py
12
+
13
+ import gymnasium as gym
14
+
15
+ # Import Robomme related environment wrappers and exception classes
16
+ from robomme.env_record_wrapper import RobommeRecordWrapper, FailsafeTimeout
17
+ from robomme.robomme_env import *
18
+ from robomme.robomme_env.utils.SceneGenerationError import SceneGenerationError
19
+
20
+ # from util import *
21
+ import torch
22
+
23
+ # Import planner and related exception classes
24
+ from robomme.robomme_env.utils.planner_fail_safe import (
25
+ FailAwarePandaArmMotionPlanningSolver,
26
+ FailAwarePandaStickMotionPlanningSolver,
27
+ ScrewPlanFailure,
28
+ )
29
+
30
+ """
31
+
32
+ Script function: Parallel generation of Robomme environment datasets.
33
+ This script supports multi-process parallel environment simulation, generating HDF5 datasets containing RGB, depth, segmentation, etc.
34
+ Key features include:
35
+ 1. Configure environment list and parameters.
36
+ 2. Parallel execution of multiple episode simulations.
37
+ 3. Use FailAware planner to attempt to solve tasks.
38
+ 4. Record data and save as HDF5 file.
39
+ 5. Merge multiple temporarily generated HDF5 files into a final dataset.
40
+ """
41
+
42
# List of all supported environment module names
DEFAULT_ENVS =[
    "PickXtimes",
    "StopCube",
    "SwingXtimes",
    "BinFill",

    "VideoUnmaskSwap",
    "VideoUnmask",
    "ButtonUnmaskSwap",
    "ButtonUnmask",

    "VideoRepick",
    "VideoPlaceButton",
    "VideoPlaceOrder",
    "PickHighlight",

    "InsertPeg",
    'MoveCube',
    "PatternLock",
    "RouteStick"
]

# Reference dataset metadata root directory: used to read difficulty and seed
SOURCE_METADATA_ROOT = Path("/data/hongzefu/robomme_benchmark/src/robomme/env_metadata/1206")
# Difficulty labels accepted in metadata records (raw values are lower-cased before the check).
VALID_DIFFICULTIES: Set[str] = {"easy", "medium", "hard"}
# Max attempts for screw planning before falling back to RRT*, and for RRT* itself.
DATASET_SCREW_MAX_ATTEMPTS = 3
DATASET_RRT_MAX_ATTEMPTS = 3
70
+
71
+
72
def _load_env_metadata_records(
    env_id: str,
    metadata_root: Path,
) -> List[Dict[str, Any]]:
    """Load and validate the (episode, seed, difficulty) records for *env_id*.

    Reads ``record_dataset_<env_id>_metadata.json`` under *metadata_root* and
    returns its records sorted by episode, with difficulty lower-cased.

    Raises:
        FileNotFoundError: metadata file is missing.
        ValueError: records are absent, malformed, non-integer, or carry a
            difficulty outside VALID_DIFFICULTIES.
    """
    metadata_path = metadata_root / f"record_dataset_{env_id}_metadata.json"
    if not metadata_path.exists():
        raise FileNotFoundError(
            f"Metadata file not found for env '{env_id}': {metadata_path}"
        )

    with metadata_path.open("r", encoding="utf-8") as handle:
        payload = json.load(handle)

    raw_records = payload.get("records")
    if not isinstance(raw_records, list) or not raw_records:
        raise ValueError(
            f"Metadata file has no valid 'records' list: {metadata_path}"
        )

    records: List[Dict[str, Any]] = []
    for idx, entry in enumerate(raw_records):
        if not isinstance(entry, dict):
            raise ValueError(
                f"Invalid metadata record at index {idx} in {metadata_path}"
            )
        if any(key not in entry for key in ("episode", "seed", "difficulty")):
            raise ValueError(
                f"Metadata record missing episode/seed/difficulty at index {idx} in {metadata_path}"
            )

        try:
            episode_num = int(entry["episode"])
            seed_num = int(entry["seed"])
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"Metadata record has non-integer episode/seed at index {idx} in {metadata_path}"
            ) from exc

        difficulty = str(entry["difficulty"]).strip().lower()
        if difficulty not in VALID_DIFFICULTIES:
            raise ValueError(
                f"Metadata record has invalid difficulty '{entry['difficulty']}' "
                f"at index {idx} in {metadata_path}. Expected one of {sorted(VALID_DIFFICULTIES)}."
            )

        records.append(
            {
                "episode": episode_num,
                "seed": seed_num,
                "difficulty": difficulty,
            }
        )

    records.sort(key=lambda rec: rec["episode"])
    print(
        f"Loaded {len(records)} metadata records for {env_id} from {metadata_path}"
    )
    return records
133
+
134
+
135
+ def _build_seed_candidates_from_metadata(
136
+ episode: int,
137
+ metadata_records: List[Dict[str, Any]],
138
+ ) -> List[Dict[str, Any]]:
139
+ """
140
+ Construct candidate (seed, difficulty) list for current episode.
141
+ Strictly use only the seed from metadata for the same episode, no cross-episode fallback.
142
+ """
143
+ if not metadata_records:
144
+ return []
145
+
146
+ same_episode_records = [rec for rec in metadata_records if rec["episode"] == episode]
147
+ if not same_episode_records:
148
+ return []
149
+ if len(same_episode_records) > 1:
150
+ raise ValueError(
151
+ f"Found duplicated metadata records for episode {episode}. "
152
+ "Strict mode requires exactly one source record per episode."
153
+ )
154
+
155
+ rec = same_episode_records[0]
156
+ return [{"seed": int(rec["seed"]), "difficulty": rec["difficulty"]}]
157
+
158
+ def _tensor_to_bool(value) -> bool:
159
+ """
160
+ Helper function: Convert Tensor or numpy array to Python bool type.
161
+ Used to handle success/failure flags from different sources.
162
+ """
163
+ if value is None:
164
+ return False
165
+ if isinstance(value, torch.Tensor):
166
+ return bool(value.detach().cpu().bool().item())
167
+ if isinstance(value, np.ndarray):
168
+ return bool(np.any(value))
169
+ return bool(value)
170
+
171
+
172
+ def _split_episode_indices(num_episodes: int, max_chunks: int) -> List[List[int]]:
173
+ """
174
+ Helper function: Split total episodes into multiple chunks for parallel processing by different processes.
175
+
176
+ Args:
177
+ num_episodes: Total number of episodes
178
+ max_chunks: Max number of chunks (usually equals number of workers)
179
+
180
+ Returns:
181
+ List containing lists of episode indices
182
+ """
183
+ if num_episodes <= 0:
184
+ return []
185
+
186
+ chunk_count = min(max_chunks, num_episodes)
187
+ base_size, remainder = divmod(num_episodes, chunk_count)
188
+
189
+ chunks: List[List[int]] = []
190
+ start = 0
191
+ for chunk_idx in range(chunk_count):
192
+ # If there is a remainder, allocate one extra episode to the first 'remainder' chunks
193
+ stop = start + base_size + (1 if chunk_idx < remainder else 0)
194
+ chunks.append(list(range(start, stop)))
195
+ start = stop
196
+
197
+ return chunks
198
+
199
+
200
def _run_episode_attempt(
    env_id: str,
    episode: int,
    seed: int,
    temp_dataset_path: Path,
    save_video: bool,
    difficulty: Optional[str],
) -> bool:
    """Run a single episode attempt and report success or failure.

    Builds the gym env, wraps it with RobommeRecordWrapper for HDF5/video
    recording, patches the planner's screw motion with a screw->RRT* retry
    chain, then executes the env's task list one task at a time.

    Args:
        env_id: Registered environment id passed to gym.make.
        episode: Episode index (also selects the failure-recovery demo mode
            for episodes <= 5).
        seed: Random seed for the env.
        temp_dataset_path: Path handed to the record wrapper as the dataset
            target (the wrapper manages actual file layout).
        save_video: Whether the record wrapper should also save videos.
        difficulty: Difficulty string forwarded to the env (may be None).

    Returns:
        True if the episode finished successfully, False otherwise
        (including planner exhaustion, task failure, or scene-generation
        errors). Exceptions other than SceneGenerationError propagate.
    """
    print(f"--- Running simulation for episode:{episode}, seed:{seed}, env: {env_id} ---")

    env: Optional[gym.Env] = None
    try:
        # 1. Environment parameter configuration
        env_kwargs = dict(
            obs_mode="rgb+depth+segmentation",  # Observation mode: RGB + Depth + Segmentation
            control_mode="pd_joint_pos",  # Control mode: Position control
            render_mode="rgb_array",  # Render mode
            reward_mode="dense",  # Reward mode
            seed=seed,  # Random seed
            difficulty=difficulty,  # Difficulty setting
        )

        # Special failure recovery settings for the first few episodes (for testing or demonstration purposes only)
        if episode <= 5:
            env_kwargs["robomme_failure_recovery"] = True
            if episode <=2:
                env_kwargs["robomme_failure_recovery_mode"] = "z"  # z-axis recovery
            else:
                env_kwargs["robomme_failure_recovery_mode"] = "xy"  # xy-axis recovery


        env = gym.make(env_id, **env_kwargs)

        # 2. Wrap environment to record data
        env = RobommeRecordWrapper(
            env,
            dataset=str(temp_dataset_path),  # Data save path
            env_id=env_id,
            episode=episode,
            seed=seed,
            save_video=save_video,

        )

        episode_successful = False


        env.reset()

        # 3. Select planner
        # PatternLock and RouteStick require Stick planner, others use Arm planner
        if env_id == "PatternLock" or env_id == "RouteStick":
            planner = FailAwarePandaStickMotionPlanningSolver(
                env,
                debug=False,
                vis=False,
                base_pose=env.unwrapped.agent.robot.pose,
                visualize_target_grasp_pose=False,
                print_env_info=False,
                joint_vel_limits=0.3,
            )
        else:
            planner = FailAwarePandaArmMotionPlanningSolver(
                env,
                debug=False,
                vis=False,
                base_pose=env.unwrapped.agent.robot.pose,
                visualize_target_grasp_pose=False,
                print_env_info=False,
            )

        # Keep references to the unpatched planner methods for the retry wrapper below.
        original_move_to_pose_with_screw = planner.move_to_pose_with_screw
        original_move_to_pose_with_rrt = planner.move_to_pose_with_RRTStar

        def _move_to_pose_with_screw_then_rrt_retry(*args, **kwargs):
            # Retry screw planning up to DATASET_SCREW_MAX_ATTEMPTS times; a raised
            # ScrewPlanFailure or an int return of -1 both count as failure.
            for attempt in range(1, DATASET_SCREW_MAX_ATTEMPTS + 1):
                try:
                    result = original_move_to_pose_with_screw(*args, **kwargs)
                except ScrewPlanFailure as exc:
                    print(
                        f"[DatasetGen] screw planning failed "
                        f"(attempt {attempt}/{DATASET_SCREW_MAX_ATTEMPTS}): {exc}"
                    )
                    continue

                if isinstance(result, int) and result == -1:
                    print(
                        f"[DatasetGen] screw planning returned -1 "
                        f"(attempt {attempt}/{DATASET_SCREW_MAX_ATTEMPTS})"
                    )
                    continue

                return result

            print(
                "[DatasetGen] screw planning exhausted; "
                f"fallback to RRT* (max {DATASET_RRT_MAX_ATTEMPTS} attempts)"
            )

            # Fallback: RRT* planning, also retried; any exception counts as failure.
            for attempt in range(1, DATASET_RRT_MAX_ATTEMPTS + 1):
                try:
                    result = original_move_to_pose_with_rrt(*args, **kwargs)
                except Exception as exc:
                    print(
                        f"[DatasetGen] RRT* planning failed "
                        f"(attempt {attempt}/{DATASET_RRT_MAX_ATTEMPTS}): {exc}"
                    )
                    continue

                if isinstance(result, int) and result == -1:
                    print(
                        f"[DatasetGen] RRT* planning returned -1 "
                        f"(attempt {attempt}/{DATASET_RRT_MAX_ATTEMPTS})"
                    )
                    continue

                return result

            # -1 is the sentinel the task loop below treats as planner exhaustion.
            print("[DatasetGen] screw->RRT* planning exhausted; return -1")
            return -1

        # Monkey-patch the planner so every screw move gains the retry/fallback chain.
        planner.move_to_pose_with_screw = _move_to_pose_with_screw_then_rrt_retry

        env.unwrapped.evaluate()
        # Get environment task list
        tasks = list(getattr(env.unwrapped, "task_list", []) or [])

        print(f"{env_id}: Task list has {len(tasks)} tasks")

        # 4. Iterate and execute all subtasks
        for idx, task_entry in enumerate(tasks):
            task_name = task_entry.get("name", f"Task {idx}")
            print(f"Executing task {idx + 1}/{len(tasks)}: {task_name}")

            solve_callable = task_entry.get("solve")
            if not callable(solve_callable):
                raise ValueError(
                    f"Task '{task_name}' must supply a callable 'solve'."
                )

            # Evaluate once before executing solve
            env.unwrapped.evaluate(solve_complete_eval=True)
            screw_failed = False
            try:
                # 5. Call planner to solve current task
                solve_result = solve_callable(env, planner)
                if isinstance(solve_result, int) and solve_result == -1:
                    # Planner exhaustion sentinel: mark the env as failed.
                    screw_failed = True
                    print(f"Screw->RRT* planning exhausted during '{task_name}'")
                    env.unwrapped.failureflag = torch.tensor([True])
                    env.unwrapped.successflag = torch.tensor([False])
                    env.unwrapped.current_task_failure = True
            except ScrewPlanFailure as exc:
                # Planning failure handling
                screw_failed = True
                print(f"Screw plan failure during '{task_name}': {exc}")
                env.unwrapped.failureflag = torch.tensor([True])
                env.unwrapped.successflag = torch.tensor([False])
                env.unwrapped.current_task_failure = True
            except FailsafeTimeout as exc:
                # Timeout handling
                print(f"Failsafe: {exc}")
                break

            # Evaluation after task execution
            evaluation = env.unwrapped.evaluate(solve_complete_eval=True)

            fail_flag = evaluation.get("fail", False)
            success_flag = evaluation.get("success", False)

            # 6. Check success/failure conditions
            if _tensor_to_bool(success_flag):
                print("All tasks completed successfully.")
                episode_successful = True
                break

            if screw_failed or _tensor_to_bool(fail_flag):
                print("Encountered failure condition; stopping task sequence.")
                break

        else:
            # If loop ends normally (no break), check success again
            evaluation = env.unwrapped.evaluate(solve_complete_eval=True)
            episode_successful = _tensor_to_bool(evaluation.get("success", False))

        # 7. Prioritize wrapper's success signal (double check)
        episode_successful = episode_successful or _tensor_to_bool(
            getattr(env, "episode_success", False)
        )

    except SceneGenerationError as exc:  # Scene generation failure may occur in environments like swingxtimes
        print(
            f"Scene generation failed for env {env_id}, episode {episode}, seed {seed}: {exc}"
        )
        episode_successful = False
    finally:
        if env is not None:
            try:
                env.close()
            except Exception as close_exc:
                # Even if close() fails, return success if episode was successful
                # Because HDF5 data was written before close() (in write() method)
                print(f"Warning: Exception during env.close() for episode {episode}, seed {seed}: {close_exc}")
                # If episode was successful, close() exception should not affect return value
                # episode_successful was determined before close()

    status_text = "SUCCESS" if episode_successful else "FAILED"
    print(
        f"--- Finished Running simulation for episode:{episode}, seed:{seed}, env: {env_id} [{status_text}] ---"
    )

    return episode_successful
424
+
425
+
426
def run_env_dataset(
    env_id: str,
    episode_indices: Iterable[int],
    temp_folder: Path,
    save_video: bool,
    metadata_records: List[Dict[str, Any]],
    gpu_id: int,
) -> List[Dict[str, Any]]:
    """Run dataset generation for a batch of episodes and save data to temporary folder.

    For each episode, the seed/difficulty candidate from the reference
    metadata is tried; on failure the seed is incremented (up to 20 attempts)
    before the episode is given up on.

    Args:
        env_id: Environment ID (must be a member of DEFAULT_ENVS).
        episode_indices: List of episode indices to run.
        temp_folder: Temporary folder to save data (created if missing).
        save_video: Whether to save video.
        metadata_records: Records from reference dataset metadata.
        gpu_id: GPU ID to use (sets CUDA_VISIBLE_DEVICES for this process).

    Returns:
        Metadata records (task/episode/seed/difficulty) for the episodes that
        succeeded; failed episodes are omitted.

    Raises:
        ValueError: env_id is not in DEFAULT_ENVS.
    """
    # Set GPU used by current process.
    # NOTE(review): this mutates the whole process environment — intended to run
    # inside a dedicated worker process.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    temp_folder.mkdir(parents=True, exist_ok=True)
    episode_indices = list(episode_indices)
    if not episode_indices:
        return []

    if env_id not in DEFAULT_ENVS:
        raise ValueError(f"Unsupported environment: {env_id}")

    # Pass a temporary h5 path to the wrapper.
    # Note: the wrapper actually creates separate episode files in a subfolder
    # of that path's directory.
    temp_dataset_path = temp_folder / f"temp_chunk.h5"
    episode_records: List[Dict[str, Any]] = []

    for episode in episode_indices:
        candidate_pairs = _build_seed_candidates_from_metadata(episode, metadata_records)
        if not candidate_pairs:
            print(f"Episode {episode}: no metadata candidate seeds found, skipping.")
            continue

        episode_success = False
        # Max attempts per candidate seed; each retry bumps the seed by one.
        MAX_RETRY_ATTEMPTS = 20

        for attempt_idx, candidate in enumerate(candidate_pairs, start=1):
            base_seed = int(candidate["seed"])
            difficulty = str(candidate["difficulty"])

            current_seed = base_seed
            for retry_count in range(MAX_RETRY_ATTEMPTS):
                if retry_count > 0:
                    current_seed += 1

                print(
                    f"Episode {episode} attempt {retry_count + 1}/{MAX_RETRY_ATTEMPTS} "
                    f"with seed={current_seed} (base={base_seed}, diff={difficulty})"
                )

                try:
                    success = _run_episode_attempt(
                        env_id=env_id,
                        episode=episode,
                        seed=current_seed,
                        temp_dataset_path=temp_dataset_path,
                        save_video=save_video,
                        difficulty=difficulty,
                    )

                    if success:
                        # Record successful episode information
                        episode_records.append(
                            {
                                "task": env_id,
                                "episode": episode,
                                "seed": current_seed,
                                "difficulty": difficulty,
                            }
                        )
                        episode_success = True
                        break  # Break retry loop (seed increment loop)

                    print(
                        f"Episode {episode} failed with seed {current_seed}; retrying with seed+1..."
                    )
                except Exception as exc:
                    # Best-effort: any attempt-level exception counts as a failed
                    # attempt and moves on to the next seed.
                    print(
                        f"Episode {episode} exception with seed {current_seed}: {exc}; retrying with seed+1..."
                    )

            if episode_success:
                break  # Break candidate loop

        if not episode_success:
            print(
                f"Episode {episode} failed with strict source metadata seed; "
                "metadata will not be recorded for this episode."
            )

    return episode_records
528
+
529
+
530
def _merge_dataset_from_folder(
    env_id: str,
    temp_folder: Path,
    final_dataset_path: Path,
) -> None:
    """Merge all episode files from the temporary folder into the final dataset.

    Scans "*_hdf5_files" subfolders produced by RobommeRecordWrapper, copies
    each env group / episode into the final HDF5 file (overwriting duplicate
    episodes), moves any recorded videos next to the final dataset, and then
    deletes the temporary folder. All per-file errors are logged and skipped
    (best-effort merge).

    Args:
        env_id: Environment ID. NOTE(review): currently unused in the body;
            kept for interface symmetry with the other per-env helpers.
        temp_folder: Temporary folder containing episode files.
        final_dataset_path: Final output HDF5 file path (opened in append mode).
    """
    if not temp_folder.exists() or not temp_folder.is_dir():
        print(f"Warning: Temporary folder {temp_folder} does not exist")
        return

    final_dataset_path.parent.mkdir(parents=True, exist_ok=True)

    # Find subfolders created by RobommeRecordWrapper
    # It usually creates directories ending with "_hdf5_files"
    hdf5_folders = list(temp_folder.glob("*_hdf5_files"))

    if not hdf5_folders:
        print(f"Warning: No HDF5 folders found in {temp_folder}")
        return

    print(f"Merging episodes from {temp_folder} into {final_dataset_path}")

    # Open final HDF5 file for append mode writing
    with h5py.File(final_dataset_path, "a") as final_file:
        for hdf5_folder in sorted(hdf5_folders):
            # Get all h5 files in folder
            h5_files = sorted(hdf5_folder.glob("*.h5"))

            if not h5_files:
                print(f"Warning: No h5 files found in {hdf5_folder}")
                continue

            print(f"Found {len(h5_files)} episode files in {hdf5_folder.name}")

            # Merge each episode file
            for h5_file in h5_files:
                print(f" - Merging {h5_file.name}")

                try:
                    with h5py.File(h5_file, "r") as episode_file:
                        file_keys = list(episode_file.keys())
                        if len(file_keys) == 0:
                            print(f" Warning: {h5_file.name} is empty, skipping...")
                            continue

                        for env_group_name, src_env_group in episode_file.items():
                            episode_keys = list(src_env_group.keys()) if isinstance(src_env_group, h5py.Group) else []
                            if len(episode_keys) == 0:
                                print(f" Warning: {env_group_name} in {h5_file.name} has no episodes, skipping...")
                                continue

                            # If environment group (e.g. 'PickXtimes') does not exist, copy directly
                            if env_group_name not in final_file:
                                final_file.copy(src_env_group, env_group_name)
                                continue

                            dest_env_group = final_file[env_group_name]
                            if not isinstance(dest_env_group, h5py.Group):
                                print(f" Warning: {env_group_name} is not a group, skipping...")
                                continue

                            # If environment group exists, copy episodes one by one
                            for episode_name in src_env_group.keys():
                                if episode_name in dest_env_group:
                                    # Last writer wins on duplicate episode names.
                                    print(f" Warning: Episode {episode_name} already exists, overwriting...")
                                    del dest_env_group[episode_name]
                                src_env_group.copy(episode_name, dest_env_group, name=episode_name)
                except Exception as e:
                    # Best-effort merge: a corrupt episode file must not abort the rest.
                    print(f" Error merging {h5_file.name}: {e}")
                    continue

    # Keep videos: wrapper writes videos to 'videos' under temp dir, move to final dir before cleanup
    temp_videos_dir = temp_folder / "videos"
    final_videos_dir = final_dataset_path.parent / "videos"
    if temp_videos_dir.exists() and temp_videos_dir.is_dir():
        final_videos_dir.mkdir(parents=True, exist_ok=True)
        moved_count = 0
        for video_path in sorted(temp_videos_dir.glob("*.mp4")):
            target_path = final_videos_dir / video_path.name
            if target_path.exists():
                # Name collision: pick the first free "<stem>_dupN<suffix>" path.
                stem = target_path.stem
                suffix = target_path.suffix
                index = 1
                while True:
                    candidate = final_videos_dir / f"{stem}_dup{index}{suffix}"
                    if not candidate.exists():
                        target_path = candidate
                        break
                    index += 1
            try:
                shutil.move(str(video_path), str(target_path))
                moved_count += 1
            except Exception as exc:
                print(f"Warning: Failed to move video {video_path.name}: {exc}")
        if moved_count > 0:
            print(f"Moved {moved_count} videos to {final_videos_dir}")

    # Clean up temporary folder after successful merge
    try:
        shutil.rmtree(temp_folder)
        print(f"Cleaned up temporary folder: {temp_folder}")
    except Exception as e:
        print(f"Warning: Failed to remove temporary folder {temp_folder}: {e}")
640
+
641
+
642
+ def _save_episode_metadata(
643
+ records: List[Dict[str, Any]],
644
+ metadata_path: Path,
645
+ env_id: str,
646
+ ) -> None:
647
+ """Save seed/difficulty metadata for each episode to JSON file."""
648
+ metadata_path.parent.mkdir(parents=True, exist_ok=True)
649
+ sorted_records = sorted(records, key=lambda rec: rec.get("episode", -1))
650
+ metadata = {
651
+ "env_id": env_id,
652
+ "record_count": len(sorted_records),
653
+ "records": sorted_records,
654
+ }
655
+ try:
656
+ with metadata_path.open("w", encoding="utf-8") as metadata_file:
657
+ json.dump(metadata, metadata_file, indent=2)
658
+ print(f"Saved episode metadata to {metadata_path}")
659
+ except Exception as exc:
660
+ print(f"Warning: Failed to save episode metadata to {metadata_path}: {exc}")
661
+
662
+
663
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for the dataset generator.

    Returns:
        Namespace with fields: ``env`` (list[str] | None), ``episodes`` (int),
        ``save_video`` (bool, default True), ``max_workers`` (int), and
        ``gpus`` (str GPU spec, parsed later by ``_parse_gpu_ids``).
    """
    parser = argparse.ArgumentParser(description="Robomme Dataset Generator")
    parser.add_argument(
        "--env",
        "-e",
        type=str,
        nargs="+",
        default=None,
        help="Environment IDs to run. Provide one or more values; defaults to all built-in Robomme environments.",
    )
    parser.add_argument(
        "--episodes",
        "-n",
        type=int,
        default=100,
        help="Number of episodes generated per environment (Default: 100)",
    )
    # --save-video / --no-save-video form an on/off pair sharing dest=save_video.
    parser.add_argument(
        "--save-video",
        dest="save_video",
        action="store_true",
        default=True,
        help="Enable video recording via RobommeRecordWrapper (Default: Enabled).",
    )
    parser.add_argument(
        "--no-save-video",
        dest="save_video",
        action="store_false",
        help="Disable video recording.",
    )
    parser.add_argument(
        "--max-workers",
        "-w",
        type=int,
        default=20,
        help="Number of parallel workers when running multiple environments.",
    )
    parser.add_argument(
        "--gpus",
        type=str,
        default="1",
        # Fixed: the help text previously claimed the default was '0',
        # contradicting the actual default of '1'.
        help="GPU selection. Supported values: '0', '1', '0,1' (or '1,0'). Default: '1'.",
    )
    return parser.parse_args()
707
+
708
+
709
+ def _parse_gpu_ids(gpu_spec: str) -> List[int]:
710
+ """Parse user GPU spec string to a deduplicated GPU id list."""
711
+ valid_gpu_ids = {0, 1}
712
+ raw_tokens = [token.strip() for token in gpu_spec.split(",") if token.strip()]
713
+ if not raw_tokens:
714
+ raise ValueError("GPU spec is empty. Use one of: 0, 1, 0,1")
715
+
716
+ gpu_ids: List[int] = []
717
+ for token in raw_tokens:
718
+ try:
719
+ gpu_id = int(token)
720
+ except ValueError as exc:
721
+ raise ValueError(
722
+ f"Invalid GPU id '{token}'. Supported values are 0 and 1."
723
+ ) from exc
724
+
725
+ if gpu_id not in valid_gpu_ids:
726
+ raise ValueError(
727
+ f"Unsupported GPU id '{gpu_id}'. Supported values are 0 and 1."
728
+ )
729
+ if gpu_id not in gpu_ids:
730
+ gpu_ids.append(gpu_id)
731
+
732
+ if not gpu_ids:
733
+ raise ValueError("No valid GPU id provided. Use one of: 0, 1, 0,1")
734
+ return gpu_ids
735
+
736
+
737
def main() -> None:
    """Entry point: generate episode datasets for each requested environment.

    For every environment id the pipeline:
      1. loads per-episode source metadata,
      2. runs episodes (single worker, or chunked across a process pool with
         round-robin GPU assignment),
      3. merges the per-worker HDF5 outputs into one final dataset file,
      4. writes a JSON metadata sidecar next to the final dataset.
    """
    args = parse_args()
    env_inputs = args.env or DEFAULT_ENVS
    env_ids: List[str] = []
    # Parse environment list arguments; each CLI token may itself be a
    # comma-separated list, so split and strip every token.
    for raw_env in env_inputs:
        env_ids.extend(env.strip() for env in raw_env.split(",") if env.strip())

    if not env_ids:
        env_ids = DEFAULT_ENVS.copy()

    num_workers = max(1, args.max_workers)
    gpu_spec = args.gpus
    gpu_ids = _parse_gpu_ids(gpu_spec)
    episode_indices = list(range(args.episodes))

    for env_id in env_ids:
        source_metadata_records = _load_env_metadata_records(
            env_id=env_id,
            metadata_root=SOURCE_METADATA_ROOT,
        )

        # Shared temporary folder where all workers drop per-episode files;
        # NOTE(review): output paths are hard-coded to a machine-specific
        # directory — confirm before running elsewhere.
        temp_folder = Path(f"/data/hongzefu/data_0226/temp_{env_id}_episodes")
        final_dataset_path = Path(f"/data/hongzefu/data_0226/record_dataset_{env_id}.h5")
        #final_dataset_path = Path(f"/data/hongzefu/dataset_generate/record_dataset_{env_id}.h5")

        # Run banner.
        print(f"\n{'='*80}")
        print(f"Environment: {env_id}")
        print(f"Episodes: {args.episodes}")
        print(f"Workers: {num_workers}")
        if len(gpu_ids) == 1:
            print(f"GPU mode: Single GPU ({gpu_ids[0]})")
        else:
            print(f"GPU mode: Multi GPU ({','.join(str(gpu) for gpu in gpu_ids)})")
        print(f"Temporary folder: {temp_folder}")
        print(f"Final dataset: {final_dataset_path}")
        print(f"{'='*80}\n")

        episode_records: List[Dict[str, Any]] = []

        if num_workers > 1:
            # 1. Split episodes into one chunk per worker.
            episode_chunks = _split_episode_indices(args.episodes, num_workers)

            if len(episode_chunks) <= 1:
                # Only one chunk produced: run it in-process on the first GPU.
                chunk = episode_chunks[0] if episode_chunks else []
                episode_records = run_env_dataset(
                    env_id,
                    chunk,
                    temp_folder,
                    args.save_video,
                    source_metadata_records,
                    gpu_ids[0],
                )
            else:
                worker_count = len(episode_chunks)
                print(
                    f"Running {env_id} with {worker_count} workers across {args.episodes} episodes..."
                )

                future_to_chunk = {}
                futures = []
                if len(gpu_ids) == 1:
                    print(
                        f"Assigning all {len(episode_chunks)} chunks to GPU {gpu_ids[0]} ({num_workers} workers)"
                    )
                else:
                    print(
                        f"Assigning {len(episode_chunks)} chunks across GPUs {','.join(str(gpu) for gpu in gpu_ids)}"
                    )

                # 2. Fan chunks out to worker processes, assigning GPUs
                # round-robin, then collect results as they complete.
                with ProcessPoolExecutor(max_workers=num_workers) as executor:
                    for chunk_idx, chunk in enumerate(episode_chunks):
                        assigned_gpu = gpu_ids[chunk_idx % len(gpu_ids)]
                        f = executor.submit(
                            run_env_dataset,
                            env_id,
                            chunk,
                            temp_folder,
                            args.save_video,
                            source_metadata_records,
                            assigned_gpu,
                        )
                        future_to_chunk[f] = (chunk, assigned_gpu)
                        futures.append(f)

                    for future in as_completed(futures):
                        chunk, assigned_gpu = future_to_chunk[future]
                        chunk_label = (chunk[0], chunk[-1]) if chunk else ("?", "?")
                        try:
                            records = future.result()
                            episode_records.extend(records)
                            print(
                                f"✓ Completed episodes {chunk_label[0]}-{chunk_label[1]} for {env_id} on GPU {assigned_gpu}"
                            )
                        except Exception as exc:
                            # A failed chunk is reported but does not stop the
                            # remaining chunks or the merge step.
                            print(
                                f"✗ Environment {env_id} failed on episodes "
                                f"{chunk_label[0]}-{chunk_label[1]} (GPU {assigned_gpu}) with error: {exc}"
                            )

            # 3. Merge all episode files into final dataset
            print(f"\nMerging all episodes into final dataset...")
            _merge_dataset_from_folder(
                env_id,
                temp_folder,
                final_dataset_path,
            )
        else:
            # Single worker mode: run every episode sequentially in-process.
            episode_records = run_env_dataset(
                env_id,
                episode_indices,
                temp_folder,
                args.save_video,
                source_metadata_records,
                gpu_ids[0],  # gpu_id
            )

            # Merge episodes into final dataset
            print(f"\nMerging all episodes into final dataset...")
            _merge_dataset_from_folder(
                env_id,
                temp_folder,
                final_dataset_path,
            )

        # 4. Save per-episode metadata next to the dataset file.
        metadata_path = final_dataset_path.with_name(
            f"{final_dataset_path.stem}_metadata.json"
        )
        _save_episode_metadata(episode_records, metadata_path, env_id)

        print(f"\n✓ Finished! Final dataset saved to: {final_dataset_path}\n")

    print("✓ All requested environments processed.")
875
+
876
+
877
+ if __name__ == "__main__":
878
+ main()