Spaces:

HongzeFu
/

RoboMME

Sleeping

App Files Community

HongzeFu committed on Mar 8

Commit

4ccc0e4

1 Parent(s): 8e0fa10

video v1

Browse files

Files changed (9) hide show

gradio-web/config.py +1 -3
gradio-web/gradio_callbacks.py +223 -228
gradio-web/process_session.py +7 -0
gradio-web/test/test_live_obs_refresh.py +82 -42
gradio-web/test/test_queue_session_limit_e2e.py +8 -10
gradio-web/test/test_ui_native_layout_contract.py +3 -1
gradio-web/test/test_ui_phase_machine_runtime_e2e.py +158 -104
gradio-web/test/test_ui_text_config.py +2 -1
gradio-web/ui_layout.py +23 -48

gradio-web/config.py CHANGED Viewed

@@ -2,10 +2,8 @@
 配置常量模块
 """
 # --- Configuration ---
-VIDEO_PLAYBACK_FPS = 20.0  # Frame rate for demonstration video playback
 USE_SEGMENTED_VIEW = False  # Set to True to use segmented view, False to use original image
-LIVE_OBS_REFRESH_HZ = 30.0  # Live observation refresh frequency in Hz
-KEYFRAME_DOWNSAMPLE_FACTOR = 1  # Keep 1 frame out of every N streamed frames
 # 主界面两列宽度比例 (Point Selection : Right Panel)
 POINT_SELECTION_SCALE = 1

 配置常量模块
 """
 # --- Configuration ---
+VIDEO_PLAYBACK_FPS = 20.0  # Frame rate for demonstration and execute video playback
 USE_SEGMENTED_VIEW = False  # Set to True to use segmented view, False to use original image
 # 主界面两列宽度比例 (Point Selection : Right Panel)
 POINT_SELECTION_SCALE = 1

gradio-web/gradio_callbacks.py CHANGED Viewed

@@ -4,15 +4,14 @@ Gradio回调函数模块
 """
 import logging
 import os
-import queue
 import re
 import threading
 import time
 from datetime import datetime
 import gradio as gr
 import numpy as np
-from PIL import Image
 from state_manager import (
     cleanup_session,
@@ -31,8 +30,6 @@ from image_utils import draw_marker, save_video, concatenate_frames_horizontally
 from user_manager import user_manager
 from config import (
     EXECUTE_LIMIT_OFFSET,
-    KEYFRAME_DOWNSAMPLE_FACTOR,
-    LIVE_OBS_REFRESH_HZ,
     UI_TEXT,
     USE_SEGMENTED_VIEW,
     get_live_obs_elem_classes,
@@ -43,10 +40,9 @@ from process_session import ScrewPlanFailureError
 from note_content import get_task_hint
-# --- live_obs refresh queue state ---
-# Each uid keeps its own FIFO queue and sampling cursor.
-_LIVE_OBS_REFRESH = {}
-_LIVE_OBS_REFRESH_LOCK = threading.Lock()
 LOGGER = logging.getLogger("robomme.callbacks")
@@ -86,8 +82,7 @@ def cleanup_user_session(uid):
     """Unified cleanup entry for gr.State TTL deletion and unload hooks."""
     if not uid:
         return
-    with _LIVE_OBS_REFRESH_LOCK:
-        _LIVE_OBS_REFRESH.pop(uid, None)
     cleanup_session(uid)
@@ -134,15 +129,6 @@ def _option_requires_coords(session, option_value) -> bool:
     return bool(raw_solve_options[option_idx].get("available"))
-def _should_enqueue_sample(sample_index: int) -> bool:
-    factor = max(1, int(KEYFRAME_DOWNSAMPLE_FACTOR))
-    return sample_index % factor == 0
-def _live_obs_refresh_interval_sec() -> float:
-    return 1.0 / max(float(LIVE_OBS_REFRESH_HZ), 1.0)
 def _uid_for_log(uid):
     if not uid:
         return "<none>"
@@ -150,6 +136,109 @@ def _uid_for_log(uid):
     return text if len(text) <= 12 else f"{text[:8]}..."
 def capitalize_first_letter(text: str) -> str:
     """确保字符串的第一个字母大写，其余字符保持不变"""
     if not text:
@@ -284,21 +373,9 @@ def on_demo_video_play(uid):
 def switch_to_execute_phase(uid):
-    """Disable controls and point clicking during execute playback."""
     if uid:
-        session = get_session(uid)
-        base_count = len(getattr(session, "base_frames", []) or []) if session else 0
-        LOGGER.debug(
-            "switch_to_execute_phase uid=%s base_frames=%s",
-            _uid_for_log(uid),
-            base_count,
-        )
-        with _LIVE_OBS_REFRESH_LOCK:
-            _LIVE_OBS_REFRESH[uid] = {
-                "frame_queue": queue.Queue(),
-                "last_base_count": base_count,
-                "sample_index": 0,
-            }
     return (
         gr.update(interactive=False),  # options_radio
         gr.update(interactive=False),  # exec_btn
@@ -313,8 +390,6 @@ def switch_to_action_phase(uid=None):
     """Switch display to action phase and restore control panel interactions."""
     if uid:
         LOGGER.debug("switch_to_action_phase uid=%s", _uid_for_log(uid))
-        with _LIVE_OBS_REFRESH_LOCK:
-            _LIVE_OBS_REFRESH.pop(uid, None)
     return (
         gr.update(interactive=True),  # options_radio
         gr.update(),  # exec_btn (keep execute_step result)
@@ -325,152 +400,29 @@ def switch_to_action_phase(uid=None):
     )
-def _get_live_obs_refresh_state(uid, base_count=0):
-    with _LIVE_OBS_REFRESH_LOCK:
-        if uid not in _LIVE_OBS_REFRESH:
-            _LIVE_OBS_REFRESH[uid] = {
-                "frame_queue": queue.Queue(),
-                "last_base_count": int(base_count),
-                "sample_index": 0,
-            }
-        return _LIVE_OBS_REFRESH[uid]
-def _enqueue_live_obs_frames(uid, base_frames):
-    """
-    Push newly appended base_frames into per-uid FIFO queue with configurable downsampling.
-    """
-    if not uid:
-        return 0
-    frames = base_frames or []
-    state = _get_live_obs_refresh_state(uid, base_count=len(frames))
-    frame_queue = state["frame_queue"]
-    current_count = len(frames)
-    last_count = int(state.get("last_base_count", 0))
-    # Session/task reset: history shrank.
-    if current_count < last_count:
-        with _LIVE_OBS_REFRESH_LOCK:
-            state["frame_queue"] = queue.Queue()
-            state["last_base_count"] = current_count
-            state["sample_index"] = 0
-        return 0
-    if current_count <= last_count:
-        return frame_queue.qsize()
-    new_frames = frames[last_count:current_count]
-    sample_index = int(state.get("sample_index", 0))
-    for frame in new_frames:
-        if _should_enqueue_sample(sample_index) and frame is not None:
-            frame_queue.put(frame)
-        sample_index += 1
-    with _LIVE_OBS_REFRESH_LOCK:
-        state["last_base_count"] = current_count
-        state["sample_index"] = sample_index
-    return frame_queue.qsize()
-def _wait_for_live_obs_queue_drain(uid, max_wait_sec=None, empty_grace_sec=0.2, poll_sec=0.05):
-    """
-    Wait for timer-driven live_obs refresh to consume queued frames before phase switch.
-    """
-    if not uid:
-        return
-    with _LIVE_OBS_REFRESH_LOCK:
-        state0 = _LIVE_OBS_REFRESH.get(uid)
-        queue0 = state0.get("frame_queue") if state0 else None
-        initial_qsize = int(queue0.qsize()) if queue0 is not None else 0
-    if max_wait_sec is None:
-        # Timer-driven playback + small buffer, capped to keep UI responsive.
-        max_wait_sec = min(30.0, max(2.0, initial_qsize * (_live_obs_refresh_interval_sec() + 0.02) + 1.0))
-    start = time.time()
-    empty_since = None
-    while True:
-        if (time.time() - start) >= max_wait_sec:
-            break
-        with _LIVE_OBS_REFRESH_LOCK:
-            state = _LIVE_OBS_REFRESH.get(uid)
-            frame_queue = state.get("frame_queue") if state else None
-        if frame_queue is None:
-            break
-        if frame_queue.qsize() > 0:
-            empty_since = None
-        else:
-            if empty_since is None:
-                empty_since = time.time()
-            elif (time.time() - empty_since) >= empty_grace_sec:
-                break
-        time.sleep(poll_sec)
-def _prepare_refresh_frame(frame):
-    """Normalize cached frame to an RGB uint8 PIL image for gr.Image."""
-    if frame is None:
-        return None
-    frame_arr = np.asarray(frame)
-    if frame_arr.dtype != np.uint8:
-        max_val = float(np.max(frame_arr)) if frame_arr.size else 0.0
-        if max_val <= 1.0:
-            frame_arr = (frame_arr * 255.0).clip(0, 255).astype(np.uint8)
-        else:
-            frame_arr = frame_arr.clip(0, 255).astype(np.uint8)
-    if frame_arr.ndim == 2:
-        frame_arr = np.stack([frame_arr] * 3, axis=-1)
-    elif frame_arr.ndim == 3 and frame_arr.shape[2] == 4:
-        frame_arr = frame_arr[:, :, :3]
-    return Image.fromarray(frame_arr)
-def refresh_live_obs(uid, ui_phase):
-    """
-    Poll latest cached frame during execute phase.
-    Updates live_obs using the configured gr.Timer interval.
-    """
-    if ui_phase != "execution_playback":
-        return gr.update()
-    session = get_session(uid)
-    if not session:
-        return gr.update()
-    base_frames = getattr(session, "base_frames", None) or []
-    if not base_frames:
-        return gr.update()
-    _enqueue_live_obs_frames(uid, base_frames)
-    state = _get_live_obs_refresh_state(uid, base_count=len(base_frames))
-    frame_queue = state["frame_queue"]
-    if frame_queue.empty():
-        return gr.update()
-    latest = frame_queue.get()
-    env_id = getattr(session, "env_id", None)
-    stitched = concatenate_frames_horizontally([latest], env_id=env_id)
-    if stitched:
-        latest = stitched[-1]
-    img = _prepare_refresh_frame(latest)
-    if img is None:
-        return gr.update()
-    return _live_obs_update(value=img, interactive=False)
-def on_video_end_transition(uid):
-    """Called when demo video finishes. Transition from video to action phase."""
     return (
         gr.update(visible=False),  # video_phase_group
         gr.update(visible=True),   # action_phase_group
         gr.update(visible=True),   # control_panel_group
-        _action_selection_log(),
         gr.update(visible=False, interactive=False),  # watch_demo_video_btn
     )
 def _task_load_failed_response(uid, message):
     LOGGER.warning("task_load_failed uid=%s message=%s", _uid_for_log(uid), message)
     return (
         uid,
         gr.update(visible=True),  # main_interface
@@ -479,7 +431,7 @@ def _task_load_failed_response(uid, message):
         gr.update(choices=[], value=None),  # options_radio
         "",  # goal_box
         _ui_text("coords", "not_needed"),  # coords_box
-        gr.update(value=None, visible=False),  # video_display
         gr.update(visible=False, interactive=False),  # watch_demo_video_btn
         "",  # task_info_box
         "",  # progress_info_box
@@ -525,8 +477,7 @@ def _load_status_task(uid, status):
     LOGGER.debug("loading episode env=%s episode=%s uid=%s", env_id, ep_num, _uid_for_log(uid))
-    with _LIVE_OBS_REFRESH_LOCK:
-        _LIVE_OBS_REFRESH.pop(uid, None)
     reset_play_button_clicked(uid)
     reset_execute_count(uid, env_id, int(ep_num))
@@ -555,7 +506,7 @@ def _load_status_task(uid, status):
             gr.update(choices=[], value=None),  # options_radio
             "",  # goal_box
             _ui_text("coords", "not_needed"),  # coords_box
-            gr.update(value=None, visible=False),  # video_display
             gr.update(visible=False, interactive=False),  # watch_demo_video_btn
             f"{actual_env_id} (Episode {ep_num})",  # task_info_box
             progress_text,  # progress_info_box
@@ -574,19 +525,7 @@ def _load_status_task(uid, status):
     else:
         goal_text = capitalize_first_letter(session.language_goal) if session.language_goal else ""
-    options = session.available_options
-    radio_choices = []
-    for opt_label, opt_idx in options:
-        opt_label = _ui_option_label(session, opt_label, opt_idx)
-        if 0 <= opt_idx < len(session.raw_solve_options):
-            opt = session.raw_solve_options[opt_idx]
-            if opt.get("available"):
-                opt_label_with_hint = f"{opt_label}{_ui_text('actions', 'point_required_suffix')}"
-            else:
-                opt_label_with_hint = opt_label
-        else:
-            opt_label_with_hint = opt_label
-        radio_choices.append((opt_label_with_hint, opt_idx))
     LOGGER.debug(
         "options prepared uid=%s env=%s count=%s",
         _uid_for_log(uid),
@@ -634,7 +573,7 @@ def _load_status_task(uid, status):
             gr.update(choices=radio_choices, value=None),  # options_radio
             goal_text,  # goal_box
             _ui_text("coords", "not_needed"),  # coords_box
-            gr.update(value=demo_video_path, visible=True),  # video_display
             gr.update(visible=True, interactive=True),  # watch_demo_video_btn
             f"{actual_env_id} (Episode {ep_num})",  # task_info_box
             progress_text,  # progress_info_box
@@ -658,7 +597,7 @@ def _load_status_task(uid, status):
         gr.update(choices=radio_choices, value=None),  # options_radio
         goal_text,  # goal_box
         _ui_text("coords", "not_needed"),  # coords_box
-        gr.update(value=None, visible=False),  # video_display (no video)
         gr.update(visible=False, interactive=False),  # watch_demo_video_btn
         f"{actual_env_id} (Episode {ep_num})",  # task_info_box
         progress_text,  # progress_info_box
@@ -1061,18 +1000,60 @@ def execute_step(uid, option_idx, coords_str):
         option_idx,
         coords_str,
     )
     session = get_session(uid)
     if not session:
         LOGGER.error("execute_step missing session uid=%s", _uid_for_log(uid))
-        return (
-            _live_obs_update(value=None, interactive=False),
-            format_log_markdown(_session_error_text()),
-            gr.update(),
-            gr.update(),
-            gr.update(interactive=False),
-            gr.update(interactive=False),
         )
     # 检查 execute 次数限制（在执行前检查，如果达到限制则模拟失败状态）
     execute_limit_reached = False
     if uid and session.env_id is not None and session.episode_idx is not None:
@@ -1094,22 +1075,24 @@ def execute_step(uid, option_idx, coords_str):
                 max_execute,
                 execute_limit_reached,
             )
-    # Ensure at least one cached frame exists for timer-based refresh.
     if not session.base_frames:
         LOGGER.debug("execute_step uid=%s base_frames empty; triggering update_observation", _uid_for_log(uid))
         session.update_observation(use_segmentation=USE_SEGMENTED_VIEW)
     option_idx = _parse_option_idx(option_idx)
     if option_idx is None:
         LOGGER.debug("execute_step uid=%s aborted: option_idx is None", _uid_for_log(uid))
-        return (
-            _live_obs_update(value=session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW), interactive=False),
-            format_log_markdown(_ui_text("log", "execute_missing_action")),
-            gr.update(),
-            gr.update(),
-            gr.update(interactive=False),
-            gr.update(interactive=True),
         )
     needs_coords = _option_requires_coords(session, option_idx)
@@ -1125,7 +1108,15 @@ def execute_step(uid, option_idx, coords_str):
             )
             current_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
             error_msg = _ui_text("coords", "select_point_before_execute")
-            return _live_obs_update(value=current_img, interactive=False), format_log_markdown(error_msg), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
     # Parse coords
     click_coords = None
@@ -1212,14 +1203,6 @@ def execute_step(uid, option_idx, coords_str):
                 new_count,
             )
-    # Execute frames are produced in batch when execute_action returns from worker process.
-    # Enqueue them now, then wait briefly for the configured timer to drain FIFO playback.
-    _enqueue_live_obs_frames(uid, getattr(session, "base_frames", None))
-    _wait_for_live_obs_queue_drain(uid)
-    LOGGER.debug("execute_step playback drain complete uid=%s", _uid_for_log(uid))
-    # 注意：执行阶段画面由 live_obs 的配置化轮询间隔刷新。
     progress_update = gr.update()  # 默认不更新 progress
     task_update = gr.update()
@@ -1263,28 +1246,40 @@ def execute_step(uid, option_idx, coords_str):
     # 根据视图模式重新获取图片
     img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
     restart_episode_update = gr.update(interactive=True)
     next_task_update = gr.update(interactive=True)
     exec_btn_update = gr.update(interactive=False) if done else gr.update(interactive=True)
     # 格式化日志消息为 HTML 格式（支持颜色显示）
     formatted_status = format_log_markdown(status)
     LOGGER.debug(
-        "execute_step done uid=%s env=%s ep=%s done=%s exec_btn_interactive=%s",
         _uid_for_log(uid),
         getattr(session, "env_id", None),
         getattr(session, "episode_idx", None),
         done,
         not done,
     )
-    return (
-        _live_obs_update(value=img, interactive=False),
-        formatted_status,
-        task_update,
-        progress_update,
-        restart_episode_update,
-        next_task_update,
-        exec_btn_update,
     )

 """
 import logging
 import os
 import re
 import threading
 import time
 from datetime import datetime
+from pathlib import Path
 import gradio as gr
 import numpy as np
 from state_manager import (
     cleanup_session,
 from user_manager import user_manager
 from config import (
     EXECUTE_LIMIT_OFFSET,
     UI_TEXT,
     USE_SEGMENTED_VIEW,
     get_live_obs_elem_classes,
 from note_content import get_task_hint
+# --- execute video temp files ---
+_EXECUTION_VIDEO_PATHS = {}
+_EXECUTION_VIDEO_LOCK = threading.Lock()
 LOGGER = logging.getLogger("robomme.callbacks")
     """Unified cleanup entry for gr.State TTL deletion and unload hooks."""
     if not uid:
         return
+    _clear_execution_video_path(uid)
     cleanup_session(uid)
     return bool(raw_solve_options[option_idx].get("available"))
 def _uid_for_log(uid):
     if not uid:
         return "<none>"
     return text if len(text) <= 12 else f"{text[:8]}..."
+def _delete_temp_video(path):
+    if not path:
+        return
+    try:
+        Path(path).unlink(missing_ok=True)
+    except Exception:
+        LOGGER.warning("failed to delete temp video: %s", path, exc_info=True)
+def _clear_execution_video_path(uid):
+    if not uid:
+        return
+    with _EXECUTION_VIDEO_LOCK:
+        old_path = _EXECUTION_VIDEO_PATHS.pop(uid, None)
+    _delete_temp_video(old_path)
+def _set_execution_video_path(uid, path):
+    if not uid:
+        return
+    with _EXECUTION_VIDEO_LOCK:
+        old_path = _EXECUTION_VIDEO_PATHS.get(uid)
+        _EXECUTION_VIDEO_PATHS[uid] = path
+    if old_path and old_path != path:
+        _delete_temp_video(old_path)
+def _build_radio_choices(session):
+    radio_choices = []
+    options = getattr(session, "available_options", None) or []
+    raw_solve_options = getattr(session, "raw_solve_options", None) or []
+    for opt_label, opt_idx in options:
+        ui_label = _ui_option_label(session, opt_label, opt_idx)
+        if 0 <= opt_idx < len(raw_solve_options) and raw_solve_options[opt_idx].get("available"):
+            ui_label = f"{ui_label}{_ui_text('actions', 'point_required_suffix')}"
+        radio_choices.append((ui_label, opt_idx))
+    return radio_choices
+def _coerce_video_source_frames(frames):
+    if not isinstance(frames, list):
+        return []
+    valid = []
+    for frame in frames:
+        if frame is None:
+            continue
+        frame_arr = np.asarray(frame)
+        if frame_arr.ndim not in {2, 3}:
+            continue
+        if frame_arr.dtype.kind in {"U", "S", "O"}:
+            continue
+        valid.append(frame_arr)
+    return valid
+def _fallback_execution_frames(session):
+    base_frames = getattr(session, "base_frames", None) or []
+    if base_frames:
+        return [np.asarray(base_frames[-1])]
+    try:
+        pil_image = session.get_pil_image(use_segmented=False)
+    except Exception:
+        return []
+    if pil_image is None:
+        return []
+    frame_arr = np.asarray(pil_image)
+    if frame_arr.ndim not in {2, 3}:
+        return []
+    if frame_arr.dtype.kind in {"U", "S", "O"}:
+        return []
+    return [frame_arr]
+def _build_execution_video_update(uid, session):
+    raw_frames = _coerce_video_source_frames(getattr(session, "last_execution_frames", None))
+    if not raw_frames:
+        raw_frames = _fallback_execution_frames(session)
+    stitched_frames = concatenate_frames_horizontally(
+        raw_frames,
+        env_id=getattr(session, "env_id", None),
+    )
+    if not stitched_frames:
+        _clear_execution_video_path(uid)
+        return gr.update(value=None, visible=False)
+    suffix = f"execute_{int(time.time() * 1000)}"
+    video_path = save_video(stitched_frames, suffix=suffix)
+    if not video_path:
+        _clear_execution_video_path(uid)
+        return gr.update(value=None, visible=False)
+    if not (os.path.exists(video_path) and os.path.getsize(video_path) > 0):
+        _clear_execution_video_path(uid)
+        return gr.update(value=None, visible=False)
+    _set_execution_video_path(uid, video_path)
+    return gr.update(
+        value=video_path,
+        visible=True,
+        autoplay=True,
+        playback_position=0,
+    )
 def capitalize_first_letter(text: str) -> str:
     """确保字符串的第一个字母大写，其余字符保持不变"""
     if not text:
 def switch_to_execute_phase(uid):
+    """Disable controls and point clicking while execute work is running."""
     if uid:
+        LOGGER.debug("switch_to_execute_phase uid=%s", _uid_for_log(uid))
     return (
         gr.update(interactive=False),  # options_radio
         gr.update(interactive=False),  # exec_btn
     """Switch display to action phase and restore control panel interactions."""
     if uid:
         LOGGER.debug("switch_to_action_phase uid=%s", _uid_for_log(uid))
     return (
         gr.update(interactive=True),  # options_radio
         gr.update(),  # exec_btn (keep execute_step result)
     )
+def on_video_end_transition(uid, ui_phase=None):
+    """Transition from video phase back to the action phase."""
+    LOGGER.debug(
+        "on_video_end_transition uid=%s ui_phase=%s",
+        _uid_for_log(uid),
+        ui_phase,
+    )
+    log_update = gr.update()
+    if ui_phase == "demo_video" or ui_phase is None:
+        log_update = _action_selection_log()
     return (
         gr.update(visible=False),  # video_phase_group
         gr.update(visible=True),   # action_phase_group
         gr.update(visible=True),   # control_panel_group
+        log_update,  # log_output
         gr.update(visible=False, interactive=False),  # watch_demo_video_btn
+        "action_point",  # ui_phase_state
     )
 def _task_load_failed_response(uid, message):
     LOGGER.warning("task_load_failed uid=%s message=%s", _uid_for_log(uid), message)
+    _clear_execution_video_path(uid)
     return (
         uid,
         gr.update(visible=True),  # main_interface
         gr.update(choices=[], value=None),  # options_radio
         "",  # goal_box
         _ui_text("coords", "not_needed"),  # coords_box
+        gr.update(value=None, visible=False, autoplay=False, playback_position=0),  # video_display
         gr.update(visible=False, interactive=False),  # watch_demo_video_btn
         "",  # task_info_box
         "",  # progress_info_box
     LOGGER.debug("loading episode env=%s episode=%s uid=%s", env_id, ep_num, _uid_for_log(uid))
+    _clear_execution_video_path(uid)
     reset_play_button_clicked(uid)
     reset_execute_count(uid, env_id, int(ep_num))
             gr.update(choices=[], value=None),  # options_radio
             "",  # goal_box
             _ui_text("coords", "not_needed"),  # coords_box
+            gr.update(value=None, visible=False, autoplay=False, playback_position=0),  # video_display
             gr.update(visible=False, interactive=False),  # watch_demo_video_btn
             f"{actual_env_id} (Episode {ep_num})",  # task_info_box
             progress_text,  # progress_info_box
     else:
         goal_text = capitalize_first_letter(session.language_goal) if session.language_goal else ""
+    radio_choices = _build_radio_choices(session)
     LOGGER.debug(
         "options prepared uid=%s env=%s count=%s",
         _uid_for_log(uid),
             gr.update(choices=radio_choices, value=None),  # options_radio
             goal_text,  # goal_box
             _ui_text("coords", "not_needed"),  # coords_box
+            gr.update(value=demo_video_path, visible=True, autoplay=False, playback_position=0),  # video_display
             gr.update(visible=True, interactive=True),  # watch_demo_video_btn
             f"{actual_env_id} (Episode {ep_num})",  # task_info_box
             progress_text,  # progress_info_box
         gr.update(choices=radio_choices, value=None),  # options_radio
         goal_text,  # goal_box
         _ui_text("coords", "not_needed"),  # coords_box
+        gr.update(value=None, visible=False, autoplay=False, playback_position=0),  # video_display (no video)
         gr.update(visible=False, interactive=False),  # watch_demo_video_btn
         f"{actual_env_id} (Episode {ep_num})",  # task_info_box
         progress_text,  # progress_info_box
         option_idx,
         coords_str,
     )
+    def _response(
+        *,
+        img_update,
+        log_update,
+        task_update=gr.update(),
+        progress_update=gr.update(),
+        restart_update=gr.update(interactive=True),
+        next_update=gr.update(interactive=True),
+        exec_update=gr.update(interactive=True),
+        video_update=None,
+        options_update=gr.update(interactive=True),
+        coords_update=None,
+        reference_update=gr.update(interactive=True),
+        show_execution_video=False,
+        ui_phase="action_point",
+    ):
+        if video_update is None:
+            video_update = gr.update(value=None, visible=False, autoplay=False, playback_position=0)
+        if coords_update is None:
+            coords_update = _ui_text("coords", "not_needed")
+        return (
+            img_update,
+            log_update,
+            task_update,
+            progress_update,
+            restart_update,
+            next_update,
+            exec_update,
+            video_update,
+            gr.update(visible=False, interactive=False),  # watch_demo_video_btn
+            gr.update(visible=show_execution_video),  # video_phase_group
+            gr.update(visible=not show_execution_video),  # action_phase_group
+            gr.update(visible=not show_execution_video),  # control_panel_group
+            options_update,
+            coords_update,
+            reference_update,
+            ui_phase,
+        )
     session = get_session(uid)
     if not session:
         LOGGER.error("execute_step missing session uid=%s", _uid_for_log(uid))
+        return _response(
+            img_update=_live_obs_update(value=None, interactive=False),
+            log_update=format_log_markdown(_session_error_text()),
+            restart_update=gr.update(interactive=False),
+            next_update=gr.update(interactive=False),
+            exec_update=gr.update(interactive=False),
+            options_update=gr.update(interactive=False),
+            reference_update=gr.update(interactive=False),
+            show_execution_video=False,
         )
     # 检查 execute 次数限制（在执行前检查，如果达到限制则模拟失败状态）
     execute_limit_reached = False
     if uid and session.env_id is not None and session.episode_idx is not None:
                 max_execute,
                 execute_limit_reached,
             )
+    # Ensure at least one cached frame exists for fallback clip generation.
     if not session.base_frames:
         LOGGER.debug("execute_step uid=%s base_frames empty; triggering update_observation", _uid_for_log(uid))
         session.update_observation(use_segmentation=USE_SEGMENTED_VIEW)
+    if hasattr(session, "last_execution_frames"):
+        session.last_execution_frames = []
     option_idx = _parse_option_idx(option_idx)
     if option_idx is None:
         LOGGER.debug("execute_step uid=%s aborted: option_idx is None", _uid_for_log(uid))
+        return _response(
+            img_update=_live_obs_update(value=session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW), interactive=False),
+            log_update=format_log_markdown(_ui_text("log", "execute_missing_action")),
+            exec_update=gr.update(interactive=True),
+            options_update=gr.update(choices=_build_radio_choices(session), value=None, interactive=True),
+            reference_update=gr.update(interactive=True),
+            show_execution_video=False,
         )
     needs_coords = _option_requires_coords(session, option_idx)
             )
             current_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
             error_msg = _ui_text("coords", "select_point_before_execute")
+            return _response(
+                img_update=_live_obs_update(value=current_img, interactive=False),
+                log_update=format_log_markdown(error_msg),
+                exec_update=gr.update(interactive=True),
+                options_update=gr.update(choices=_build_radio_choices(session), value=None, interactive=True),
+                coords_update=coords_str,
+                reference_update=gr.update(interactive=True),
+                show_execution_video=False,
+            )
     # Parse coords
     click_coords = None
                 new_count,
             )
     progress_update = gr.update()  # 默认不更新 progress
     task_update = gr.update()
     # 根据视图模式重新获取图片
     img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
+    video_update = _build_execution_video_update(uid, session)
+    show_execution_video = video_update.get("visible") is True
+    radio_choices = _build_radio_choices(session)
     restart_episode_update = gr.update(interactive=True)
     next_task_update = gr.update(interactive=True)
     exec_btn_update = gr.update(interactive=False) if done else gr.update(interactive=True)
+    options_update = gr.update(choices=radio_choices, value=None, interactive=True)
+    coords_update = _ui_text("coords", "not_needed")
+    reference_update = gr.update(interactive=True)
     # 格式化日志消息为 HTML 格式（支持颜色显示）
     formatted_status = format_log_markdown(status)
     LOGGER.debug(
+        "execute_step done uid=%s env=%s ep=%s done=%s exec_btn_interactive=%s show_execution_video=%s",
         _uid_for_log(uid),
         getattr(session, "env_id", None),
         getattr(session, "episode_idx", None),
         done,
         not done,
+        show_execution_video,
     )
+    return _response(
+        img_update=_live_obs_update(value=img, interactive=False),
+        log_update=formatted_status,
+        task_update=task_update,
+        progress_update=progress_update,
+        restart_update=restart_episode_update,
+        next_update=next_task_update,
+        exec_update=exec_btn_update,
+        video_update=video_update,
+        options_update=options_update,
+        coords_update=coords_update,
+        reference_update=reference_update,
+        show_execution_video=show_execution_video,
+        ui_phase="execution_video" if show_execution_video else "action_point",
     )

gradio-web/process_session.py CHANGED Viewed

@@ -180,6 +180,7 @@ def session_worker_loop(cmd_queue, result_queue, stream_queue, dataset_root, gui
                     "difficulty": session.difficulty,
                     "seed": session.seed,
                     "demonstration_frames": session.demonstration_frames,
                     "base_frames": session.base_frames,  # 加载时完整同步
                     "wrist_frames": session.wrist_frames,  # 加载时完整同步
                     "available_options": session.available_options,
@@ -192,6 +193,7 @@ def session_worker_loop(cmd_queue, result_queue, stream_queue, dataset_root, gui
             elif cmd == CMD_EXECUTE_ACTION:
                 # 执行动作（重计算任务）
                 try:
                     res = session.execute_action(*args, **kwargs)
                     LOGGER.info(
@@ -240,8 +242,11 @@ def session_worker_loop(cmd_queue, result_queue, stream_queue, dataset_root, gui
                 if session.env:
                     is_demonstration = getattr(session.env, 'current_task_demonstration', False)
                 # 构建状态更新（只更新选项和分割视图，帧通过流队列同步）
                 state_update = {
                     "available_options": session.available_options,
                     "raw_solve_options": _sanitize_options(session.raw_solve_options),
                     "seg_vis": session.seg_vis,
@@ -353,6 +358,7 @@ class ProcessSessionProxy:
         self.difficulty = None
         self.seed = None
         self.demonstration_frames = []
         self.base_frames = []  # 由后台同步线程持续更新
         self.wrist_frames = []  # 由后台同步线程持续更新
         self.available_options = []
@@ -482,6 +488,7 @@ class ProcessSessionProxy:
         Returns:
             tuple: (PIL.Image, str, bool) 图像、状态消息、是否完成
         """
         return self._send_cmd(CMD_EXECUTE_ACTION, action_idx, click_coords)
     def get_pil_image(self, use_segmented=True):

                     "difficulty": session.difficulty,
                     "seed": session.seed,
                     "demonstration_frames": session.demonstration_frames,
+                    "last_execution_frames": [],
                     "base_frames": session.base_frames,  # 加载时完整同步
                     "wrist_frames": session.wrist_frames,  # 加载时完整同步
                     "available_options": session.available_options,
             elif cmd == CMD_EXECUTE_ACTION:
                 # 执行动作（重计算任务）
+                execute_base_start = len(session.base_frames)
                 try:
                     res = session.execute_action(*args, **kwargs)
                     LOGGER.info(
                 if session.env:
                     is_demonstration = getattr(session.env, 'current_task_demonstration', False)
+                execution_frames = session.base_frames[execute_base_start:]
                 # 构建状态更新（只更新选项和分割视图，帧通过流队列同步）
                 state_update = {
+                    "last_execution_frames": execution_frames,
                     "available_options": session.available_options,
                     "raw_solve_options": _sanitize_options(session.raw_solve_options),
                     "seg_vis": session.seg_vis,
         self.difficulty = None
         self.seed = None
         self.demonstration_frames = []
+        self.last_execution_frames = []
         self.base_frames = []  # 由后台同步线程持续更新
         self.wrist_frames = []  # 由后台同步线程持续更新
         self.available_options = []
         Returns:
             tuple: (PIL.Image, str, bool) 图像、状态消息、是否完成
         """
+        self.last_execution_frames = []
         return self._send_cmd(CMD_EXECUTE_ACTION, action_idx, click_coords)
     def get_pil_image(self, use_segmented=True):

gradio-web/test/test_live_obs_refresh.py CHANGED Viewed

@@ -1,64 +1,104 @@
 from __future__ import annotations
 import numpy as np
-from PIL import Image
 class _FakeSession:
-    def __init__(self, frames, env_id="BinFill"):
-        self.base_frames = frames
-        self.env_id = env_id
-def test_refresh_live_obs_skips_when_not_execution_phase(monkeypatch, reload_module):
     callbacks = reload_module("gradio_callbacks")
-    monkeypatch.setattr(callbacks, "get_session", lambda uid: _FakeSession([]))
-    update = callbacks.refresh_live_obs("uid-1", "action_point")
-    assert update.get("__type__") == "update"
-    assert "value" not in update
-def test_refresh_live_obs_updates_image_from_latest_frame(monkeypatch, reload_module):
-    config = reload_module("config")
-    callbacks = reload_module("gradio_callbacks")
-    frame0 = np.zeros((8, 8, 3), dtype=np.uint8)
-    frame1 = np.full((8, 8, 3), 11, dtype=np.uint8)
-    frame2 = np.full((8, 8, 3), 22, dtype=np.uint8)
-    frame3 = np.full((8, 8, 3), 33, dtype=np.uint8)
-    frame4 = np.full((8, 8, 3), 44, dtype=np.uint8)
-    session = _FakeSession([frame0])
     monkeypatch.setattr(callbacks, "get_session", lambda uid: session)
-    monkeypatch.setattr(callbacks, "KEYFRAME_DOWNSAMPLE_FACTOR", 2)
-    # Reset queue state at execute start (cursor anchored at current base_frames length).
-    callbacks.switch_to_execute_phase("uid-2")
-    session.base_frames.extend([frame1, frame2, frame3, frame4])
-    # Downsample x2 + FIFO => first frame1, then frame3.
-    update1 = callbacks.refresh_live_obs("uid-2", "execution_playback")
-    update2 = callbacks.refresh_live_obs("uid-2", "execution_playback")
-    update3 = callbacks.refresh_live_obs("uid-2", "execution_playback")
-    assert update1.get("__type__") == "update"
-    assert update1.get("interactive") is False
-    assert update1.get("elem_classes") == config.get_live_obs_elem_classes()
-    assert isinstance(update1.get("value"), Image.Image)
-    assert update1["value"].getpixel((0, 0)) == (11, 11, 11)
-    assert update2.get("__type__") == "update"
-    assert update2.get("interactive") is False
-    assert update2.get("elem_classes") == config.get_live_obs_elem_classes()
-    assert isinstance(update2.get("value"), Image.Image)
-    assert update2["value"].getpixel((0, 0)) == (33, 33, 33)
-    # Queue drained, so no further value update.
-    assert update3.get("__type__") == "update"
-    assert "value" not in update3
-def test_switch_phase_keeps_live_obs_visible_and_toggles_interactive(reload_module):
     config = reload_module("config")
     callbacks = reload_module("gradio_callbacks")

 from __future__ import annotations
 import numpy as np
 class _FakeSession:
+    def __init__(self):
+        self.env_id = "BinFill"
+        self.episode_idx = 1
+        self.raw_solve_options = [{"available": False}]
+        self.available_options = [("pick", 0)]
+        self.base_frames = []
+        self.last_execution_frames = []
+        self.non_demonstration_task_length = None
+        self.difficulty = "easy"
+        self.language_goal = "goal"
+        self.seed = 123
+    def get_pil_image(self, use_segmented=False):
+        _ = use_segmented
+        return "IMG"
+    def update_observation(self, use_segmentation=False):
+        _ = use_segmentation
+        return None
+def test_execute_step_builds_video_from_last_execution_frames(monkeypatch, reload_module):
     callbacks = reload_module("gradio_callbacks")
+    frame1 = np.full((8, 8, 3), 11, dtype=np.uint8)
+    frame2 = np.full((8, 8, 3), 22, dtype=np.uint8)
+    session = _FakeSession()
+    session.base_frames = [frame2]
+    def _execute_action(_option_idx, _coords):
+        session.last_execution_frames = [frame1, frame2]
+        return "IMG", "Executing: pick", False
+    session.execute_action = _execute_action
+    captured = {}
     monkeypatch.setattr(callbacks, "get_session", lambda uid: session)
+    monkeypatch.setattr(callbacks, "increment_execute_count", lambda uid, env_id, episode_idx: 1)
+    monkeypatch.setattr(callbacks, "concatenate_frames_horizontally", lambda frames, env_id=None: list(frames))
+    def _save_video(frames, suffix=""):
+        captured["payload"] = (list(frames), suffix)
+        return "/tmp/exec.mp4"
+    monkeypatch.setattr(callbacks, "save_video", _save_video)
+    monkeypatch.setattr(callbacks.os.path, "exists", lambda path: True)
+    monkeypatch.setattr(callbacks.os.path, "getsize", lambda path: 10)
+    result = callbacks.execute_step("uid-1", 0, callbacks.UI_TEXT["coords"]["not_needed"])
+    saved_frames, suffix = captured["payload"]
+    assert [int(frame[0, 0, 0]) for frame in saved_frames] == [11, 22]
+    assert suffix.startswith("execute_")
+    assert result[7]["visible"] is True
+    assert result[7]["autoplay"] is True
+    assert result[9]["visible"] is True
+    assert result[10]["visible"] is False
+    assert result[11]["visible"] is False
+    assert result[12]["value"] is None
+    assert result[15] == "execution_video"
+def test_execute_step_falls_back_to_single_frame_clip_when_no_new_frames(monkeypatch, reload_module):
+    callbacks = reload_module("gradio_callbacks")
+    frame = np.full((8, 8, 3), 33, dtype=np.uint8)
+    session = _FakeSession()
+    session.base_frames = [frame]
+    def _execute_action(_option_idx, _coords):
+        session.last_execution_frames = []
+        return "IMG", "Executing: pick", False
+    session.execute_action = _execute_action
+    captured = {}
+    monkeypatch.setattr(callbacks, "get_session", lambda uid: session)
+    monkeypatch.setattr(callbacks, "increment_execute_count", lambda uid, env_id, episode_idx: 1)
+    monkeypatch.setattr(callbacks, "concatenate_frames_horizontally", lambda frames, env_id=None: list(frames))
+    def _save_video(frames, suffix=""):
+        captured["frames"] = list(frames)
+        return "/tmp/exec-single.mp4"
+    monkeypatch.setattr(callbacks, "save_video", _save_video)
+    monkeypatch.setattr(callbacks.os.path, "exists", lambda path: True)
+    monkeypatch.setattr(callbacks.os.path, "getsize", lambda path: 10)
+    result = callbacks.execute_step("uid-1", 0, callbacks.UI_TEXT["coords"]["not_needed"])
+    assert len(captured["frames"]) == 1
+    assert int(captured["frames"][0][0, 0, 0]) == 33
+    assert result[7]["visible"] is True
+    assert result[15] == "execution_video"
+def test_switch_phase_toggles_live_obs_interactive_without_refresh_queue(reload_module):
     config = reload_module("config")
     callbacks = reload_module("gradio_callbacks")

gradio-web/test/test_queue_session_limit_e2e.py CHANGED Viewed

@@ -435,23 +435,21 @@ def test_execute_does_not_use_episode_loading_copy(monkeypatch):
             gr.update(interactive=True),
             gr.update(interactive=True),
             gr.update(interactive=True),
-        )
-    def fake_switch_to_action_phase(uid=None):
-        return (
-            gr.update(interactive=True),
-            gr.update(),
-            gr.update(),
-            gr.update(),
-            gr.update(interactive=True),
             gr.update(interactive=True),
         )
     monkeypatch.setattr(ui_layout, "init_app", fake_init_app)
     monkeypatch.setattr(ui_layout, "precheck_execute_inputs", fake_precheck_execute_inputs)
     monkeypatch.setattr(ui_layout, "switch_to_execute_phase", fake_switch_to_execute_phase)
     monkeypatch.setattr(ui_layout, "execute_step", fake_execute_step)
-    monkeypatch.setattr(ui_layout, "switch_to_action_phase", fake_switch_to_action_phase)
     demo = ui_layout.create_ui_blocks()
     root_url, demo, server, thread = _mount_demo(demo)

             gr.update(interactive=True),
             gr.update(interactive=True),
             gr.update(interactive=True),
+            gr.update(value=None, visible=False, autoplay=False, playback_position=0),
+            gr.update(visible=False, interactive=False),
+            gr.update(visible=False),
+            gr.update(visible=True),
+            gr.update(visible=True),
+            gr.update(choices=[("pick", 0)], value=None, interactive=True),
+            "No need for coordinates",
             gr.update(interactive=True),
+            "action_point",
         )
     monkeypatch.setattr(ui_layout, "init_app", fake_init_app)
     monkeypatch.setattr(ui_layout, "precheck_execute_inputs", fake_precheck_execute_inputs)
     monkeypatch.setattr(ui_layout, "switch_to_execute_phase", fake_switch_to_execute_phase)
     monkeypatch.setattr(ui_layout, "execute_step", fake_execute_step)
     demo = ui_layout.create_ui_blocks()
     root_url, demo, server, thread = _mount_demo(demo)

gradio-web/test/test_ui_native_layout_contract.py CHANGED Viewed

@@ -184,12 +184,14 @@ def test_native_ui_config_contains_phase_machine_and_precheck_chain(reload_modul
             if comp.get("props", {}).get("elem_id") == "demo_video"
         )
         assert demo_video_comp.get("props", {}).get("autoplay") is False
         api_names = [dep.get("api_name") for dep in cfg.get("dependencies", [])]
         assert "on_demo_video_play" in api_names
         assert "precheck_execute_inputs" in api_names
         assert "switch_to_execute_phase" in api_names
         assert "execute_step" in api_names
-        assert "switch_to_action_phase" in api_names
     finally:
         demo.close()

             if comp.get("props", {}).get("elem_id") == "demo_video"
         )
         assert demo_video_comp.get("props", {}).get("autoplay") is False
+        component_types = [comp.get("type") for comp in cfg.get("components", [])]
+        assert "timer" not in component_types
         api_names = [dep.get("api_name") for dep in cfg.get("dependencies", [])]
         assert "on_demo_video_play" in api_names
         assert "precheck_execute_inputs" in api_names
         assert "switch_to_execute_phase" in api_names
         assert "execute_step" in api_names
+        assert "refresh_live_obs" not in api_names
     finally:
         demo.close()

gradio-web/test/test_ui_phase_machine_runtime_e2e.py CHANGED Viewed

@@ -416,6 +416,7 @@ def font_size_probe_ui_url(monkeypatch):
 def phase_machine_ui_url():
     state = {"precheck_calls": 0, "play_clicks": 0}
     demo_video_url = "https://interactive-examples.mdn.mozilla.net/media/cc0-videos/flower.mp4"
     ui_layout = importlib.reload(importlib.import_module("ui_layout"))
     with gr.Blocks(title="Native phase machine test") as demo:
@@ -504,7 +505,6 @@ def phase_machine_ui_url():
                 gr.update(interactive=False),
                 gr.update(interactive=False),
                 gr.update(interactive=False),
-                "execution_playback",
             )
         def execute_fn():
@@ -513,16 +513,15 @@ def phase_machine_ui_url():
                 "executed",
                 gr.update(interactive=True),
                 gr.update(interactive=True),
-            )
-        def to_action_fn():
-            return (
-                gr.update(interactive=True),
-                gr.update(interactive=True),
-                gr.update(interactive=True),
                 gr.update(interactive=True),
                 gr.update(interactive=True),
-                "action_point",
             )
         login_btn.click(
@@ -616,16 +615,24 @@ def phase_machine_ui_url():
                 next_task_btn,
                 img_display,
                 reference_action_btn,
-                phase_state,
             ],
             queue=False,
         ).then(
             fn=execute_fn,
-            outputs=[log_output, next_task_btn, exec_btn],
-            queue=False,
-        ).then(
-            fn=to_action_fn,
-            outputs=[options_radio, exec_btn, next_task_btn, img_display, reference_action_btn, phase_state],
             queue=False,
         )
@@ -963,6 +970,21 @@ def test_phase_machine_runtime_flow_and_execute_precheck(phase_machine_ui_url):
         assert interactive_snapshot["execDisabled"] is True
         assert interactive_snapshot["nextDisabled"] is True
         page.wait_for_function(
             """() => {
                 const execBtn = document.querySelector('button#exec_btn') || document.querySelector('#exec_btn button');
@@ -2451,54 +2473,58 @@ def test_header_task_switch_to_video_task_shows_demo_phase(monkeypatch):
 def test_phase_machine_runtime_local_video_path_end_transition():
     import gradio_callbacks as cb
-    ui_layout = importlib.reload(importlib.import_module("ui_layout"))
     demo_video_path = gr.get_video("world.mp4")
     fake_obs = np.zeros((24, 24, 3), dtype=np.uint8)
     class FakeSession:
         def __init__(self):
-            self.env_id = "VideoUnmask"
             self.language_goal = "place cube on target"
-            self.available_options = [("pick", 0)]
-            self.raw_solve_options = [{"available": False}]
-            self.demonstration_frames = [fake_obs.copy() for _ in range(4)]
-        def load_episode(self, env_id, episode_idx):
-            self.env_id = env_id
-            return fake_obs.copy(), f"loaded {env_id}:{episode_idx}"
         def get_pil_image(self, use_segmented=False):
             _ = use_segmented
             return fake_obs.copy()
     originals = {
         "get_session": cb.get_session,
-        "reset_play_button_clicked": cb.reset_play_button_clicked,
-        "reset_execute_count": cb.reset_execute_count,
-        "set_task_start_time": cb.set_task_start_time,
-        "set_ui_phase": cb.set_ui_phase,
         "save_video": cb.save_video,
     }
     fake_session = FakeSession()
     cb.get_session = lambda uid: fake_session
-    cb.reset_play_button_clicked = lambda uid: None
-    cb.reset_execute_count = lambda uid, env_id, ep_num: None
-    cb.set_task_start_time = lambda uid, env_id, ep_num, start_time: None
-    cb.set_ui_phase = lambda uid, phase: None
     cb.save_video = lambda frames, suffix="": demo_video_path
     try:
         with gr.Blocks(title="Native phase machine local video test") as demo:
             uid_state = gr.State(value="uid-local-video")
-            demo.load(
-                fn=None,
-                js=ui_layout.DEMO_VIDEO_PLAY_BINDING_JS,
-                queue=False,
-            )
-            with gr.Column(visible=False, elem_id="main_interface") as main_interface:
                 with gr.Column(visible=False, elem_id="video_phase_group") as video_phase_group:
                     video_display = gr.Video(value=None, elem_id="demo_video", autoplay=False)
                     watch_demo_video_btn = gr.Button(
@@ -2512,72 +2538,87 @@ def test_phase_machine_runtime_local_video_path_end_transition():
                     img_display = gr.Image(value=fake_obs.copy(), elem_id="live_obs")
                 with gr.Column(visible=True, elem_id="control_panel_group") as control_panel_group:
-                    options_radio = gr.Radio(choices=[("pick", 0)], value=None, elem_id="action_radio")
             log_output = gr.Markdown("", elem_id="log_output")
-            goal_box = gr.Textbox("")
-            coords_box = gr.Textbox("No need for coordinates")
             task_info_box = gr.Textbox("")
             progress_info_box = gr.Textbox("")
-            task_hint_display = gr.Textbox("")
-            with gr.Column(visible=False) as loading_overlay:
-                gr.Markdown("Loading...")
-            restart_episode_btn = gr.Button("restart", interactive=False)
-            next_task_btn = gr.Button("next", interactive=False)
-            exec_btn = gr.Button("execute", interactive=False)
-            reference_action_btn = gr.Button("reference", interactive=False)
-            def load_fn():
-                status = {
-                    "current_task": {"env_id": "VideoUnmask", "episode_idx": 1},
-                    "completed_count": 0,
-                }
-                return cb._load_status_task("uid-local-video", status)
-            demo.load(
-                fn=load_fn,
                 outputs=[
-                    uid_state,
-                    main_interface,
                     img_display,
                     log_output,
-                    options_radio,
-                    goal_box,
-                    coords_box,
-                    video_display,
-                    watch_demo_video_btn,
                     task_info_box,
                     progress_info_box,
                     restart_episode_btn,
                     next_task_btn,
                     exec_btn,
                     video_phase_group,
                     action_phase_group,
                     control_panel_group,
-                    task_hint_display,
-                    loading_overlay,
                     reference_action_btn,
                 ],
                 queue=False,
             )
-            watch_demo_video_btn.click(
-                fn=cb.on_demo_video_play,
-                inputs=[uid_state],
-                outputs=[watch_demo_video_btn],
                 queue=False,
             )
             video_display.end(
                 fn=cb.on_video_end_transition,
-                inputs=[uid_state],
                 outputs=[
                     video_phase_group,
                     action_phase_group,
                     control_panel_group,
                     log_output,
                     watch_demo_video_btn,
                 ],
                 queue=False,
             )
@@ -2601,9 +2642,10 @@ def test_phase_machine_runtime_local_video_path_end_transition():
                 page = browser.new_page(viewport={"width": 1280, "height": 900})
                 page.goto(root_url, wait_until="domcontentloaded")
                 page.wait_for_selector("#main_interface", state="visible", timeout=20000)
                 page.wait_for_selector("#demo_video video", timeout=5000)
-                phase_after_login = page.evaluate(
                     """() => {
                         const visible = (id) => {
                             const el = document.getElementById(id);
@@ -2611,37 +2653,25 @@ def test_phase_machine_runtime_local_video_path_end_transition():
                             const st = getComputedStyle(el);
                             return st.display !== 'none' && st.visibility !== 'hidden' && el.getClientRects().length > 0;
                         };
-                        return {
-                            video: visible('demo_video'),
-                            watchButton: visible('watch_demo_video_btn'),
-                            action: visible('live_obs'),
-                            control: visible('action_radio'),
-                        };
-                    }"""
-                )
-                assert phase_after_login == {
-                    "video": True,
-                    "watchButton": True,
-                    "action": False,
-                    "control": False,
-                }
-                controls_after_login = _read_demo_video_controls(page)
-                assert controls_after_login["buttonVisible"] is True
-                assert controls_after_login["buttonDisabled"] is False
-                assert controls_after_login["autoplay"] is False
-                assert controls_after_login["paused"] is True
-                _click_demo_video_button(page)
-                page.wait_for_function(
-                    """() => {
-                        const button =
-                            document.querySelector('#watch_demo_video_btn button') ||
-                            document.querySelector('button#watch_demo_video_btn');
-                        return !!button && button.disabled === true;
                     }""",
-                    timeout=5000,
                 )
                 did_dispatch_end = _dispatch_video_event(page, "ended")
                 assert did_dispatch_end
@@ -2663,6 +2693,30 @@ def test_phase_machine_runtime_local_video_path_end_transition():
                     }""",
                     timeout=2000,
                 )
                 browser.close()
         finally:

 def phase_machine_ui_url():
     state = {"precheck_calls": 0, "play_clicks": 0}
     demo_video_url = "https://interactive-examples.mdn.mozilla.net/media/cc0-videos/flower.mp4"
+    execution_video_path = gr.get_video("world.mp4")
     ui_layout = importlib.reload(importlib.import_module("ui_layout"))
     with gr.Blocks(title="Native phase machine test") as demo:
                 gr.update(interactive=False),
                 gr.update(interactive=False),
                 gr.update(interactive=False),
             )
         def execute_fn():
                 "executed",
                 gr.update(interactive=True),
                 gr.update(interactive=True),
+                gr.update(value=execution_video_path, visible=True, autoplay=True, playback_position=0),
+                gr.update(visible=False, interactive=False),
+                gr.update(visible=True),
+                gr.update(visible=False),
+                gr.update(visible=False),
                 gr.update(interactive=True),
+                "No need for coordinates",
                 gr.update(interactive=True),
+                "execution_video",
             )
         login_btn.click(
                 next_task_btn,
                 img_display,
                 reference_action_btn,
             ],
             queue=False,
         ).then(
             fn=execute_fn,
+            outputs=[
+                log_output,
+                next_task_btn,
+                exec_btn,
+                video_display,
+                watch_demo_video_btn,
+                video_phase_group,
+                action_phase_group,
+                control_panel_group,
+                options_radio,
+                coords_box,
+                reference_action_btn,
+                phase_state,
+            ],
             queue=False,
         )
         assert interactive_snapshot["execDisabled"] is True
         assert interactive_snapshot["nextDisabled"] is True
+        page.wait_for_function(
+            """() => {
+                const videoEl = document.querySelector('#demo_video video');
+                return !!videoEl && videoEl.autoplay === true && (videoEl.paused === false || videoEl.currentTime > 0);
+            }""",
+            timeout=6000,
+        )
+        execute_video_controls = _read_demo_video_controls(page)
+        assert execute_video_controls["videoVisible"] is True
+        assert execute_video_controls["autoplay"] is True
+        assert execute_video_controls["paused"] is False
+        did_dispatch_end = _dispatch_video_event(page, "ended")
+        assert did_dispatch_end
         page.wait_for_function(
             """() => {
                 const execBtn = document.querySelector('button#exec_btn') || document.querySelector('#exec_btn button');
 def test_phase_machine_runtime_local_video_path_end_transition():
     import gradio_callbacks as cb
+    import config as config_module
+    ui_layout = importlib.reload(importlib.import_module("ui_layout"))
     demo_video_path = gr.get_video("world.mp4")
     fake_obs = np.zeros((24, 24, 3), dtype=np.uint8)
     class FakeSession:
         def __init__(self):
+            self.env_id = "BinFill"
+            self.episode_idx = 1
             self.language_goal = "place cube on target"
+            self.available_options = [("pick", 0), ("point", 1)]
+            self.raw_solve_options = [{"available": False}, {"available": [object()]}]
+            self.demonstration_frames = []
+            self.last_execution_frames = []
+            self.base_frames = [fake_obs.copy()]
+            self.non_demonstration_task_length = None
+            self.difficulty = "easy"
+            self.seed = 123
         def get_pil_image(self, use_segmented=False):
             _ = use_segmented
             return fake_obs.copy()
+        def update_observation(self, use_segmentation=False):
+            _ = use_segmentation
+            return None
+        def execute_action(self, option_idx, click_coords):
+            _ = option_idx, click_coords
+            self.last_execution_frames = [fake_obs.copy() for _ in range(3)]
+            self.base_frames.extend(self.last_execution_frames)
+            return fake_obs.copy(), "Executing: pick", False
     originals = {
         "get_session": cb.get_session,
+        "increment_execute_count": cb.increment_execute_count,
         "save_video": cb.save_video,
     }
     fake_session = FakeSession()
     cb.get_session = lambda uid: fake_session
+    cb.increment_execute_count = lambda uid, env_id, ep_num: 1
     cb.save_video = lambda frames, suffix="": demo_video_path
     try:
         with gr.Blocks(title="Native phase machine local video test") as demo:
             uid_state = gr.State(value="uid-local-video")
+            phase_state = gr.State(value="action_point")
+            suppress_state = gr.State(value=False)
+            with gr.Column(visible=True, elem_id="main_interface") as main_interface:
                 with gr.Column(visible=False, elem_id="video_phase_group") as video_phase_group:
                     video_display = gr.Video(value=None, elem_id="demo_video", autoplay=False)
                     watch_demo_video_btn = gr.Button(
                     img_display = gr.Image(value=fake_obs.copy(), elem_id="live_obs")
                 with gr.Column(visible=True, elem_id="control_panel_group") as control_panel_group:
+                    options_radio = gr.Radio(choices=[("pick", 0), ("point", 1)], value=None, elem_id="action_radio")
+                    coords_box = gr.Textbox(config_module.UI_TEXT["coords"]["not_needed"], elem_id="coords_box")
+                    exec_btn = gr.Button("execute", interactive=True, elem_id="exec_btn")
+                    reference_action_btn = gr.Button("reference", interactive=True, elem_id="reference_action_btn")
+                    restart_episode_btn = gr.Button("restart", interactive=True, elem_id="restart_episode_btn")
+                    next_task_btn = gr.Button("next", interactive=True, elem_id="next_task_btn")
             log_output = gr.Markdown("", elem_id="log_output")
             task_info_box = gr.Textbox("")
             progress_info_box = gr.Textbox("")
+            exec_btn.click(
+                fn=cb.precheck_execute_inputs,
+                inputs=[uid_state, options_radio, coords_box],
+                outputs=[],
+                queue=False,
+            ).then(
+                fn=cb.switch_to_execute_phase,
+                inputs=[uid_state],
+                outputs=[
+                    options_radio,
+                    exec_btn,
+                    restart_episode_btn,
+                    next_task_btn,
+                    img_display,
+                    reference_action_btn,
+                ],
+                queue=False,
+            ).then(
+                fn=cb.execute_step,
+                inputs=[uid_state, options_radio, coords_box],
                 outputs=[
                     img_display,
                     log_output,
                     task_info_box,
                     progress_info_box,
                     restart_episode_btn,
                     next_task_btn,
                     exec_btn,
+                    video_display,
+                    watch_demo_video_btn,
                     video_phase_group,
                     action_phase_group,
                     control_panel_group,
+                    options_radio,
+                    coords_box,
                     reference_action_btn,
+                    phase_state,
                 ],
                 queue=False,
             )
+            options_radio.change(
+                fn=cb.on_option_select,
+                inputs=[uid_state, options_radio, coords_box, suppress_state],
+                outputs=[coords_box, img_display, log_output, suppress_state],
                 queue=False,
             )
             video_display.end(
                 fn=cb.on_video_end_transition,
+                inputs=[uid_state, phase_state],
                 outputs=[
                     video_phase_group,
                     action_phase_group,
                     control_panel_group,
                     log_output,
                     watch_demo_video_btn,
+                    phase_state,
+                ],
+                queue=False,
+            )
+            video_display.stop(
+                fn=cb.on_video_end_transition,
+                inputs=[uid_state, phase_state],
+                outputs=[
+                    video_phase_group,
+                    action_phase_group,
+                    control_panel_group,
+                    log_output,
+                    watch_demo_video_btn,
+                    phase_state,
                 ],
                 queue=False,
             )
                 page = browser.new_page(viewport={"width": 1280, "height": 900})
                 page.goto(root_url, wait_until="domcontentloaded")
                 page.wait_for_selector("#main_interface", state="visible", timeout=20000)
+                page.locator("#action_radio input[type='radio']").first.check(force=True)
+                page.locator("#exec_btn button, button#exec_btn").first.click()
                 page.wait_for_selector("#demo_video video", timeout=5000)
+                page.wait_for_function(
                     """() => {
                         const visible = (id) => {
                             const el = document.getElementById(id);
                             const st = getComputedStyle(el);
                             return st.display !== 'none' && st.visibility !== 'hidden' && el.getClientRects().length > 0;
                         };
+                        const videoEl = document.querySelector('#demo_video video');
+                        return (
+                            visible('video_phase_group') &&
+                            visible('demo_video') &&
+                            !visible('watch_demo_video_btn') &&
+                            !visible('action_phase_group') &&
+                            !visible('control_panel_group') &&
+                            !!videoEl &&
+                            videoEl.autoplay === true &&
+                            (videoEl.paused === false || videoEl.currentTime > 0)
+                        );
                     }""",
+                    timeout=10000,
                 )
+                controls_after_execute = _read_demo_video_controls(page)
+                assert controls_after_execute["videoVisible"] is True
+                assert controls_after_execute["buttonVisible"] is False
+                assert controls_after_execute["autoplay"] is True
+                assert controls_after_execute["paused"] is False
                 did_dispatch_end = _dispatch_video_event(page, "ended")
                 assert did_dispatch_end
                     }""",
                     timeout=2000,
                 )
+                page.locator("#action_radio input[type='radio']").nth(1).check(force=True)
+                page.wait_for_function(
+                    """(state) => {
+                        const liveObs = document.getElementById('live_obs');
+                        const coordsRoot = document.getElementById('coords_box');
+                        const coordsField = coordsRoot?.querySelector('textarea, input');
+                        const logRoot = document.getElementById('log_output');
+                        const logField = logRoot?.querySelector('textarea, input');
+                        const coordsValue = coordsField ? coordsField.value.trim() : '';
+                        const logValue = logField ? logField.value.trim() : (logRoot?.textContent || '').trim();
+                        return (
+                            !!liveObs &&
+                            liveObs.classList.contains(state.waitClass) &&
+                            coordsValue === state.coordsPrompt &&
+                            logValue === state.waitLog
+                        );
+                    }""",
+                    arg={
+                        "waitClass": config_module.LIVE_OBS_POINT_WAIT_CLASS,
+                        "coordsPrompt": config_module.UI_TEXT["coords"]["select_point"],
+                        "waitLog": config_module.UI_TEXT["log"]["point_selection_prompt"],
+                    },
+                    timeout=5000,
+                )
                 browser.close()
         finally:

gradio-web/test/test_ui_text_config.py CHANGED Viewed

@@ -75,11 +75,12 @@ def test_on_video_end_transition_uses_configured_action_prompt(monkeypatch, relo
     monkeypatch.setitem(callbacks.UI_TEXT["log"], "action_selection_prompt", "choose an action from config")
-    result = callbacks.on_video_end_transition("uid-1")
     assert result[3] == "choose an action from config"
     assert result[4]["visible"] is False
     assert result[4]["interactive"] is False
 def test_on_demo_video_play_disables_button_and_sets_single_use_state(monkeypatch, reload_module):

     monkeypatch.setitem(callbacks.UI_TEXT["log"], "action_selection_prompt", "choose an action from config")
+    result = callbacks.on_video_end_transition("uid-1", "demo_video")
     assert result[3] == "choose an action from config"
     assert result[4]["visible"] is False
     assert result[4]["interactive"] is False
+    assert result[5] == "action_point"
 def test_on_demo_video_play_disables_button_and_sets_single_use_state(monkeypatch, reload_module):

gradio-web/ui_layout.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """
 Native Gradio UI layout.
-Sequential media phases: Demo Video -> Action+Point.
 Two-column layout: Point Selection | Right Panel.
 """
@@ -16,7 +16,6 @@ from config import (
     SESSION_CONCURRENCY_ID,
     SESSION_CONCURRENCY_LIMIT,
     SESSION_TIMEOUT,
-    LIVE_OBS_REFRESH_HZ,
     POINT_SELECTION_SCALE,
     RIGHT_TOP_ACTION_SCALE,
     RIGHT_TOP_LOG_SCALE,
@@ -36,7 +35,6 @@ from gradio_callbacks import (
     on_reference_action,
     on_video_end_transition,
     precheck_execute_inputs,
-    refresh_live_obs,
     restart_episode_wrapper,
     switch_env_wrapper,
     switch_to_action_phase,
@@ -49,7 +47,7 @@ from user_manager import user_manager
 PHASE_INIT = "init"
 PHASE_DEMO_VIDEO = "demo_video"
 PHASE_ACTION_POINT = "action_point"
-PHASE_EXECUTION_PLAYBACK = "execution_playback"
 LOAD_STATUS_MODE_IDLE = "idle"
 LOAD_STATUS_MODE_EPISODE_LOAD = "episode_load"
@@ -863,13 +861,13 @@ def _with_rejected_init(load_result, message):
 def _phase_visibility_updates(phase):
-    if phase == PHASE_DEMO_VIDEO:
         return (
             gr.update(visible=True),
             gr.update(visible=False),
             gr.update(visible=False),
         )
-    if phase in {PHASE_ACTION_POINT, PHASE_EXECUTION_PLAYBACK}:
         return (
             gr.update(visible=False),
             gr.update(visible=True),
@@ -931,7 +929,6 @@ def create_ui_blocks():
         ui_phase_state = gr.State(value=PHASE_INIT)
         current_task_env_state = gr.State(value=None)
         suppress_next_option_change_state = gr.State(value=False)
-        live_obs_timer = gr.Timer(value=1.0 / LIVE_OBS_REFRESH_HZ, active=True)
         task_info_box = gr.Textbox(visible=False, elem_id="task_info_box")
         progress_info_box = gr.Textbox(visible=False)
@@ -949,7 +946,7 @@ def create_ui_blocks():
                     with gr.Column(elem_classes=["native-card"], elem_id="media_card"):
                         with gr.Column(visible=False, elem_id="video_phase_group") as video_phase_group:
                             video_display = gr.Video(
-                                label="Demonstration Video 🎬",
                                 interactive=False,
                                 elem_id="demo_video",
                                 autoplay=False,
@@ -1316,21 +1313,17 @@ def create_ui_blocks():
         video_display.end(
             fn=on_video_end_transition,
-            inputs=[uid_state],
             outputs=[
                 video_phase_group,
                 action_phase_group,
                 control_panel_group,
                 log_output,
                 watch_demo_video_btn,
             ],
             queue=False,
             show_progress="hidden",
-        ).then(
-            fn=lambda: PHASE_ACTION_POINT,
-            outputs=[ui_phase_state],
-            queue=False,
-            show_progress="hidden",
         ).then(
             fn=touch_session,
             inputs=[uid_state],
@@ -1340,21 +1333,17 @@ def create_ui_blocks():
         )
         video_display.stop(
             fn=on_video_end_transition,
-            inputs=[uid_state],
             outputs=[
                 video_phase_group,
                 action_phase_group,
                 control_panel_group,
                 log_output,
                 watch_demo_video_btn,
             ],
             queue=False,
             show_progress="hidden",
-        ).then(
-            fn=lambda: PHASE_ACTION_POINT,
-            outputs=[ui_phase_state],
-            queue=False,
-            show_progress="hidden",
         ).then(
             fn=touch_session,
             inputs=[uid_state],
@@ -1437,11 +1426,6 @@ def create_ui_blocks():
             ],
             queue=False,
             show_progress="hidden",
-        ).then(
-            fn=lambda: PHASE_EXECUTION_PLAYBACK,
-            outputs=[ui_phase_state],
-            queue=False,
-            show_progress="hidden",
         ).then(
             fn=touch_session,
             inputs=[uid_state],
@@ -1451,27 +1435,26 @@ def create_ui_blocks():
         ).then(
             fn=execute_step,
             inputs=[uid_state, options_radio, coords_box],
-            outputs=[img_display, log_output, task_info_box, progress_info_box, restart_episode_btn, next_task_btn, exec_btn],
-            show_progress="hidden",
-            **action_queue_kwargs,
-        ).then(
-            fn=switch_to_action_phase,
-            inputs=[uid_state],
             outputs=[
-                options_radio,
-                exec_btn,
                 restart_episode_btn,
                 next_task_btn,
-                img_display,
                 reference_action_btn,
             ],
-            queue=False,
-            show_progress="hidden",
-        ).then(
-            fn=lambda: PHASE_ACTION_POINT,
-            outputs=[ui_phase_state],
-            queue=False,
             show_progress="hidden",
         ).then(
             fn=touch_session,
             inputs=[uid_state],
@@ -1480,14 +1463,6 @@ def create_ui_blocks():
             show_progress="hidden",
         )
-        live_obs_timer.tick(
-            fn=refresh_live_obs,
-            inputs=[uid_state, ui_phase_state],
-            outputs=[img_display],
-            queue=False,
-            show_progress="hidden",
-        )
         demo.load(
             fn=None,
             js=THEME_LOCK_JS,

 """
 Native Gradio UI layout.
+Sequential media phases: Demo Video -> Action+Point -> Execute Video.
 Two-column layout: Point Selection | Right Panel.
 """
     SESSION_CONCURRENCY_ID,
     SESSION_CONCURRENCY_LIMIT,
     SESSION_TIMEOUT,
     POINT_SELECTION_SCALE,
     RIGHT_TOP_ACTION_SCALE,
     RIGHT_TOP_LOG_SCALE,
     on_reference_action,
     on_video_end_transition,
     precheck_execute_inputs,
     restart_episode_wrapper,
     switch_env_wrapper,
     switch_to_action_phase,
 PHASE_INIT = "init"
 PHASE_DEMO_VIDEO = "demo_video"
 PHASE_ACTION_POINT = "action_point"
+PHASE_EXECUTION_VIDEO = "execution_video"
 LOAD_STATUS_MODE_IDLE = "idle"
 LOAD_STATUS_MODE_EPISODE_LOAD = "episode_load"
 def _phase_visibility_updates(phase):
+    if phase in {PHASE_DEMO_VIDEO, PHASE_EXECUTION_VIDEO}:
         return (
             gr.update(visible=True),
             gr.update(visible=False),
             gr.update(visible=False),
         )
+    if phase == PHASE_ACTION_POINT:
         return (
             gr.update(visible=False),
             gr.update(visible=True),
         ui_phase_state = gr.State(value=PHASE_INIT)
         current_task_env_state = gr.State(value=None)
         suppress_next_option_change_state = gr.State(value=False)
         task_info_box = gr.Textbox(visible=False, elem_id="task_info_box")
         progress_info_box = gr.Textbox(visible=False)
                     with gr.Column(elem_classes=["native-card"], elem_id="media_card"):
                         with gr.Column(visible=False, elem_id="video_phase_group") as video_phase_group:
                             video_display = gr.Video(
+                                label="Video Playback 🎬",
                                 interactive=False,
                                 elem_id="demo_video",
                                 autoplay=False,
         video_display.end(
             fn=on_video_end_transition,
+            inputs=[uid_state, ui_phase_state],
             outputs=[
                 video_phase_group,
                 action_phase_group,
                 control_panel_group,
                 log_output,
                 watch_demo_video_btn,
+                ui_phase_state,
             ],
             queue=False,
             show_progress="hidden",
         ).then(
             fn=touch_session,
             inputs=[uid_state],
         )
         video_display.stop(
             fn=on_video_end_transition,
+            inputs=[uid_state, ui_phase_state],
             outputs=[
                 video_phase_group,
                 action_phase_group,
                 control_panel_group,
                 log_output,
                 watch_demo_video_btn,
+                ui_phase_state,
             ],
             queue=False,
             show_progress="hidden",
         ).then(
             fn=touch_session,
             inputs=[uid_state],
             ],
             queue=False,
             show_progress="hidden",
         ).then(
             fn=touch_session,
             inputs=[uid_state],
         ).then(
             fn=execute_step,
             inputs=[uid_state, options_radio, coords_box],
             outputs=[
+                img_display,
+                log_output,
+                task_info_box,
+                progress_info_box,
                 restart_episode_btn,
                 next_task_btn,
+                exec_btn,
+                video_display,
+                watch_demo_video_btn,
+                video_phase_group,
+                action_phase_group,
+                control_panel_group,
+                options_radio,
+                coords_box,
                 reference_action_btn,
+                ui_phase_state,
             ],
             show_progress="hidden",
+            **action_queue_kwargs,
         ).then(
             fn=touch_session,
             inputs=[uid_state],
             show_progress="hidden",
         )
         demo.load(
             fn=None,
             js=THEME_LOCK_JS,