HongzeFu commited on
Commit
93eb118
·
1 Parent(s): de73880
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
  title: RoboMME Oracle Planner
3
  sdk: gradio
4
- app_file: app.py
5
  python_version: "3.11"
6
  ---
7
 
 
1
  ---
2
  title: RoboMME Oracle Planner
3
  sdk: gradio
4
+ app_file: gradio-web/main.py
5
  python_version: "3.11"
6
  ---
7
 
gradio-web/gradio_callbacks.py CHANGED
@@ -2,6 +2,7 @@
2
  Gradio回调函数模块
3
  响应UI事件,调用业务逻辑,返回UI更新
4
  """
 
5
  import gradio as gr
6
  import numpy as np
7
  import time
@@ -39,6 +40,14 @@ from note_content import get_task_hint
39
  # Each uid keeps its own FIFO queue and sampling cursor.
40
  _LIVE_OBS_REFRESH = {}
41
  _LIVE_OBS_REFRESH_LOCK = threading.Lock()
 
 
 
 
 
 
 
 
42
 
43
 
44
  def capitalize_first_letter(text: str) -> str:
@@ -164,6 +173,7 @@ def show_loading_info():
164
  Returns:
165
  gr.update: 显示 loading overlay group
166
  """
 
167
  return gr.update(visible=True)
168
 
169
 
@@ -180,6 +190,11 @@ def switch_to_execute_phase(uid):
180
  if uid:
181
  session = get_session(uid)
182
  base_count = len(getattr(session, "base_frames", []) or []) if session else 0
 
 
 
 
 
183
  with _LIVE_OBS_REFRESH_LOCK:
184
  _LIVE_OBS_REFRESH[uid] = {
185
  "frame_queue": queue.Queue(),
@@ -199,6 +214,7 @@ def switch_to_execute_phase(uid):
199
  def switch_to_action_phase(uid=None):
200
  """Switch display to action phase and restore control panel interactions."""
201
  if uid:
 
202
  with _LIVE_OBS_REFRESH_LOCK:
203
  _LIVE_OBS_REFRESH.pop(uid, None)
204
  return (
@@ -355,6 +371,7 @@ def on_video_end_transition(uid):
355
 
356
 
357
  def _task_load_failed_response(uid, message):
 
358
  return (
359
  uid,
360
  gr.update(visible=True), # main_interface
@@ -394,17 +411,24 @@ def _load_status_task(uid, status):
394
  except (TypeError, ValueError):
395
  completed_count = 0
396
  progress_text = f"Completed: {completed_count}"
 
 
 
 
 
 
 
397
 
398
  session = get_session(uid)
399
  if session is None:
400
- print(f"Session {uid} not found, creating new session")
401
  session = ProcessSessionProxy()
402
  with _state_lock:
403
  GLOBAL_SESSIONS[uid] = session
404
  SESSION_LAST_ACTIVITY[uid] = time.time()
405
- print(f"New session created for {uid}")
406
 
407
- print(f"Loading {env_id} Ep {ep_num} for {uid}")
408
 
409
  with _LIVE_OBS_REFRESH_LOCK:
410
  _LIVE_OBS_REFRESH.pop(uid, None)
@@ -413,6 +437,14 @@ def _load_status_task(uid, status):
413
 
414
  img, load_msg = session.load_episode(env_id, int(ep_num))
415
  actual_env_id = getattr(session, "env_id", None) or env_id
 
 
 
 
 
 
 
 
416
 
417
  if img is not None:
418
  start_time = datetime.now().isoformat()
@@ -460,6 +492,12 @@ def _load_status_task(uid, status):
460
  else:
461
  opt_label_with_hint = opt_label
462
  radio_choices.append((opt_label_with_hint, opt_idx))
 
 
 
 
 
 
463
 
464
  demo_video_path = None
465
  has_demo_video = False
@@ -479,6 +517,13 @@ def _load_status_task(uid, status):
479
  demo_video_path = None
480
  except Exception:
481
  demo_video_path = None
 
 
 
 
 
 
 
482
 
483
  img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
484
 
@@ -537,17 +582,22 @@ def init_session_and_load_task(uid):
537
  if not uid:
538
  uid = create_session()
539
 
540
- print(f"[DEBUG init_session_and_load_task] Calling user_manager.init_session({uid})")
541
  success, msg, status = user_manager.init_session(uid)
542
- print(f"[DEBUG init_session_and_load_task] init_session result: success={success}, msg={msg}")
 
 
 
 
 
543
 
544
  if uid:
545
  update_session_activity(uid)
546
 
547
  if not success:
548
- print(f"[DEBUG init_session_and_load_task] Failed, returning error response")
549
  return _task_load_failed_response(uid, msg)
550
- print(f"[DEBUG init_session_and_load_task] Success, loading task...")
551
  return _load_status_task(uid, status)
552
 
553
 
@@ -556,10 +606,12 @@ def load_next_task_wrapper(uid):
556
 
557
  if not uid:
558
  uid = create_session()
 
559
 
560
  if uid:
561
  update_session_activity(uid)
562
 
 
563
  status = user_manager.next_episode_same_env(uid)
564
  if not status:
565
  return _task_load_failed_response(uid, "Failed to load next task")
@@ -570,10 +622,12 @@ def restart_episode_wrapper(uid):
570
  """Reload the current env + episode."""
571
  if not uid:
572
  uid = create_session()
 
573
 
574
  if uid:
575
  update_session_activity(uid)
576
 
 
577
  status = user_manager.get_session_status(uid)
578
  current_task = status.get("current_task") if isinstance(status, dict) else None
579
  if not current_task:
@@ -591,10 +645,16 @@ def switch_env_wrapper(uid, selected_env):
591
  """Switch env from Current Task dropdown and randomly assign an episode."""
592
  if not uid:
593
  uid = create_session()
 
594
 
595
  if uid:
596
  update_session_activity(uid)
597
 
 
 
 
 
 
598
  if selected_env:
599
  status = user_manager.switch_env_and_random_episode(uid, selected_env)
600
  else:
@@ -616,6 +676,7 @@ def on_map_click(uid, option_value, evt: gr.SelectData):
616
 
617
  session = get_session(uid)
618
  if not session:
 
619
  return None, "Session Error"
620
 
621
  # Check if current option actually needs coordinates
@@ -634,12 +695,25 @@ def on_map_click(uid, option_value, evt: gr.SelectData):
634
  needs_coords = True
635
 
636
  if not needs_coords:
 
 
 
 
 
 
637
  # Return current state without changes (or reset to default message if needed, but it should already be there)
638
  # We return the clean image and the "No need" message to enforce state
639
  base_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
640
  return base_img, "No need for coordinates"
641
 
642
  x, y = evt.index[0], evt.index[1]
 
 
 
 
 
 
 
643
 
644
  # Get clean image from session
645
  base_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
@@ -676,6 +750,7 @@ def on_option_select(uid, option_value, coords_str=None):
676
  default_msg = "No need for coordinates"
677
 
678
  if option_value is None:
 
679
  return default_msg, gr.update(interactive=False)
680
 
681
  # 更新session活动时间(选择选项操作)
@@ -684,6 +759,7 @@ def on_option_select(uid, option_value, coords_str=None):
684
 
685
  session = get_session(uid)
686
  if not session:
 
687
  return default_msg, gr.update(interactive=False)
688
 
689
  # option_value 是 (label, idx) 元组或直接是 idx
@@ -696,10 +772,17 @@ def on_option_select(uid, option_value, coords_str=None):
696
  if 0 <= option_idx < len(session.raw_solve_options):
697
  opt = session.raw_solve_options[option_idx]
698
  if opt.get("available"):
 
 
 
 
 
 
699
  if _is_valid_coords_text(coords_str):
700
  return coords_str, gr.update(interactive=True)
701
  return "please click the keypoint selection image", gr.update(interactive=True)
702
 
 
703
  return default_msg, gr.update(interactive=False)
704
 
705
 
@@ -712,6 +795,7 @@ def on_reference_action(uid):
712
 
713
  session = get_session(uid)
714
  if not session:
 
715
  return (
716
  None,
717
  gr.update(),
@@ -719,11 +803,13 @@ def on_reference_action(uid):
719
  format_log_markdown("Session Error"),
720
  )
721
 
 
722
  current_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
723
 
724
  try:
725
  reference = session.get_reference_action()
726
  except Exception as exc:
 
727
  return (
728
  current_img,
729
  gr.update(),
@@ -758,6 +844,13 @@ def on_reference_action(uid):
758
  updated_img = draw_marker(current_img, x, y)
759
  coords_text = f"{x}, {y}"
760
  log_text = f"Ground Truth Action: {option_label}. {option_action} | coords: {coords_text}"
 
 
 
 
 
 
 
761
 
762
  return (
763
  updated_img,
@@ -777,18 +870,16 @@ def init_app(request: gr.Request):
777
  Returns:
778
  初始化后的UI状态
779
  """
780
- import traceback
781
  _ = request # Query params are intentionally ignored in session-based mode.
782
  try:
783
- print(f"[DEBUG init_app] Creating session...")
784
  uid = create_session()
785
- print(f"[DEBUG init_app] Session created: {uid}")
786
  result = init_session_and_load_task(uid)
787
- print(f"[DEBUG init_app] init_session_and_load_task returned {len(result)} elements")
788
  return result
789
  except Exception as e:
790
- print(f"[ERROR init_app] Exception: {e}")
791
- traceback.print_exc()
792
  # Return a safe fallback that hides the loading overlay and shows error
793
  return _task_load_failed_response("", f"Initialization error: {e}")
794
 
@@ -803,6 +894,7 @@ def precheck_execute_inputs(uid, option_idx, coords_str):
803
 
804
  session = get_session(uid)
805
  if not session:
 
806
  raise gr.Error("Session Error")
807
 
808
  parsed_option_idx = option_idx
@@ -810,6 +902,7 @@ def precheck_execute_inputs(uid, option_idx, coords_str):
810
  _, parsed_option_idx = option_idx
811
 
812
  if parsed_option_idx is None:
 
813
  raise gr.Error("Error: No action selected")
814
 
815
  needs_coords = False
@@ -821,15 +914,39 @@ def precheck_execute_inputs(uid, option_idx, coords_str):
821
  needs_coords = bool(opt.get("available"))
822
 
823
  if needs_coords and not _is_valid_coords_text(coords_str):
 
 
 
 
 
 
824
  raise gr.Error("please click the keypoint selection image before execute!")
 
 
 
 
 
 
825
 
826
 
827
  def execute_step(uid, option_idx, coords_str):
 
 
 
 
 
 
828
  # 检查session是否超时(在更新活动时间之前检查)
829
  last_activity = get_session_activity(uid)
830
  if last_activity is not None:
831
  elapsed = time.time() - last_activity
832
  if elapsed > SESSION_TIMEOUT:
 
 
 
 
 
 
833
  raise gr.Error(f"Session已超时:超过 {SESSION_TIMEOUT} 秒未活动。请刷新页面重新登录。")
834
 
835
  # 更新session的最后活动时间
@@ -837,6 +954,7 @@ def execute_step(uid, option_idx, coords_str):
837
 
838
  session = get_session(uid)
839
  if not session:
 
840
  return None, format_log_markdown("Session Error"), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=False)
841
 
842
  # 检查 execute 次数限制(在执行前检查,如果达到限制则模拟失败状态)
@@ -851,12 +969,23 @@ def execute_step(uid, option_idx, coords_str):
851
  current_count = get_execute_count(uid, session.env_id, session.episode_idx)
852
  if current_count >= max_execute:
853
  execute_limit_reached = True
 
 
 
 
 
 
 
 
 
854
 
855
  # Ensure at least one cached frame exists for timer-based refresh.
856
  if not session.base_frames:
 
857
  session.update_observation(use_segmentation=USE_SEGMENTED_VIEW)
858
 
859
  if option_idx is None:
 
860
  return session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW), format_log_markdown("Error: No action selected"), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
861
 
862
  # 检查当前选项是否需要坐标
@@ -883,6 +1012,12 @@ def execute_step(uid, option_idx, coords_str):
883
 
884
  # 如果需要坐标但没有有效坐标,返回错误提示
885
  if not is_valid_coords:
 
 
 
 
 
 
886
  current_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
887
  error_msg = "please click the keypoint selection image before execute!"
888
  return current_img, format_log_markdown(error_msg), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
@@ -918,11 +1053,24 @@ def execute_step(uid, option_idx, coords_str):
918
  # 增加 execute 计数(因为这也算一次尝试)
919
  if uid and session.env_id is not None and session.episode_idx is not None:
920
  new_count = increment_execute_count(uid, session.env_id, session.episode_idx)
921
- print(f"Execute limit reached for {uid}:{session.env_id}:{session.episode_idx} (count: {new_count})")
 
 
 
 
 
 
922
  else:
923
  # 正常执行
924
  # 异常处理:所有异常(ScrewPlanFailure 和其他执行错误)都会显示弹窗通知
925
- print(f"Executing step: Opt {option_idx}, Coords {click_coords}")
 
 
 
 
 
 
 
926
  try:
927
  img, status, done = session.execute_action(option_idx, click_coords)
928
  except ScrewPlanFailureError as e:
@@ -935,6 +1083,7 @@ def execute_step(uid, option_idx, coords_str):
935
  done = False
936
  # 继续正常返回流程
937
  img = current_img
 
938
  except RuntimeError as e:
939
  # 捕获所有其他执行错误,显示弹窗通知
940
  error_message = str(e)
@@ -945,16 +1094,24 @@ def execute_step(uid, option_idx, coords_str):
945
  done = False
946
  # 继续正常返回流程
947
  img = current_img
 
948
 
949
  # 增加 execute 计数(无论成功或失败都计数,因为用户已经执行了一次操作)
950
  if uid and session.env_id is not None and session.episode_idx is not None:
951
  new_count = increment_execute_count(uid, session.env_id, session.episode_idx)
952
- print(f"Execute count for {uid}:{session.env_id}:{session.episode_idx} = {new_count}")
 
 
 
 
 
 
953
 
954
  # Execute frames are produced in batch when execute_action returns from worker process.
955
  # Enqueue them now, then wait briefly for the 0.1s timer to drain FIFO playback.
956
  _enqueue_live_obs_frames(uid, getattr(session, "base_frames", None))
957
  _wait_for_live_obs_queue_drain(uid)
 
958
 
959
  # 注意:执行阶段画面由 live_obs 的 0.1s 轮询刷新。
960
 
@@ -990,6 +1147,14 @@ def execute_step(uid, option_idx, coords_str):
990
  completed_count = user_status.get("completed_count", 0)
991
  task_update = f"{session.env_id} (Episode {session.episode_idx})"
992
  progress_update = f"Completed: {completed_count}"
 
 
 
 
 
 
 
 
993
 
994
  # 根据视图模式重新获取图片
995
  img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
@@ -1000,6 +1165,14 @@ def execute_step(uid, option_idx, coords_str):
1000
 
1001
  # 格式化日志消息为 HTML 格式(支持颜色显示)
1002
  formatted_status = format_log_markdown(status)
 
 
 
 
 
 
 
 
1003
 
1004
  return (
1005
  img,
 
2
  Gradio回调函数模块
3
  响应UI事件,调用业务逻辑,返回UI更新
4
  """
5
+ import logging
6
  import gradio as gr
7
  import numpy as np
8
  import time
 
40
  # Each uid keeps its own FIFO queue and sampling cursor.
41
  _LIVE_OBS_REFRESH = {}
42
  _LIVE_OBS_REFRESH_LOCK = threading.Lock()
43
+ LOGGER = logging.getLogger("robomme.callbacks")
44
+
45
+
46
+ def _uid_for_log(uid):
47
+ if not uid:
48
+ return "<none>"
49
+ text = str(uid)
50
+ return text if len(text) <= 12 else f"{text[:8]}..."
51
 
52
 
53
  def capitalize_first_letter(text: str) -> str:
 
173
  Returns:
174
  gr.update: 显示 loading overlay group
175
  """
176
+ LOGGER.debug("show_loading_info: displaying loading overlay")
177
  return gr.update(visible=True)
178
 
179
 
 
190
  if uid:
191
  session = get_session(uid)
192
  base_count = len(getattr(session, "base_frames", []) or []) if session else 0
193
+ LOGGER.debug(
194
+ "switch_to_execute_phase uid=%s base_frames=%s",
195
+ _uid_for_log(uid),
196
+ base_count,
197
+ )
198
  with _LIVE_OBS_REFRESH_LOCK:
199
  _LIVE_OBS_REFRESH[uid] = {
200
  "frame_queue": queue.Queue(),
 
214
  def switch_to_action_phase(uid=None):
215
  """Switch display to action phase and restore control panel interactions."""
216
  if uid:
217
+ LOGGER.debug("switch_to_action_phase uid=%s", _uid_for_log(uid))
218
  with _LIVE_OBS_REFRESH_LOCK:
219
  _LIVE_OBS_REFRESH.pop(uid, None)
220
  return (
 
371
 
372
 
373
  def _task_load_failed_response(uid, message):
374
+ LOGGER.warning("task_load_failed uid=%s message=%s", _uid_for_log(uid), message)
375
  return (
376
  uid,
377
  gr.update(visible=True), # main_interface
 
411
  except (TypeError, ValueError):
412
  completed_count = 0
413
  progress_text = f"Completed: {completed_count}"
414
+ LOGGER.info(
415
+ "load_status_task uid=%s env=%s episode=%s completed=%s",
416
+ _uid_for_log(uid),
417
+ env_id,
418
+ ep_num,
419
+ completed_count,
420
+ )
421
 
422
  session = get_session(uid)
423
  if session is None:
424
+ LOGGER.warning("session missing for uid=%s, creating ProcessSessionProxy", _uid_for_log(uid))
425
  session = ProcessSessionProxy()
426
  with _state_lock:
427
  GLOBAL_SESSIONS[uid] = session
428
  SESSION_LAST_ACTIVITY[uid] = time.time()
429
+ LOGGER.info("created replacement session for uid=%s", _uid_for_log(uid))
430
 
431
+ LOGGER.debug("loading episode env=%s episode=%s uid=%s", env_id, ep_num, _uid_for_log(uid))
432
 
433
  with _LIVE_OBS_REFRESH_LOCK:
434
  _LIVE_OBS_REFRESH.pop(uid, None)
 
437
 
438
  img, load_msg = session.load_episode(env_id, int(ep_num))
439
  actual_env_id = getattr(session, "env_id", None) or env_id
440
+ LOGGER.debug(
441
+ "load_episode result uid=%s env=%s episode=%s img_none=%s message=%s",
442
+ _uid_for_log(uid),
443
+ actual_env_id,
444
+ ep_num,
445
+ img is None,
446
+ load_msg,
447
+ )
448
 
449
  if img is not None:
450
  start_time = datetime.now().isoformat()
 
492
  else:
493
  opt_label_with_hint = opt_label
494
  radio_choices.append((opt_label_with_hint, opt_idx))
495
+ LOGGER.debug(
496
+ "options prepared uid=%s env=%s count=%s",
497
+ _uid_for_log(uid),
498
+ actual_env_id,
499
+ len(radio_choices),
500
+ )
501
 
502
  demo_video_path = None
503
  has_demo_video = False
 
517
  demo_video_path = None
518
  except Exception:
519
  demo_video_path = None
520
+ LOGGER.debug(
521
+ "demo video decision uid=%s env=%s should_show=%s has_path=%s",
522
+ _uid_for_log(uid),
523
+ actual_env_id,
524
+ should_show,
525
+ bool(demo_video_path),
526
+ )
527
 
528
  img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
529
 
 
582
  if not uid:
583
  uid = create_session()
584
 
585
+ LOGGER.debug("init_session_and_load_task: init_session uid=%s", _uid_for_log(uid))
586
  success, msg, status = user_manager.init_session(uid)
587
+ LOGGER.debug(
588
+ "init_session_and_load_task result uid=%s success=%s msg=%s",
589
+ _uid_for_log(uid),
590
+ success,
591
+ msg,
592
+ )
593
 
594
  if uid:
595
  update_session_activity(uid)
596
 
597
  if not success:
598
+ LOGGER.warning("init_session_and_load_task failed uid=%s msg=%s", _uid_for_log(uid), msg)
599
  return _task_load_failed_response(uid, msg)
600
+ LOGGER.debug("init_session_and_load_task success uid=%s -> load_status_task", _uid_for_log(uid))
601
  return _load_status_task(uid, status)
602
 
603
 
 
606
 
607
  if not uid:
608
  uid = create_session()
609
+ LOGGER.debug("load_next_task_wrapper created uid=%s", _uid_for_log(uid))
610
 
611
  if uid:
612
  update_session_activity(uid)
613
 
614
+ LOGGER.info("load_next_task_wrapper uid=%s", _uid_for_log(uid))
615
  status = user_manager.next_episode_same_env(uid)
616
  if not status:
617
  return _task_load_failed_response(uid, "Failed to load next task")
 
622
  """Reload the current env + episode."""
623
  if not uid:
624
  uid = create_session()
625
+ LOGGER.debug("restart_episode_wrapper created uid=%s", _uid_for_log(uid))
626
 
627
  if uid:
628
  update_session_activity(uid)
629
 
630
+ LOGGER.info("restart_episode_wrapper uid=%s", _uid_for_log(uid))
631
  status = user_manager.get_session_status(uid)
632
  current_task = status.get("current_task") if isinstance(status, dict) else None
633
  if not current_task:
 
645
  """Switch env from Current Task dropdown and randomly assign an episode."""
646
  if not uid:
647
  uid = create_session()
648
+ LOGGER.debug("switch_env_wrapper created uid=%s", _uid_for_log(uid))
649
 
650
  if uid:
651
  update_session_activity(uid)
652
 
653
+ LOGGER.info(
654
+ "switch_env_wrapper uid=%s selected_env=%s",
655
+ _uid_for_log(uid),
656
+ selected_env,
657
+ )
658
  if selected_env:
659
  status = user_manager.switch_env_and_random_episode(uid, selected_env)
660
  else:
 
676
 
677
  session = get_session(uid)
678
  if not session:
679
+ LOGGER.warning("on_map_click: missing session uid=%s", _uid_for_log(uid))
680
  return None, "Session Error"
681
 
682
  # Check if current option actually needs coordinates
 
695
  needs_coords = True
696
 
697
  if not needs_coords:
698
+ LOGGER.debug(
699
+ "on_map_click ignored uid=%s option=%s needs_coords=%s",
700
+ _uid_for_log(uid),
701
+ option_value,
702
+ needs_coords,
703
+ )
704
  # Return current state without changes (or reset to default message if needed, but it should already be there)
705
  # We return the clean image and the "No need" message to enforce state
706
  base_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
707
  return base_img, "No need for coordinates"
708
 
709
  x, y = evt.index[0], evt.index[1]
710
+ LOGGER.debug(
711
+ "on_map_click uid=%s option=%s coords=(%s,%s)",
712
+ _uid_for_log(uid),
713
+ option_value,
714
+ x,
715
+ y,
716
+ )
717
 
718
  # Get clean image from session
719
  base_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
 
750
  default_msg = "No need for coordinates"
751
 
752
  if option_value is None:
753
+ LOGGER.debug("on_option_select uid=%s option=None", _uid_for_log(uid))
754
  return default_msg, gr.update(interactive=False)
755
 
756
  # 更新session活动时间(选择选项操作)
 
759
 
760
  session = get_session(uid)
761
  if not session:
762
+ LOGGER.warning("on_option_select: missing session uid=%s", _uid_for_log(uid))
763
  return default_msg, gr.update(interactive=False)
764
 
765
  # option_value 是 (label, idx) 元组或直接是 idx
 
772
  if 0 <= option_idx < len(session.raw_solve_options):
773
  opt = session.raw_solve_options[option_idx]
774
  if opt.get("available"):
775
+ LOGGER.debug(
776
+ "on_option_select uid=%s option=%s requires_coords=True valid_coords=%s",
777
+ _uid_for_log(uid),
778
+ option_idx,
779
+ _is_valid_coords_text(coords_str),
780
+ )
781
  if _is_valid_coords_text(coords_str):
782
  return coords_str, gr.update(interactive=True)
783
  return "please click the keypoint selection image", gr.update(interactive=True)
784
 
785
+ LOGGER.debug("on_option_select uid=%s option=%s requires_coords=False", _uid_for_log(uid), option_idx)
786
  return default_msg, gr.update(interactive=False)
787
 
788
 
 
795
 
796
  session = get_session(uid)
797
  if not session:
798
+ LOGGER.warning("on_reference_action: missing session uid=%s", _uid_for_log(uid))
799
  return (
800
  None,
801
  gr.update(),
 
803
  format_log_markdown("Session Error"),
804
  )
805
 
806
+ LOGGER.info("on_reference_action uid=%s env=%s", _uid_for_log(uid), getattr(session, "env_id", None))
807
  current_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
808
 
809
  try:
810
  reference = session.get_reference_action()
811
  except Exception as exc:
812
+ LOGGER.exception("on_reference_action failed uid=%s", _uid_for_log(uid))
813
  return (
814
  current_img,
815
  gr.update(),
 
844
  updated_img = draw_marker(current_img, x, y)
845
  coords_text = f"{x}, {y}"
846
  log_text = f"Ground Truth Action: {option_label}. {option_action} | coords: {coords_text}"
847
+ LOGGER.debug(
848
+ "on_reference_action resolved uid=%s option_idx=%s need_coords=%s coords=%s",
849
+ _uid_for_log(uid),
850
+ option_idx,
851
+ need_coords,
852
+ coords_xy,
853
+ )
854
 
855
  return (
856
  updated_img,
 
870
  Returns:
871
  初始化后的UI状态
872
  """
 
873
  _ = request # Query params are intentionally ignored in session-based mode.
874
  try:
875
+ LOGGER.info("init_app: creating session")
876
  uid = create_session()
877
+ LOGGER.info("init_app: created uid=%s", _uid_for_log(uid))
878
  result = init_session_and_load_task(uid)
879
+ LOGGER.debug("init_app: init_session_and_load_task returned %s outputs", len(result))
880
  return result
881
  except Exception as e:
882
+ LOGGER.exception("init_app exception")
 
883
  # Return a safe fallback that hides the loading overlay and shows error
884
  return _task_load_failed_response("", f"Initialization error: {e}")
885
 
 
894
 
895
  session = get_session(uid)
896
  if not session:
897
+ LOGGER.error("precheck_execute_inputs: missing session uid=%s", _uid_for_log(uid))
898
  raise gr.Error("Session Error")
899
 
900
  parsed_option_idx = option_idx
 
902
  _, parsed_option_idx = option_idx
903
 
904
  if parsed_option_idx is None:
905
+ LOGGER.debug("precheck_execute_inputs uid=%s missing option", _uid_for_log(uid))
906
  raise gr.Error("Error: No action selected")
907
 
908
  needs_coords = False
 
914
  needs_coords = bool(opt.get("available"))
915
 
916
  if needs_coords and not _is_valid_coords_text(coords_str):
917
+ LOGGER.debug(
918
+ "precheck_execute_inputs uid=%s option=%s requires_coords but coords invalid: %s",
919
+ _uid_for_log(uid),
920
+ parsed_option_idx,
921
+ coords_str,
922
+ )
923
  raise gr.Error("please click the keypoint selection image before execute!")
924
+ LOGGER.debug(
925
+ "precheck_execute_inputs passed uid=%s option=%s needs_coords=%s",
926
+ _uid_for_log(uid),
927
+ parsed_option_idx,
928
+ needs_coords,
929
+ )
930
 
931
 
932
  def execute_step(uid, option_idx, coords_str):
933
+ LOGGER.info(
934
+ "execute_step start uid=%s option=%s coords=%s",
935
+ _uid_for_log(uid),
936
+ option_idx,
937
+ coords_str,
938
+ )
939
  # 检查session是否超时(在更新活动时间之前检查)
940
  last_activity = get_session_activity(uid)
941
  if last_activity is not None:
942
  elapsed = time.time() - last_activity
943
  if elapsed > SESSION_TIMEOUT:
944
+ LOGGER.warning(
945
+ "execute_step timeout uid=%s elapsed=%.2fs timeout=%ss",
946
+ _uid_for_log(uid),
947
+ elapsed,
948
+ SESSION_TIMEOUT,
949
+ )
950
  raise gr.Error(f"Session已超时:超过 {SESSION_TIMEOUT} 秒未活动。请刷新页面重新登录。")
951
 
952
  # 更新session的最后活动时间
 
954
 
955
  session = get_session(uid)
956
  if not session:
957
+ LOGGER.error("execute_step missing session uid=%s", _uid_for_log(uid))
958
  return None, format_log_markdown("Session Error"), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=False)
959
 
960
  # 检查 execute 次数限制(在执行前检查,如果达到限制则模拟失败状态)
 
969
  current_count = get_execute_count(uid, session.env_id, session.episode_idx)
970
  if current_count >= max_execute:
971
  execute_limit_reached = True
972
+ LOGGER.debug(
973
+ "execute limit check uid=%s env=%s ep=%s current=%s max=%s reached=%s",
974
+ _uid_for_log(uid),
975
+ session.env_id,
976
+ session.episode_idx,
977
+ current_count,
978
+ max_execute,
979
+ execute_limit_reached,
980
+ )
981
 
982
  # Ensure at least one cached frame exists for timer-based refresh.
983
  if not session.base_frames:
984
+ LOGGER.debug("execute_step uid=%s base_frames empty; triggering update_observation", _uid_for_log(uid))
985
  session.update_observation(use_segmentation=USE_SEGMENTED_VIEW)
986
 
987
  if option_idx is None:
988
+ LOGGER.debug("execute_step uid=%s aborted: option_idx is None", _uid_for_log(uid))
989
  return session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW), format_log_markdown("Error: No action selected"), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
990
 
991
  # 检查当前选项是否需要坐标
 
1012
 
1013
  # 如果需要坐标但没有有效坐标,返回错误提示
1014
  if not is_valid_coords:
1015
+ LOGGER.debug(
1016
+ "execute_step uid=%s option=%s missing valid coords, coords_str=%s",
1017
+ _uid_for_log(uid),
1018
+ option_idx,
1019
+ coords_str,
1020
+ )
1021
  current_img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
1022
  error_msg = "please click the keypoint selection image before execute!"
1023
  return current_img, format_log_markdown(error_msg), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
 
1053
  # 增加 execute 计数(因为这也算一次尝试)
1054
  if uid and session.env_id is not None and session.episode_idx is not None:
1055
  new_count = increment_execute_count(uid, session.env_id, session.episode_idx)
1056
+ LOGGER.warning(
1057
+ "execute limit reached uid=%s env=%s ep=%s count=%s",
1058
+ _uid_for_log(uid),
1059
+ session.env_id,
1060
+ session.episode_idx,
1061
+ new_count,
1062
+ )
1063
  else:
1064
  # 正常执行
1065
  # 异常处理:所有异常(ScrewPlanFailure 和其他执行错误)都会显示弹窗通知
1066
+ LOGGER.info(
1067
+ "executing action uid=%s env=%s ep=%s option=%s coords=%s",
1068
+ _uid_for_log(uid),
1069
+ getattr(session, "env_id", None),
1070
+ getattr(session, "episode_idx", None),
1071
+ option_idx,
1072
+ click_coords,
1073
+ )
1074
  try:
1075
  img, status, done = session.execute_action(option_idx, click_coords)
1076
  except ScrewPlanFailureError as e:
 
1083
  done = False
1084
  # 继续正常返回流程
1085
  img = current_img
1086
+ LOGGER.warning("execute_step screw_plan_failure uid=%s error=%s", _uid_for_log(uid), error_message)
1087
  except RuntimeError as e:
1088
  # 捕获所有其他执行错误,显示弹窗通知
1089
  error_message = str(e)
 
1094
  done = False
1095
  # 继续正常返回流程
1096
  img = current_img
1097
+ LOGGER.warning("execute_step runtime_error uid=%s error=%s", _uid_for_log(uid), error_message)
1098
 
1099
  # 增加 execute 计数(无论成功或失败都计数,因为用户已经执行了一次操作)
1100
  if uid and session.env_id is not None and session.episode_idx is not None:
1101
  new_count = increment_execute_count(uid, session.env_id, session.episode_idx)
1102
+ LOGGER.debug(
1103
+ "execute count incremented uid=%s env=%s ep=%s count=%s",
1104
+ _uid_for_log(uid),
1105
+ session.env_id,
1106
+ session.episode_idx,
1107
+ new_count,
1108
+ )
1109
 
1110
  # Execute frames are produced in batch when execute_action returns from worker process.
1111
  # Enqueue them now, then wait briefly for the 0.1s timer to drain FIFO playback.
1112
  _enqueue_live_obs_frames(uid, getattr(session, "base_frames", None))
1113
  _wait_for_live_obs_queue_drain(uid)
1114
+ LOGGER.debug("execute_step playback drain complete uid=%s", _uid_for_log(uid))
1115
 
1116
  # 注意:执行阶段画面由 live_obs 的 0.1s 轮询刷新。
1117
 
 
1147
  completed_count = user_status.get("completed_count", 0)
1148
  task_update = f"{session.env_id} (Episode {session.episode_idx})"
1149
  progress_update = f"Completed: {completed_count}"
1150
+ LOGGER.info(
1151
+ "task complete uid=%s env=%s ep=%s final=%s completed_count=%s",
1152
+ _uid_for_log(uid),
1153
+ session.env_id,
1154
+ session.episode_idx,
1155
+ final_log_status,
1156
+ completed_count,
1157
+ )
1158
 
1159
  # 根据视图模式重新获取图片
1160
  img = session.get_pil_image(use_segmented=USE_SEGMENTED_VIEW)
 
1165
 
1166
  # 格式化日志消息为 HTML 格式(支持颜色显示)
1167
  formatted_status = format_log_markdown(status)
1168
+ LOGGER.debug(
1169
+ "execute_step done uid=%s env=%s ep=%s done=%s exec_btn_interactive=%s",
1170
+ _uid_for_log(uid),
1171
+ getattr(session, "env_id", None),
1172
+ getattr(session, "episode_idx", None),
1173
+ done,
1174
+ not done,
1175
+ )
1176
 
1177
  return (
1178
  img,
gradio-web/main.py CHANGED
@@ -1,12 +1,11 @@
1
  """Main entry for Gradio app (single-instance mode for Hugging Face Spaces)."""
2
 
 
3
  import os
 
4
  import tempfile
5
  from pathlib import Path
6
 
7
- from ui_layout import create_ui_blocks
8
- from state_manager import start_timeout_monitor
9
-
10
  APP_DIR = Path(__file__).resolve().parent
11
  PROJECT_ROOT = APP_DIR.parent
12
  VIDEOS_DIR = APP_DIR / "videos"
@@ -14,10 +13,55 @@ TEMP_DEMOS_DIR = PROJECT_ROOT / "temp_demos"
14
  CWD_TEMP_DEMOS_DIR = Path.cwd() / "temp_demos"
15
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def ensure_media_dirs():
18
  """Ensure media temp directories exist before first write."""
19
  TEMP_DEMOS_DIR.mkdir(parents=True, exist_ok=True)
20
  CWD_TEMP_DEMOS_DIR.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
21
 
22
 
23
  def build_allowed_paths():
@@ -38,21 +82,36 @@ def build_allowed_paths():
38
  if normalized not in seen:
39
  seen.add(normalized)
40
  deduped.append(normalized)
 
41
  return deduped
42
 
43
 
44
  def main():
 
 
 
 
45
  ensure_media_dirs()
46
  start_timeout_monitor()
47
 
48
  os.environ.setdefault("ROBOMME_TEMP_DEMOS_DIR", str(TEMP_DEMOS_DIR))
49
  allowed_paths = build_allowed_paths()
 
 
 
 
 
 
 
50
 
51
  demo = create_ui_blocks()
52
  demo.launch(
53
  server_name="0.0.0.0",
54
- server_port=int(os.getenv("PORT", "7860")),
55
  allowed_paths=allowed_paths,
 
 
 
56
  )
57
 
58
 
 
1
  """Main entry for Gradio app (single-instance mode for Hugging Face Spaces)."""
2
 
3
+ import logging
4
  import os
5
+ import sys
6
  import tempfile
7
  from pathlib import Path
8
 
 
 
 
9
  APP_DIR = Path(__file__).resolve().parent
10
  PROJECT_ROOT = APP_DIR.parent
11
  VIDEOS_DIR = APP_DIR / "videos"
 
13
  CWD_TEMP_DEMOS_DIR = Path.cwd() / "temp_demos"
14
 
15
 
16
+ def setup_logging() -> logging.Logger:
17
+ """Configure structured logging for Spaces runtime."""
18
+ level_name = "DEBUG"
19
+ os.environ["LOG_LEVEL"] = level_name
20
+ level = logging.DEBUG
21
+ try:
22
+ sys.stdout.reconfigure(line_buffering=True)
23
+ sys.stderr.reconfigure(line_buffering=True)
24
+ except Exception:
25
+ pass
26
+ logging.basicConfig(
27
+ level=level,
28
+ format=(
29
+ "%(asctime)s | %(levelname)s | %(name)s | "
30
+ "pid=%(process)d tid=%(threadName)s | %(message)s"
31
+ ),
32
+ stream=sys.stdout,
33
+ force=True,
34
+ )
35
+ for noisy_logger in [
36
+ "asyncio",
37
+ "httpx",
38
+ "httpcore",
39
+ "urllib3",
40
+ "matplotlib",
41
+ "PIL",
42
+ "h5py",
43
+ "trimesh",
44
+ "toppra",
45
+ ]:
46
+ logging.getLogger(noisy_logger).setLevel(logging.WARNING)
47
+ logging.getLogger("robomme").setLevel(logging.DEBUG)
48
+ logger = logging.getLogger("robomme.main")
49
+ logger.info("Logging initialized with LOG_LEVEL=%s", level_name)
50
+ return logger
51
+
52
+
53
+ LOGGER = setup_logging()
54
+
55
+
56
  def ensure_media_dirs():
57
  """Ensure media temp directories exist before first write."""
58
  TEMP_DEMOS_DIR.mkdir(parents=True, exist_ok=True)
59
  CWD_TEMP_DEMOS_DIR.mkdir(parents=True, exist_ok=True)
60
+ LOGGER.debug(
61
+ "Ensured media dirs: temp_demos=%s cwd_temp_demos=%s",
62
+ TEMP_DEMOS_DIR,
63
+ CWD_TEMP_DEMOS_DIR,
64
+ )
65
 
66
 
67
  def build_allowed_paths():
 
82
  if normalized not in seen:
83
  seen.add(normalized)
84
  deduped.append(normalized)
85
+ LOGGER.debug("Allowed paths resolved (%d): %s", len(deduped), deduped)
86
  return deduped
87
 
88
 
89
  def main():
90
+ from state_manager import start_timeout_monitor
91
+ from ui_layout import create_ui_blocks
92
+
93
+ LOGGER.info("Starting Gradio real environment entrypoint: %s", __file__)
94
  ensure_media_dirs()
95
  start_timeout_monitor()
96
 
97
  os.environ.setdefault("ROBOMME_TEMP_DEMOS_DIR", str(TEMP_DEMOS_DIR))
98
  allowed_paths = build_allowed_paths()
99
+ server_port = int(os.getenv("PORT", "7860"))
100
+ LOGGER.info(
101
+ "Launching UI with server_name=%s server_port=%s ROBOMME_TEMP_DEMOS_DIR=%s",
102
+ "0.0.0.0",
103
+ server_port,
104
+ os.environ.get("ROBOMME_TEMP_DEMOS_DIR"),
105
+ )
106
 
107
  demo = create_ui_blocks()
108
  demo.launch(
109
  server_name="0.0.0.0",
110
+ server_port=server_port,
111
  allowed_paths=allowed_paths,
112
+ debug=True,
113
+ show_error=True,
114
+ quiet=False,
115
  )
116
 
117
 
gradio-web/process_session.py CHANGED
@@ -10,12 +10,11 @@
10
  3. 进程间通信:通过 multiprocessing.Queue 进行命令和结果的传递
11
  4. 视频帧同步:工作进程产生的新帧通过 stream_queue 推送到主进程,由后台线程同步到代理的本地缓存
12
  """
 
13
  import multiprocessing
14
  import queue
15
  import threading
16
  import time
17
- import traceback
18
- import numpy as np
19
  import sys
20
  import os
21
 
@@ -46,6 +45,35 @@ CMD_GET_PIL_IMAGE = "get_pil_image"
46
  CMD_EXECUTE_ACTION = "execute_action"
47
  CMD_GET_REFERENCE_ACTION = "get_reference_action"
48
  CMD_CLOSE = "close"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  def _sanitize_options(options):
51
  """
@@ -92,10 +120,17 @@ def session_worker_loop(cmd_queue, result_queue, stream_queue, dataset_root, gui
92
  dataset_root: 数据集根目录路径
93
  gui_render: 是否使用GUI渲染模式
94
  """
 
95
  session = None
96
  try:
 
 
 
 
 
97
  session = OracleSession(dataset_root=dataset_root, gui_render=gui_render)
98
  session.stream_frame_callback = lambda frames: stream_queue.put({"base": frames, "wrist": []})
 
99
 
100
  while True:
101
  try:
@@ -107,15 +142,23 @@ def session_worker_loop(cmd_queue, result_queue, stream_queue, dataset_root, gui
107
  cmd = cmd_data["cmd"]
108
  args = cmd_data.get("args", [])
109
  kwargs = cmd_data.get("kwargs", {})
 
110
 
111
  if cmd == CMD_CLOSE:
112
  if session:
113
  session.close()
 
114
  break
115
 
116
  elif cmd == CMD_LOAD_EPISODE:
117
  # 加载环境episode
118
  res = session.load_episode(*args, **kwargs)
 
 
 
 
 
 
119
 
120
  # 更新帧索引跟踪(用于增量同步)
121
  session.last_base_frame_idx = len(session.base_frames)
@@ -148,12 +191,20 @@ def session_worker_loop(cmd_queue, result_queue, stream_queue, dataset_root, gui
148
  # 执行动作(重计算任务)
149
  try:
150
  res = session.execute_action(*args, **kwargs)
 
 
 
 
 
 
151
  except ScrewPlanFailure as e:
152
  # 捕获 ScrewPlanFailure 并作为特殊状态传递到主进程,用于显示弹窗
 
153
  result_queue.put({"status": "screw_plan_failure", "message": str(e)})
154
  continue
155
  except Exception as e:
156
  # 捕获所有其他异常并传递到主进程,用于显示弹窗
 
157
  result_queue.put({"status": "execution_error", "message": str(e)})
158
  continue
159
 
@@ -175,6 +226,11 @@ def session_worker_loop(cmd_queue, result_queue, stream_queue, dataset_root, gui
175
  # 如果有新帧,推送到流队列
176
  if new_base or new_wrist:
177
  stream_queue.put({"base": new_base, "wrist": new_wrist})
 
 
 
 
 
178
 
179
  # 获取演示状态(从 DemonstrationWrapper 获取)
180
  is_demonstration = False
@@ -209,6 +265,11 @@ def session_worker_loop(cmd_queue, result_queue, stream_queue, dataset_root, gui
209
  # 如果有新帧,推送到流队列
210
  if new_base or new_wrist:
211
  stream_queue.put({"base": new_base, "wrist": new_wrist})
 
 
 
 
 
212
 
213
  # 获取演示状态(从 DemonstrationWrapper 获取)
214
  is_demonstration = False
@@ -226,13 +287,15 @@ def session_worker_loop(cmd_queue, result_queue, stream_queue, dataset_root, gui
226
 
227
  elif cmd == CMD_GET_REFERENCE_ACTION:
228
  res = session.get_reference_action(*args, **kwargs)
 
229
  result_queue.put({"status": "success", "result": res})
230
 
231
  else:
 
232
  result_queue.put({"status": "error", "message": f"Unknown command: {cmd}"})
233
 
234
  except Exception as e:
235
- traceback.print_exc()
236
  result_queue.put({"status": "fatal", "message": str(e)})
237
 
238
 
@@ -273,6 +336,12 @@ class ProcessSessionProxy:
273
  daemon=True
274
  )
275
  self.process.start()
 
 
 
 
 
 
276
 
277
  # 本地状态缓存(从工作进程同步)
278
  self.env_id = None
@@ -317,6 +386,7 @@ class ProcessSessionProxy:
317
  except queue.Empty:
318
  continue
319
  except Exception:
 
320
  break
321
 
322
  def _send_cmd(self, cmd, *args, **kwargs):
@@ -336,10 +406,26 @@ class ProcessSessionProxy:
336
  TimeoutError: 工作进程超时(600秒)
337
  """
338
  # 发送命令到工作进程
 
 
 
 
 
 
 
 
339
  self.cmd_queue.put({"cmd": cmd, "args": args, "kwargs": kwargs})
340
  try:
341
  # 等待结果(重任务如加载/执行可能需要较长时间,设置600秒超时)
342
  res = self.result_queue.get(timeout=600)
 
 
 
 
 
 
 
 
343
 
344
  # 检查错误状态并转换为异常,以便在 gradio_callbacks 中捕获并显示弹窗
345
  if res.get("status") == "screw_plan_failure":
@@ -366,6 +452,7 @@ class ProcessSessionProxy:
366
 
367
  return res.get("result")
368
  except queue.Empty:
 
369
  raise TimeoutError("工作进程超时")
370
 
371
  def load_episode(self, env_id, episode_idx):
@@ -440,9 +527,13 @@ class ProcessSessionProxy:
440
  self.stop_sync = True
441
  try:
442
  self.cmd_queue.put({"cmd": CMD_CLOSE})
443
- except:
 
 
444
  pass
445
  # 等待工作进程优雅退出
446
  self.process.join(timeout=1)
447
  if self.process.is_alive():
 
448
  self.process.terminate()
 
 
10
  3. 进程间通信:通过 multiprocessing.Queue 进行命令和结果的传递
11
  4. 视频帧同步:工作进程产生的新帧通过 stream_queue 推送到主进程,由后台线程同步到代理的本地缓存
12
  """
13
+ import logging
14
  import multiprocessing
15
  import queue
16
  import threading
17
  import time
 
 
18
  import sys
19
  import os
20
 
 
45
  CMD_EXECUTE_ACTION = "execute_action"
46
  CMD_GET_REFERENCE_ACTION = "get_reference_action"
47
  CMD_CLOSE = "close"
48
+ LOGGER = logging.getLogger("robomme.process_session")
49
+
50
+
51
+ def _setup_worker_logging():
52
+ level_name = os.getenv("LOG_LEVEL", "DEBUG").upper()
53
+ level = getattr(logging, level_name, logging.DEBUG)
54
+ logging.basicConfig(
55
+ level=level,
56
+ format=(
57
+ "%(asctime)s | %(levelname)s | %(name)s | "
58
+ "pid=%(process)d tid=%(threadName)s | %(message)s"
59
+ ),
60
+ stream=sys.stdout,
61
+ force=True,
62
+ )
63
+ for noisy_logger in [
64
+ "asyncio",
65
+ "httpx",
66
+ "httpcore",
67
+ "urllib3",
68
+ "matplotlib",
69
+ "PIL",
70
+ "h5py",
71
+ "trimesh",
72
+ "toppra",
73
+ ]:
74
+ logging.getLogger(noisy_logger).setLevel(logging.WARNING)
75
+ logging.getLogger("robomme").setLevel(logging.DEBUG)
76
+ LOGGER.debug("worker logging initialized level=%s", level_name)
77
 
78
  def _sanitize_options(options):
79
  """
 
120
  dataset_root: 数据集根目录路径
121
  gui_render: 是否使用GUI渲染模式
122
  """
123
+ _setup_worker_logging()
124
  session = None
125
  try:
126
+ LOGGER.info(
127
+ "worker loop starting dataset_root=%s gui_render=%s",
128
+ dataset_root,
129
+ gui_render,
130
+ )
131
  session = OracleSession(dataset_root=dataset_root, gui_render=gui_render)
132
  session.stream_frame_callback = lambda frames: stream_queue.put({"base": frames, "wrist": []})
133
+ LOGGER.info("worker OracleSession initialized")
134
 
135
  while True:
136
  try:
 
142
  cmd = cmd_data["cmd"]
143
  args = cmd_data.get("args", [])
144
  kwargs = cmd_data.get("kwargs", {})
145
+ LOGGER.debug("worker received cmd=%s args=%s kwargs_keys=%s", cmd, len(args), list(kwargs.keys()))
146
 
147
  if cmd == CMD_CLOSE:
148
  if session:
149
  session.close()
150
+ LOGGER.info("worker received close command, exiting")
151
  break
152
 
153
  elif cmd == CMD_LOAD_EPISODE:
154
  # 加载环境episode
155
  res = session.load_episode(*args, **kwargs)
156
+ LOGGER.info(
157
+ "worker load_episode env=%s episode=%s result_msg=%s",
158
+ getattr(session, "env_id", None),
159
+ getattr(session, "episode_idx", None),
160
+ res[1] if isinstance(res, tuple) and len(res) > 1 else None,
161
+ )
162
 
163
  # 更新帧索引跟踪(用于增量同步)
164
  session.last_base_frame_idx = len(session.base_frames)
 
191
  # 执行动作(重计算任务)
192
  try:
193
  res = session.execute_action(*args, **kwargs)
194
+ LOGGER.info(
195
+ "worker execute_action done env=%s episode=%s done=%s",
196
+ getattr(session, "env_id", None),
197
+ getattr(session, "episode_idx", None),
198
+ res[2] if isinstance(res, tuple) and len(res) > 2 else None,
199
+ )
200
  except ScrewPlanFailure as e:
201
  # 捕获 ScrewPlanFailure 并作为特殊状态传递到主进程,用于显示弹窗
202
+ LOGGER.warning("worker screw_plan_failure: %s", e)
203
  result_queue.put({"status": "screw_plan_failure", "message": str(e)})
204
  continue
205
  except Exception as e:
206
  # 捕获所有其他异常并传递到主进程,用于显示弹窗
207
+ LOGGER.exception("worker execution_error")
208
  result_queue.put({"status": "execution_error", "message": str(e)})
209
  continue
210
 
 
226
  # 如果有新帧,推送到流队列
227
  if new_base or new_wrist:
228
  stream_queue.put({"base": new_base, "wrist": new_wrist})
229
+ LOGGER.debug(
230
+ "worker execute_action streamed frames base=%s wrist=%s",
231
+ len(new_base),
232
+ len(new_wrist),
233
+ )
234
 
235
  # 获取演示状态(从 DemonstrationWrapper 获取)
236
  is_demonstration = False
 
265
  # 如果有新帧,推送到流队列
266
  if new_base or new_wrist:
267
  stream_queue.put({"base": new_base, "wrist": new_wrist})
268
+ LOGGER.debug(
269
+ "worker update_observation streamed frames base=%s wrist=%s",
270
+ len(new_base),
271
+ len(new_wrist),
272
+ )
273
 
274
  # 获取演示状态(从 DemonstrationWrapper 获取)
275
  is_demonstration = False
 
287
 
288
  elif cmd == CMD_GET_REFERENCE_ACTION:
289
  res = session.get_reference_action(*args, **kwargs)
290
+ LOGGER.debug("worker get_reference_action ok=%s", bool(res.get("ok")) if isinstance(res, dict) else None)
291
  result_queue.put({"status": "success", "result": res})
292
 
293
  else:
294
+ LOGGER.error("worker unknown command=%s", cmd)
295
  result_queue.put({"status": "error", "message": f"Unknown command: {cmd}"})
296
 
297
  except Exception as e:
298
+ LOGGER.exception("worker fatal error")
299
  result_queue.put({"status": "fatal", "message": str(e)})
300
 
301
 
 
336
  daemon=True
337
  )
338
  self.process.start()
339
+ LOGGER.info(
340
+ "ProcessSessionProxy started worker pid=%s dataset_root=%s gui_render=%s",
341
+ self.process.pid,
342
+ dataset_root,
343
+ gui_render,
344
+ )
345
 
346
  # 本地状态缓存(从工作进程同步)
347
  self.env_id = None
 
386
  except queue.Empty:
387
  continue
388
  except Exception:
389
+ LOGGER.exception("ProcessSessionProxy sync loop crashed")
390
  break
391
 
392
  def _send_cmd(self, cmd, *args, **kwargs):
 
406
  TimeoutError: 工作进程超时(600秒)
407
  """
408
  # 发送命令到工作进程
409
+ start_ts = time.time()
410
+ LOGGER.debug(
411
+ "proxy send cmd=%s pid=%s args=%s kwargs_keys=%s",
412
+ cmd,
413
+ self.process.pid,
414
+ len(args),
415
+ list(kwargs.keys()),
416
+ )
417
  self.cmd_queue.put({"cmd": cmd, "args": args, "kwargs": kwargs})
418
  try:
419
  # 等待结果(重任务如加载/执行可能需要较长时间,设置600秒超时)
420
  res = self.result_queue.get(timeout=600)
421
+ elapsed_ms = int((time.time() - start_ts) * 1000)
422
+ LOGGER.debug(
423
+ "proxy recv cmd=%s pid=%s status=%s elapsed_ms=%s",
424
+ cmd,
425
+ self.process.pid,
426
+ res.get("status"),
427
+ elapsed_ms,
428
+ )
429
 
430
  # 检查错误状态并转换为异常,以便在 gradio_callbacks 中捕获并显示弹窗
431
  if res.get("status") == "screw_plan_failure":
 
452
 
453
  return res.get("result")
454
  except queue.Empty:
455
+ LOGGER.error("proxy command timeout cmd=%s pid=%s", cmd, self.process.pid)
456
  raise TimeoutError("工作进程超时")
457
 
458
  def load_episode(self, env_id, episode_idx):
 
527
  self.stop_sync = True
528
  try:
529
  self.cmd_queue.put({"cmd": CMD_CLOSE})
530
+ LOGGER.debug("proxy close command sent pid=%s", self.process.pid)
531
+ except Exception:
532
+ LOGGER.exception("proxy failed to send close command pid=%s", self.process.pid)
533
  pass
534
  # 等待工作进程优雅退出
535
  self.process.join(timeout=1)
536
  if self.process.is_alive():
537
+ LOGGER.warning("proxy worker still alive after join; terminating pid=%s", self.process.pid)
538
  self.process.terminate()
539
+ LOGGER.info("ProcessSessionProxy closed pid=%s", self.process.pid)
gradio-web/state_manager.py CHANGED
@@ -12,11 +12,12 @@
12
  实际的 OracleSession 运行在独立的工作进程中,通过代理对象进行通信。
13
  当同一用户第二次登录时,系统会自动清理旧会话的所有资源(进程、RAM、VRAM、状态数据等)。
14
  """
 
15
  import uuid
16
  import threading
17
- import traceback
18
  import time
19
  from process_session import ProcessSessionProxy
 
20
 
21
  # --- 全局会话存储 ---
22
  # 存储所有用户的 ProcessSessionProxy 实例
@@ -88,6 +89,7 @@ def create_session():
88
  with _state_lock:
89
  GLOBAL_SESSIONS[uid] = session
90
  SESSION_LAST_ACTIVITY[uid] = time.time()
 
91
  return uid
92
 
93
 
@@ -289,22 +291,21 @@ def cleanup_session(uid):
289
  session = GLOBAL_SESSIONS.get(uid)
290
  if session:
291
  try:
292
- print(f"Cleaning up session {uid}: closing ProcessSessionProxy...")
293
  session.close()
294
- print(f"Session {uid}: ProcessSessionProxy closed successfully")
295
  except Exception as e:
296
- print(f"Error closing ProcessSessionProxy for {uid}: {e}")
297
- traceback.print_exc()
298
 
299
  # 2. 从 GLOBAL_SESSIONS 中移除
300
  if uid in GLOBAL_SESSIONS:
301
  del GLOBAL_SESSIONS[uid]
302
- print(f"Session {uid}: removed from GLOBAL_SESSIONS")
303
 
304
  # 3. 清理任务索引
305
  if uid in TASK_INDEX_MAP:
306
  del TASK_INDEX_MAP[uid]
307
- print(f"Session {uid}: task index cleaned up")
308
 
309
  # 4. 清理UI阶段
310
  if uid in UI_PHASE_MAP:
@@ -313,22 +314,22 @@ def cleanup_session(uid):
313
  # 清理播放按钮状态
314
  if uid in PLAY_BUTTON_CLICKED:
315
  del PLAY_BUTTON_CLICKED[uid]
316
- print(f"Session {uid}: UI phase cleaned up")
317
 
318
  # 5. 清理活动时间跟踪
319
  if uid in SESSION_LAST_ACTIVITY:
320
  del SESSION_LAST_ACTIVITY[uid]
321
- print(f"Session {uid}: last activity time cleaned up")
322
 
323
  # 6. 清理超时警告标志
324
  if uid in SESSION_TIMEOUT_WARNED:
325
  del SESSION_TIMEOUT_WARNED[uid]
326
- print(f"Session {uid}: timeout warning flag cleaned up")
327
 
328
  # 注意:不清理 EXECUTE_COUNTS,因为它是按任务跟踪的,不是按 session 跟踪的
329
  # 如果需要清理,应该在任务切换时调用 reset_execute_count
330
 
331
- print(f"Session {uid}: all resources cleaned up successfully")
332
 
333
 
334
  def update_session_activity(uid):
@@ -343,6 +344,7 @@ def update_session_activity(uid):
343
  # 如果之前被警告过,清除警告标志
344
  if uid in SESSION_TIMEOUT_WARNED:
345
  del SESSION_TIMEOUT_WARNED[uid]
 
346
 
347
 
348
  def get_session_activity(uid):
@@ -396,14 +398,18 @@ def check_and_cleanup_timeout_sessions():
396
  with _state_lock:
397
  SESSION_TIMEOUT_WARNED[uid] = True
398
  timeout_sessions.append(uid)
399
- print(f"Session {uid}: 超时警告 - 已超过 {SESSION_TIMEOUT} 秒未活动")
400
  elif elapsed > SESSION_TIMEOUT + 5:
401
  # 已警告且再等5秒仍未活动,标记为需要清理
402
  warned_sessions_to_cleanup.append(uid)
403
 
404
  # 清理超时的session
405
  for uid in warned_sessions_to_cleanup:
406
- print(f"Session {uid}: 超时清理 - 已超过 {SESSION_TIMEOUT + 5} 秒未活动,开始清理资源")
 
 
 
 
407
  cleanup_session(uid)
408
  # cleanup_session内部会清理SESSION_LAST_ACTIVITY和SESSION_TIMEOUT_WARNED
409
 
@@ -424,8 +430,7 @@ def _timeout_monitor_loop():
424
  try:
425
  check_and_cleanup_timeout_sessions()
426
  except Exception as e:
427
- print(f"Error in timeout monitor loop: {e}")
428
- traceback.print_exc()
429
 
430
  # 每5秒检查一次
431
  for _ in range(50): # 5秒 = 50 * 0.1秒
@@ -443,7 +448,7 @@ def start_timeout_monitor():
443
 
444
  with _timeout_monitor_lock:
445
  if _timeout_monitor_running:
446
- print("Timeout monitor is already running")
447
  return
448
 
449
  _timeout_monitor_running = True
@@ -453,7 +458,7 @@ def start_timeout_monitor():
453
  name="SessionTimeoutMonitor"
454
  )
455
  _timeout_monitor_thread.start()
456
- print("Session timeout monitor started")
457
 
458
 
459
  def stop_timeout_monitor():
@@ -470,4 +475,4 @@ def stop_timeout_monitor():
470
  _timeout_monitor_running = False
471
  if _timeout_monitor_thread:
472
  _timeout_monitor_thread.join(timeout=2.0)
473
- print("Session timeout monitor stopped")
 
12
  实际的 OracleSession 运行在独立的工作进程中,通过代理对象进行通信。
13
  当同一用户第二次登录时,系统会自动清理旧会话的所有资源(进程、RAM、VRAM、状态数据等)。
14
  """
15
+ import logging
16
  import uuid
17
  import threading
 
18
  import time
19
  from process_session import ProcessSessionProxy
20
+ LOGGER = logging.getLogger("robomme.state_manager")
21
 
22
  # --- 全局会话存储 ---
23
  # 存储所有用户的 ProcessSessionProxy 实例
 
89
  with _state_lock:
90
  GLOBAL_SESSIONS[uid] = session
91
  SESSION_LAST_ACTIVITY[uid] = time.time()
92
+ LOGGER.info("create_session uid=%s total_sessions=%s", uid, len(GLOBAL_SESSIONS))
93
  return uid
94
 
95
 
 
291
  session = GLOBAL_SESSIONS.get(uid)
292
  if session:
293
  try:
294
+ LOGGER.info("cleanup_session uid=%s closing ProcessSessionProxy", uid)
295
  session.close()
296
+ LOGGER.info("cleanup_session uid=%s proxy closed", uid)
297
  except Exception as e:
298
+ LOGGER.exception("cleanup_session uid=%s proxy close failed: %s", uid, e)
 
299
 
300
  # 2. 从 GLOBAL_SESSIONS 中移除
301
  if uid in GLOBAL_SESSIONS:
302
  del GLOBAL_SESSIONS[uid]
303
+ LOGGER.debug("cleanup_session uid=%s removed from GLOBAL_SESSIONS", uid)
304
 
305
  # 3. 清理任务索引
306
  if uid in TASK_INDEX_MAP:
307
  del TASK_INDEX_MAP[uid]
308
+ LOGGER.debug("cleanup_session uid=%s task index cleaned", uid)
309
 
310
  # 4. 清理UI阶段
311
  if uid in UI_PHASE_MAP:
 
314
  # 清理播放按钮状态
315
  if uid in PLAY_BUTTON_CLICKED:
316
  del PLAY_BUTTON_CLICKED[uid]
317
+ LOGGER.debug("cleanup_session uid=%s play button state cleaned", uid)
318
 
319
  # 5. 清理活动时间跟踪
320
  if uid in SESSION_LAST_ACTIVITY:
321
  del SESSION_LAST_ACTIVITY[uid]
322
+ LOGGER.debug("cleanup_session uid=%s last activity cleaned", uid)
323
 
324
  # 6. 清理超时警告标志
325
  if uid in SESSION_TIMEOUT_WARNED:
326
  del SESSION_TIMEOUT_WARNED[uid]
327
+ LOGGER.debug("cleanup_session uid=%s timeout warning flag cleaned", uid)
328
 
329
  # 注意:不清理 EXECUTE_COUNTS,因为它是按任务跟踪的,不是按 session 跟踪的
330
  # 如果需要清理,应该在任务切换时调用 reset_execute_count
331
 
332
+ LOGGER.info("cleanup_session uid=%s done", uid)
333
 
334
 
335
  def update_session_activity(uid):
 
344
  # 如果之前被警告过,清除警告标志
345
  if uid in SESSION_TIMEOUT_WARNED:
346
  del SESSION_TIMEOUT_WARNED[uid]
347
+ LOGGER.debug("update_session_activity uid=%s cleared timeout warning", uid)
348
 
349
 
350
  def get_session_activity(uid):
 
398
  with _state_lock:
399
  SESSION_TIMEOUT_WARNED[uid] = True
400
  timeout_sessions.append(uid)
401
+ LOGGER.warning("session timeout warning uid=%s elapsed=%.2fs limit=%ss", uid, elapsed, SESSION_TIMEOUT)
402
  elif elapsed > SESSION_TIMEOUT + 5:
403
  # 已警告且再等5秒仍未活动,标记为需要清理
404
  warned_sessions_to_cleanup.append(uid)
405
 
406
  # 清理超时的session
407
  for uid in warned_sessions_to_cleanup:
408
+ LOGGER.warning(
409
+ "session timeout cleanup uid=%s elapsed_limit=%ss",
410
+ uid,
411
+ SESSION_TIMEOUT + 5,
412
+ )
413
  cleanup_session(uid)
414
  # cleanup_session内部会清理SESSION_LAST_ACTIVITY和SESSION_TIMEOUT_WARNED
415
 
 
430
  try:
431
  check_and_cleanup_timeout_sessions()
432
  except Exception as e:
433
+ LOGGER.exception("timeout monitor loop error: %s", e)
 
434
 
435
  # 每5秒检查一次
436
  for _ in range(50): # 5秒 = 50 * 0.1秒
 
448
 
449
  with _timeout_monitor_lock:
450
  if _timeout_monitor_running:
451
+ LOGGER.info("timeout monitor already running")
452
  return
453
 
454
  _timeout_monitor_running = True
 
458
  name="SessionTimeoutMonitor"
459
  )
460
  _timeout_monitor_thread.start()
461
+ LOGGER.info("session timeout monitor started")
462
 
463
 
464
  def stop_timeout_monitor():
 
475
  _timeout_monitor_running = False
476
  if _timeout_monitor_thread:
477
  _timeout_monitor_thread.join(timeout=2.0)
478
+ LOGGER.info("session timeout monitor stopped")