Rename

app.py CHANGED
@@ -129,18 +129,18 @@ class AppState:
 
 
 def init_video_session(
-    GLOBAL_STATE: AppState, video: str | dict, active_tab: str = "point_box"
+    state: AppState, video: str | dict, active_tab: str = "point_box"
 ) -> tuple[AppState, int, int, Image.Image, str]:
-    GLOBAL_STATE.video_frames = []
-    GLOBAL_STATE.masks_by_frame = {}
-    GLOBAL_STATE.color_by_obj = {}
-    GLOBAL_STATE.color_by_prompt = {}
-    GLOBAL_STATE.text_prompts_by_frame_obj = {}
-    GLOBAL_STATE.clicks_by_frame_obj = {}
-    GLOBAL_STATE.boxes_by_frame_obj = {}
-    GLOBAL_STATE.composited_frames = {}
-    GLOBAL_STATE.inference_session = None
-    GLOBAL_STATE.active_tab = active_tab
+    state.video_frames = []
+    state.masks_by_frame = {}
+    state.color_by_obj = {}
+    state.color_by_prompt = {}
+    state.text_prompts_by_frame_obj = {}
+    state.clicks_by_frame_obj = {}
+    state.boxes_by_frame_obj = {}
+    state.composited_frames = {}
+    state.inference_session = None
+    state.active_tab = active_tab
 
     video_path: str | None = None
     if isinstance(video, dict):
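The hunk above threads the app state through `init_video_session` explicitly: the function now receives a `state` argument and re-initializes its per-session fields instead of mutating a module-level `GLOBAL_STATE`. The `AppState` definition itself sits above this diff, so the sketch below only infers its shape from the assignments here and the attribute reads later in the file; the field types and the `num_frames` property are assumptions:

```python
from dataclasses import dataclass, field
from typing import Any

from PIL import Image


@dataclass
class AppState:
    # Sketch only: fields inferred from the assignments in init_video_session
    # and attribute reads elsewhere in this diff; the real class may differ.
    video_frames: list[Image.Image] = field(default_factory=list)
    video_fps: float | None = None
    masks_by_frame: dict[int, dict[int, Any]] = field(default_factory=dict)
    color_by_obj: dict[int, tuple[int, int, int]] = field(default_factory=dict)
    color_by_prompt: dict[str, tuple[int, int, int]] = field(default_factory=dict)
    text_prompts_by_frame_obj: dict[int, dict[int, str]] = field(default_factory=dict)
    clicks_by_frame_obj: dict[int, dict[int, list]] = field(default_factory=dict)
    boxes_by_frame_obj: dict[int, dict[int, list]] = field(default_factory=dict)
    composited_frames: dict[int, Image.Image] = field(default_factory=dict)
    inference_session: Any = None
    active_tab: str = "point_box"
    current_frame_idx: int = 0

    @property
    def num_frames(self) -> int:
        # state.num_frames is read throughout the diff; a derived count fits that usage.
        return len(self.video_frames)
```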
@@ -165,14 +165,14 @@ def init_video_session(
         trimmed_note = f" (trimmed to {int(MAX_SECONDS)}s = {len(frames)} frames)"
     if isinstance(info, dict):
         info["num_frames"] = len(frames)
-    GLOBAL_STATE.video_frames = frames
-    GLOBAL_STATE.video_fps = float(fps_in) if fps_in else None
+    state.video_frames = frames
+    state.video_fps = float(fps_in) if fps_in else None
 
     raw_video = [np.array(frame) for frame in frames]
 
     if active_tab == "text":
         processor = TEXT_VIDEO_PROCESSOR
-        GLOBAL_STATE.inference_session = processor.init_video_session(
+        state.inference_session = processor.init_video_session(
             video=frames,
             inference_device=DEVICE,
             inference_state_device=DEVICE,
@@ -182,7 +182,7 @@ def init_video_session(
         )
     else:
         processor = TRACKER_PROCESSOR
-        GLOBAL_STATE.inference_session = processor.init_video_session(
+        state.inference_session = processor.init_video_session(
             video=raw_video,
             inference_device=DEVICE,
             inference_state_device=DEVICE,
@@ -195,15 +195,15 @@ def init_video_session(
     max_idx = len(frames) - 1
     if active_tab == "text":
         status = (
-            f"Loaded {len(frames)} frames @ {GLOBAL_STATE.video_fps or 'unknown'} fps{trimmed_note}. "
+            f"Loaded {len(frames)} frames @ {state.video_fps or 'unknown'} fps{trimmed_note}. "
             f"Device: {DEVICE}, dtype: bfloat16. Ready for text prompting."
         )
     else:
         status = (
-            f"Loaded {len(frames)} frames @ {GLOBAL_STATE.video_fps or 'unknown'} fps{trimmed_note}. "
+            f"Loaded {len(frames)} frames @ {state.video_fps or 'unknown'} fps{trimmed_note}. "
             f"Device: {DEVICE}, dtype: bfloat16. Video session initialized."
         )
-    return GLOBAL_STATE, 0, max_idx, first_frame, status
+    return state, 0, max_idx, first_frame, status
 
 
 def compose_frame(state: AppState, frame_idx: int) -> Image.Image:
@@ -596,24 +596,24 @@ def _get_active_prompts_display(state: AppState) -> str:
     return "**Active prompts:** None"
 
 
-def propagate_masks(GLOBAL_STATE: AppState) -> Iterator[tuple[AppState, str, dict]]:
-    if GLOBAL_STATE is None:
-        return GLOBAL_STATE, "Load a video first.", gr.update()
+def propagate_masks(state: AppState) -> Iterator[tuple[AppState, str, dict]]:
+    if state is None:
+        return state, "Load a video first.", gr.update()
 
-    if GLOBAL_STATE.active_tab != "text" and GLOBAL_STATE.inference_session is None:
-        return GLOBAL_STATE, "Load a video first.", gr.update()
+    if state.active_tab != "text" and state.inference_session is None:
+        return state, "Load a video first.", gr.update()
 
-    total = max(1, GLOBAL_STATE.num_frames)
+    total = max(1, state.num_frames)
     processed = 0
 
-    yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update()
+    yield state, f"Propagating masks: {processed}/{total}", gr.update()
 
     last_frame_idx = 0
 
     with torch.no_grad():
-        if GLOBAL_STATE.active_tab == "text":
-            if GLOBAL_STATE.inference_session is None:
-                yield GLOBAL_STATE, "Text video model not loaded.", gr.update()
+        if state.active_tab == "text":
+            if state.inference_session is None:
+                yield state, "Text video model not loaded.", gr.update()
                 return
 
             model = TEXT_VIDEO_MODEL
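`propagate_masks` is a generator: every `yield` hands Gradio an intermediate `(state, status, slider_update)` triple, which is how the UI shows live progress during a long propagation run. A minimal, self-contained sketch of the same streaming pattern, with illustrative component names that are not from this app:

```python
import time

import gradio as gr


def long_job(n: int):
    # Generator handler: Gradio re-renders the output on every yield,
    # so the textbox shows live progress instead of only the final string.
    for i in range(1, int(n) + 1):
        time.sleep(0.1)  # stand-in for propagating masks through one frame
        yield f"Propagating masks: {i}/{int(n)}"
    yield f"Propagated masks across {int(n)} frames."


with gr.Blocks() as demo:
    frames = gr.Slider(1, 100, value=10, step=1, label="Frames")
    status = gr.Textbox(label="Status")
    gr.Button("Run").click(long_job, inputs=frames, outputs=status)

if __name__ == "__main__":
    demo.queue().launch()  # older Gradio versions need the explicit queue for streaming
```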
@@ -621,7 +621,7 @@ def propagate_masks(GLOBAL_STATE: AppState) -> Iterator[tuple[AppState, str, dict]]:
 
             # Collect all unique prompts from existing frame annotations
             text_prompt_to_obj_ids = {}
-            for frame_idx, frame_texts in GLOBAL_STATE.text_prompts_by_frame_obj.items():
+            for frame_idx, frame_texts in state.text_prompts_by_frame_obj.items():
                 for obj_id, text_prompt in frame_texts.items():
                     if text_prompt not in text_prompt_to_obj_ids:
                         text_prompt_to_obj_ids[text_prompt] = []
@@ -629,8 +629,8 @@ def propagate_masks(GLOBAL_STATE: AppState) -> Iterator[tuple[AppState, str, dict]]:
                     text_prompt_to_obj_ids[text_prompt].append(obj_id)
 
             # Also check if there are prompts already in the inference session
-            if hasattr(GLOBAL_STATE.inference_session, "prompts") and GLOBAL_STATE.inference_session.prompts:
-                for prompt_text in GLOBAL_STATE.inference_session.prompts.values():
+            if hasattr(state.inference_session, "prompts") and state.inference_session.prompts:
+                for prompt_text in state.inference_session.prompts.values():
                     if prompt_text not in text_prompt_to_obj_ids:
                         text_prompt_to_obj_ids[prompt_text] = []
 
@@ -638,31 +638,29 @@ def propagate_masks(GLOBAL_STATE: AppState) -> Iterator[tuple[AppState, str, dict]]:
                 text_prompt_to_obj_ids[text_prompt].sort()
 
             if not text_prompt_to_obj_ids:
-                yield GLOBAL_STATE, "No text prompts found. Please add a text prompt first.", gr.update()
+                yield state, "No text prompts found. Please add a text prompt first.", gr.update()
                 return
 
             # Add all prompts to the inference session (processor handles deduplication)
             for text_prompt in text_prompt_to_obj_ids:
-                GLOBAL_STATE.inference_session = processor.add_text_prompt(
-                    inference_session=GLOBAL_STATE.inference_session,
+                state.inference_session = processor.add_text_prompt(
+                    inference_session=state.inference_session,
                     text=text_prompt,
                 )
 
-            earliest_frame = (
-                min(GLOBAL_STATE.text_prompts_by_frame_obj.keys()) if GLOBAL_STATE.text_prompts_by_frame_obj else 0
-            )
+            earliest_frame = min(state.text_prompts_by_frame_obj.keys()) if state.text_prompts_by_frame_obj else 0
 
-            frames_to_track = GLOBAL_STATE.num_frames - earliest_frame
+            frames_to_track = state.num_frames - earliest_frame
 
             outputs_per_frame = {}
 
             for model_outputs in model.propagate_in_video_iterator(
-                inference_session=GLOBAL_STATE.inference_session,
+                inference_session=state.inference_session,
                 start_frame_idx=earliest_frame,
                 max_frame_num_to_track=frames_to_track,
             ):
                 processed_outputs = processor.postprocess_outputs(
-                    GLOBAL_STATE.inference_session,
+                    state.inference_session,
                     model_outputs,
                 )
                 frame_idx = model_outputs.frame_idx
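Stripped of the UI bookkeeping, the text branch registers every collected prompt and then streams per-frame outputs. A condensed sketch of that control flow, using the same processor/model calls the hunk above uses (`add_text_prompt`, `propagate_in_video_iterator`, `postprocess_outputs`); the wrapper function itself is hypothetical:

```python
import torch


def propagate_text_prompts(processor, model, session, prompts, start_frame, n_frames):
    """Hypothetical wrapper condensing the loop above; yields one processed output per frame."""
    with torch.no_grad():
        # The processor deduplicates prompts, so re-adding is safe.
        for text in prompts:
            session = processor.add_text_prompt(inference_session=session, text=text)
        for model_outputs in model.propagate_in_video_iterator(
            inference_session=session,
            start_frame_idx=start_frame,
            max_frame_num_to_track=n_frames,
        ):
            # Each processed output carries object_ids, masks, scores, prompt_to_obj_ids.
            yield processor.postprocess_outputs(session, model_outputs)
```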
@@ -673,8 +671,8 @@ def propagate_masks(GLOBAL_STATE: AppState) -> Iterator[tuple[AppState, str, dict]]:
                 scores = processed_outputs["scores"]
                 prompt_to_obj_ids = processed_outputs.get("prompt_to_obj_ids", {})
 
-                masks_for_frame = GLOBAL_STATE.masks_by_frame.setdefault(frame_idx, {})
-                frame_texts = GLOBAL_STATE.text_prompts_by_frame_obj.setdefault(frame_idx, {})
+                masks_for_frame = state.masks_by_frame.setdefault(frame_idx, {})
+                frame_texts = state.text_prompts_by_frame_obj.setdefault(frame_idx, {})
 
                 num_objects = len(object_ids)
                 if num_objects > 0:
@@ -701,137 +699,131 @@ def propagate_masks(GLOBAL_STATE: AppState) -> Iterator[tuple[AppState, str, dict]]:
                     # Store prompt and assign color
                     if found_prompt:
                         frame_texts[current_obj_id] = found_prompt.strip()
-                        _ensure_color_for_obj(GLOBAL_STATE, current_obj_id)
+                        _ensure_color_for_obj(state, current_obj_id)
 
-                GLOBAL_STATE.composited_frames.pop(frame_idx, None)
+                state.composited_frames.pop(frame_idx, None)
                 last_frame_idx = frame_idx
                 processed += 1
                 if processed % 30 == 0 or processed == total:
-                    yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
+                    yield state, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
         else:
-            if GLOBAL_STATE.inference_session is None:
-                yield GLOBAL_STATE, "Tracker model not loaded.", gr.update()
+            if state.inference_session is None:
+                yield state, "Tracker model not loaded.", gr.update()
                 return
 
             model = TRACKER_MODEL
            processor = TRACKER_PROCESSOR
 
-            for sam2_video_output in model.propagate_in_video_iterator(
-                inference_session=GLOBAL_STATE.inference_session
-            ):
+            for sam2_video_output in model.propagate_in_video_iterator(inference_session=state.inference_session):
                 video_res_masks = processor.post_process_masks(
                     [sam2_video_output.pred_masks],
-                    original_sizes=[
-                        [GLOBAL_STATE.inference_session.video_height, GLOBAL_STATE.inference_session.video_width]
-                    ],
+                    original_sizes=[[state.inference_session.video_height, state.inference_session.video_width]],
                 )[0]
 
                 frame_idx = sam2_video_output.frame_idx
-                for i, out_obj_id in enumerate(GLOBAL_STATE.inference_session.obj_ids):
-                    _ensure_color_for_obj(GLOBAL_STATE, int(out_obj_id))
+                for i, out_obj_id in enumerate(state.inference_session.obj_ids):
+                    _ensure_color_for_obj(state, int(out_obj_id))
                     mask_2d = video_res_masks[i].cpu().numpy()
-                    masks_for_frame = GLOBAL_STATE.masks_by_frame.setdefault(frame_idx, {})
+                    masks_for_frame = state.masks_by_frame.setdefault(frame_idx, {})
                     masks_for_frame[int(out_obj_id)] = mask_2d
-                    GLOBAL_STATE.composited_frames.pop(frame_idx, None)
+                    state.composited_frames.pop(frame_idx, None)
 
                 last_frame_idx = frame_idx
                 processed += 1
                 if processed % 30 == 0 or processed == total:
-                    yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
+                    yield state, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
 
     text = f"Propagated masks across {processed} frames."
-    yield GLOBAL_STATE, text, gr.update(value=last_frame_idx)
+    yield state, text, gr.update(value=last_frame_idx)
 
 
-def reset_prompts(GLOBAL_STATE: AppState) -> tuple[AppState, Image.Image, str, str]:
+def reset_prompts(state: AppState) -> tuple[AppState, Image.Image, str, str]:
     """Reset prompts and all outputs, but keep processed frames and cached vision features."""
-    if GLOBAL_STATE is None or GLOBAL_STATE.inference_session is None:
-        active_prompts = _get_active_prompts_display(GLOBAL_STATE)
-        return GLOBAL_STATE, None, "No active session to reset.", active_prompts
+    if state is None or state.inference_session is None:
+        active_prompts = _get_active_prompts_display(state)
+        return state, None, "No active session to reset.", active_prompts
 
-    if GLOBAL_STATE.active_tab != "text":
-        active_prompts = _get_active_prompts_display(GLOBAL_STATE)
-        return GLOBAL_STATE, None, "Reset prompts is only available for text prompting mode.", active_prompts
+    if state.active_tab != "text":
+        active_prompts = _get_active_prompts_display(state)
+        return state, None, "Reset prompts is only available for text prompting mode.", active_prompts
 
     # Reset inference session tracking data but keep cache and processed frames
-    if hasattr(GLOBAL_STATE.inference_session, "reset_tracking_data"):
-        GLOBAL_STATE.inference_session.reset_tracking_data()
+    if hasattr(state.inference_session, "reset_tracking_data"):
+        state.inference_session.reset_tracking_data()
 
     # Manually clear prompts (reset_tracking_data doesn't clear prompts themselves)
-    if hasattr(GLOBAL_STATE.inference_session, "prompts"):
-        GLOBAL_STATE.inference_session.prompts.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "prompt_input_ids"):
-        GLOBAL_STATE.inference_session.prompt_input_ids.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "prompt_embeddings"):
-        GLOBAL_STATE.inference_session.prompt_embeddings.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "prompt_attention_masks"):
-        GLOBAL_STATE.inference_session.prompt_attention_masks.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "obj_id_to_prompt_id"):
-        GLOBAL_STATE.inference_session.obj_id_to_prompt_id.clear()
+    if hasattr(state.inference_session, "prompts"):
+        state.inference_session.prompts.clear()
+    if hasattr(state.inference_session, "prompt_input_ids"):
+        state.inference_session.prompt_input_ids.clear()
+    if hasattr(state.inference_session, "prompt_embeddings"):
+        state.inference_session.prompt_embeddings.clear()
+    if hasattr(state.inference_session, "prompt_attention_masks"):
+        state.inference_session.prompt_attention_masks.clear()
+    if hasattr(state.inference_session, "obj_id_to_prompt_id"):
+        state.inference_session.obj_id_to_prompt_id.clear()
 
     # Reset detection-tracking fusion state
-    if hasattr(GLOBAL_STATE.inference_session, "obj_id_to_score"):
-        GLOBAL_STATE.inference_session.obj_id_to_score.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "obj_id_to_tracker_score_frame_wise"):
-        GLOBAL_STATE.inference_session.obj_id_to_tracker_score_frame_wise.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "obj_id_to_last_occluded"):
-        GLOBAL_STATE.inference_session.obj_id_to_last_occluded.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "max_obj_id"):
-        GLOBAL_STATE.inference_session.max_obj_id = -1
-    if hasattr(GLOBAL_STATE.inference_session, "obj_first_frame_idx"):
-        GLOBAL_STATE.inference_session.obj_first_frame_idx.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "unmatched_frame_inds"):
-        GLOBAL_STATE.inference_session.unmatched_frame_inds.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "overlap_pair_to_frame_inds"):
-        GLOBAL_STATE.inference_session.overlap_pair_to_frame_inds.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "trk_keep_alive"):
-        GLOBAL_STATE.inference_session.trk_keep_alive.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "removed_obj_ids"):
-        GLOBAL_STATE.inference_session.removed_obj_ids.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "suppressed_obj_ids"):
-        GLOBAL_STATE.inference_session.suppressed_obj_ids.clear()
-    if hasattr(GLOBAL_STATE.inference_session, "hotstart_removed_obj_ids"):
-        GLOBAL_STATE.inference_session.hotstart_removed_obj_ids.clear()
+    if hasattr(state.inference_session, "obj_id_to_score"):
+        state.inference_session.obj_id_to_score.clear()
+    if hasattr(state.inference_session, "obj_id_to_tracker_score_frame_wise"):
+        state.inference_session.obj_id_to_tracker_score_frame_wise.clear()
+    if hasattr(state.inference_session, "obj_id_to_last_occluded"):
+        state.inference_session.obj_id_to_last_occluded.clear()
+    if hasattr(state.inference_session, "max_obj_id"):
+        state.inference_session.max_obj_id = -1
+    if hasattr(state.inference_session, "obj_first_frame_idx"):
+        state.inference_session.obj_first_frame_idx.clear()
+    if hasattr(state.inference_session, "unmatched_frame_inds"):
+        state.inference_session.unmatched_frame_inds.clear()
+    if hasattr(state.inference_session, "overlap_pair_to_frame_inds"):
+        state.inference_session.overlap_pair_to_frame_inds.clear()
+    if hasattr(state.inference_session, "trk_keep_alive"):
+        state.inference_session.trk_keep_alive.clear()
+    if hasattr(state.inference_session, "removed_obj_ids"):
+        state.inference_session.removed_obj_ids.clear()
+    if hasattr(state.inference_session, "suppressed_obj_ids"):
+        state.inference_session.suppressed_obj_ids.clear()
+    if hasattr(state.inference_session, "hotstart_removed_obj_ids"):
+        state.inference_session.hotstart_removed_obj_ids.clear()
 
     # Clear all app state outputs
-    GLOBAL_STATE.masks_by_frame.clear()
-    GLOBAL_STATE.text_prompts_by_frame_obj.clear()
-    GLOBAL_STATE.composited_frames.clear()
-    GLOBAL_STATE.color_by_obj.clear()
-    GLOBAL_STATE.color_by_prompt.clear()
+    state.masks_by_frame.clear()
+    state.text_prompts_by_frame_obj.clear()
+    state.composited_frames.clear()
+    state.color_by_obj.clear()
+    state.color_by_prompt.clear()
 
     # Update display
-    current_idx = int(getattr(GLOBAL_STATE, "current_frame_idx", 0))
-    current_idx = max(0, min(current_idx, GLOBAL_STATE.num_frames - 1))
-    preview_img = update_frame_display(GLOBAL_STATE, current_idx)
-    active_prompts = _get_active_prompts_display(GLOBAL_STATE)
+    current_idx = int(getattr(state, "current_frame_idx", 0))
+    current_idx = max(0, min(current_idx, state.num_frames - 1))
+    preview_img = update_frame_display(state, current_idx)
+    active_prompts = _get_active_prompts_display(state)
     status = "Prompts and outputs reset. Processed frames and cached vision features preserved."
 
-    return GLOBAL_STATE, preview_img, status, active_prompts
+    return state, preview_img, status, active_prompts
 
 
-def reset_session(GLOBAL_STATE: AppState) -> tuple[AppState, Image.Image, int, int, str, str]:
-    if not GLOBAL_STATE.video_frames:
-        return GLOBAL_STATE, None, 0, 0, "Session reset. Load a new video.", "**Active prompts:** None"
+def reset_session(state: AppState) -> tuple[AppState, Image.Image, int, int, str, str]:
+    if not state.video_frames:
+        return state, None, 0, 0, "Session reset. Load a new video.", "**Active prompts:** None"
 
-    if GLOBAL_STATE.active_tab == "text":
-        if GLOBAL_STATE.video_frames:
+    if state.active_tab == "text":
+        if state.video_frames:
             processor = TEXT_VIDEO_PROCESSOR
-            GLOBAL_STATE.inference_session = processor.init_video_session(
-                video=GLOBAL_STATE.video_frames,
+            state.inference_session = processor.init_video_session(
+                video=state.video_frames,
                 inference_device=DEVICE,
                 processing_device="cpu",
                 video_storage_device="cpu",
                 dtype=DTYPE,
             )
-        elif GLOBAL_STATE.inference_session is not None and hasattr(
-            GLOBAL_STATE.inference_session, "reset_inference_session"
-        ):
-            GLOBAL_STATE.inference_session.reset_inference_session()
-    elif GLOBAL_STATE.video_frames:
+        elif state.inference_session is not None and hasattr(state.inference_session, "reset_inference_session"):
+            state.inference_session.reset_inference_session()
+    elif state.video_frames:
         processor = TRACKER_PROCESSOR
-        raw_video = [np.array(frame) for frame in GLOBAL_STATE.video_frames]
-        GLOBAL_STATE.inference_session = processor.init_video_session(
+        raw_video = [np.array(frame) for frame in state.video_frames]
+        state.inference_session = processor.init_video_session(
             video=raw_video,
             inference_device=DEVICE,
             video_storage_device="cpu",
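Every clear in `reset_prompts` is wrapped in `hasattr`, so the reset degrades gracefully when a given inference-session implementation lacks some of these attributes. The repetition could be collapsed with a small helper; a sketch (the helper is hypothetical, and it only covers the container attributes with `.clear()`, not scalars like `max_obj_id`):

```python
def _clear_session_attrs(session, names: tuple[str, ...]) -> None:
    # Hypothetical helper mirroring the hasattr-guarded blocks above:
    # clear each container attribute the session actually has.
    for name in names:
        if hasattr(session, name):
            getattr(session, name).clear()


# Usage sketch against the attribute names cleared in reset_prompts:
# _clear_session_attrs(state.inference_session, (
#     "prompts", "prompt_input_ids", "prompt_embeddings",
#     "prompt_attention_masks", "obj_id_to_prompt_id",
# ))
```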
@@ -839,44 +831,44 @@ def reset_session(GLOBAL_STATE: AppState) -> tuple[AppState, Image.Image, int, int, str, str]:
             dtype=DTYPE,
         )
 
-    GLOBAL_STATE.masks_by_frame.clear()
-    GLOBAL_STATE.clicks_by_frame_obj.clear()
-    GLOBAL_STATE.boxes_by_frame_obj.clear()
-    GLOBAL_STATE.text_prompts_by_frame_obj.clear()
-    GLOBAL_STATE.composited_frames.clear()
-    GLOBAL_STATE.color_by_obj.clear()
-    GLOBAL_STATE.color_by_prompt.clear()
-    GLOBAL_STATE.pending_box_start = None
-    GLOBAL_STATE.pending_box_start_frame_idx = None
-    GLOBAL_STATE.pending_box_start_obj_id = None
+    state.masks_by_frame.clear()
+    state.clicks_by_frame_obj.clear()
+    state.boxes_by_frame_obj.clear()
+    state.text_prompts_by_frame_obj.clear()
+    state.composited_frames.clear()
+    state.color_by_obj.clear()
+    state.color_by_prompt.clear()
+    state.pending_box_start = None
+    state.pending_box_start_frame_idx = None
+    state.pending_box_start_obj_id = None
 
     gc.collect()
 
-    current_idx = int(getattr(GLOBAL_STATE, "current_frame_idx", 0))
-    current_idx = max(0, min(current_idx, GLOBAL_STATE.num_frames - 1))
-    preview_img = update_frame_display(GLOBAL_STATE, current_idx)
-    slider_minmax = gr.update(minimum=0, maximum=max(GLOBAL_STATE.num_frames - 1, 0), interactive=True)
+    current_idx = int(getattr(state, "current_frame_idx", 0))
+    current_idx = max(0, min(current_idx, state.num_frames - 1))
+    preview_img = update_frame_display(state, current_idx)
+    slider_minmax = gr.update(minimum=0, maximum=max(state.num_frames - 1, 0), interactive=True)
     slider_value = gr.update(value=current_idx)
     status = "Session reset. Prompts cleared; video preserved."
-    active_prompts = _get_active_prompts_display(GLOBAL_STATE)
-    return GLOBAL_STATE, preview_img, slider_minmax, slider_value, status, active_prompts
+    active_prompts = _get_active_prompts_display(state)
+    return state, preview_img, slider_minmax, slider_value, status, active_prompts
 
 
-def _on_video_change_pointbox(GLOBAL_STATE: AppState, video: str | dict) -> tuple[AppState, dict, Image.Image, str]:
-    GLOBAL_STATE, min_idx, max_idx, first_frame, status = init_video_session(GLOBAL_STATE, video, "point_box")
+def _on_video_change_pointbox(state: AppState, video: str | dict) -> tuple[AppState, dict, Image.Image, str]:
+    state, min_idx, max_idx, first_frame, status = init_video_session(state, video, "point_box")
     return (
-        GLOBAL_STATE,
+        state,
         gr.update(minimum=min_idx, maximum=max_idx, value=min_idx, interactive=True),
         first_frame,
        status,
     )
 
 
-def _on_video_change_text(GLOBAL_STATE: AppState, video: str | dict) -> tuple[AppState, dict, Image.Image, str, str]:
-    GLOBAL_STATE, min_idx, max_idx, first_frame, status = init_video_session(GLOBAL_STATE, video, "text")
-    active_prompts = _get_active_prompts_display(GLOBAL_STATE)
+def _on_video_change_text(state: AppState, video: str | dict) -> tuple[AppState, dict, Image.Image, str, str]:
+    state, min_idx, max_idx, first_frame, status = init_video_session(state, video, "text")
+    active_prompts = _get_active_prompts_display(state)
     return (
-        GLOBAL_STATE,
+        state,
         gr.update(minimum=min_idx, maximum=max_idx, value=min_idx, interactive=True),
         first_frame,
         status,
@@ -885,7 +877,7 @@ def _on_video_change_text(GLOBAL_STATE: AppState, video: str | dict) -> tuple[AppState, dict, Image.Image, str, str]:
 
 
 with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")) as demo:
-    GLOBAL_STATE = gr.State(AppState())
+    app_state = gr.State(AppState())
 
     gr.Markdown(
         """
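The one-line change above is the motivation for the whole rename: the value lives in `gr.State`, which Gradio keeps per browser session, so calling it `GLOBAL_STATE` was misleading. Each handler receives the session's `AppState`, mutates or replaces it, and returns it as the first output so the stored copy stays current. A minimal sketch of that round-trip, with illustrative component names:

```python
import gradio as gr


def bump(state: dict) -> tuple[dict, str]:
    # State comes in as an input and goes back out as an output;
    # gr.State keeps a separate copy for each connected browser session.
    state["clicks"] = state.get("clicks", 0) + 1
    return state, f"Clicked {state['clicks']} times in this session"


with gr.Blocks() as demo:
    session_state = gr.State({})  # per-session, despite living at Blocks scope
    label = gr.Textbox(label="Counter")
    gr.Button("Click me").click(bump, inputs=[session_state], outputs=[session_state, label])

if __name__ == "__main__":
    demo.launch()
```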
@@ -953,9 +945,9 @@ with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")) as demo:
             with gr.Row():
                 gr.Examples(
                     examples=examples_list_text,
-                    inputs=[GLOBAL_STATE, video_in_text],
+                    inputs=[app_state, video_in_text],
                     fn=_on_video_change_text,
-                    outputs=[GLOBAL_STATE, frame_slider_text, preview_text, load_status_text, active_prompts_display],
+                    outputs=[app_state, frame_slider_text, preview_text, load_status_text, active_prompts_display],
                     label="Examples",
                     cache_examples=False,
                     examples_per_page=5,
@@ -1016,9 +1008,9 @@ with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")) as demo:
             with gr.Row():
                 gr.Examples(
                     examples=examples_list_pointbox,
-                    inputs=[GLOBAL_STATE, video_in_pointbox],
+                    inputs=[app_state, video_in_pointbox],
                     fn=_on_video_change_pointbox,
-                    outputs=[GLOBAL_STATE, frame_slider_pointbox, preview_pointbox, load_status_pointbox],
+                    outputs=[app_state, frame_slider_pointbox, preview_pointbox, load_status_pointbox],
                     label="Examples",
                     cache_examples=False,
                     examples_per_page=5,
@@ -1026,8 +1018,8 @@ with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")) as demo:
 
     video_in_pointbox.change(
         _on_video_change_pointbox,
-        inputs=[GLOBAL_STATE, video_in_pointbox],
-        outputs=[GLOBAL_STATE, frame_slider_pointbox, preview_pointbox, load_status_pointbox],
+        inputs=[app_state, video_in_pointbox],
+        outputs=[app_state, frame_slider_pointbox, preview_pointbox, load_status_pointbox],
         show_progress=True,
     )
 
@@ -1038,14 +1030,14 @@ with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")) as demo:
 
     frame_slider_pointbox.change(
         _sync_frame_idx_pointbox,
-        inputs=[GLOBAL_STATE, frame_slider_pointbox],
+        inputs=[app_state, frame_slider_pointbox],
         outputs=preview_pointbox,
     )
 
     video_in_text.change(
         _on_video_change_text,
-        inputs=[GLOBAL_STATE, video_in_text],
-        outputs=[GLOBAL_STATE, frame_slider_text, preview_text, load_status_text, active_prompts_display],
+        inputs=[app_state, video_in_text],
+        outputs=[app_state, frame_slider_text, preview_text, load_status_text, active_prompts_display],
         show_progress=True,
     )
 
@@ -1056,7 +1048,7 @@ with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")) as demo:
 
     frame_slider_text.change(
         _sync_frame_idx_text,
-        inputs=[GLOBAL_STATE, frame_slider_text],
+        inputs=[app_state, frame_slider_text],
         outputs=preview_text,
     )
 
@@ -1065,14 +1057,14 @@ with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")) as demo:
             s.current_obj_id = int(oid)
         return gr.update()
 
-    obj_id_inp.change(_sync_obj_id, inputs=[GLOBAL_STATE, obj_id_inp], outputs=[])
+    obj_id_inp.change(_sync_obj_id, inputs=[app_state, obj_id_inp], outputs=[])
 
     def _sync_label(s: AppState, lab: str):
         if s is not None and lab is not None:
             s.current_label = str(lab)
         return gr.update()
 
-    label_radio.change(_sync_label, inputs=[GLOBAL_STATE, label_radio], outputs=[])
+    label_radio.change(_sync_label, inputs=[app_state, label_radio], outputs=[])
 
     def _sync_prompt_type(s: AppState, val: str):
         if s is not None and val is not None:
@@ -1087,13 +1079,13 @@ with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")) as demo:
 
     prompt_type.change(
         _sync_prompt_type,
-        inputs=[GLOBAL_STATE, prompt_type],
+        inputs=[app_state, prompt_type],
         outputs=[label_radio, clear_old_chk],
     )
 
     preview_pointbox.select(
         on_image_click,
-        [preview_pointbox, GLOBAL_STATE, frame_slider_pointbox, obj_id_inp, label_radio, clear_old_chk],
+        [preview_pointbox, app_state, frame_slider_pointbox, obj_id_inp, label_radio, clear_old_chk],
         preview_pointbox,
     )
 
@@ -1103,14 +1095,14 @@ with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")) as demo:
 
     text_apply_btn.click(
         _on_text_apply,
-        inputs=[GLOBAL_STATE, frame_slider_text, text_prompt_input],
+        inputs=[app_state, frame_slider_text, text_prompt_input],
         outputs=[preview_text, text_status, active_prompts_display],
     )
 
     reset_prompts_btn.click(
         reset_prompts,
-        inputs=[GLOBAL_STATE],
-        outputs=[GLOBAL_STATE, preview_text, text_status, active_prompts_display],
+        inputs=[app_state],
+        outputs=[app_state, preview_text, text_status, active_prompts_display],
     )
 
     def _render_video(s: AppState):
@@ -1139,32 +1131,32 @@ with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")) as demo:
             print(f"Failed to render video with cv2: {e}")
             raise gr.Error(f"Failed to render video: {e}")
 
-    render_btn_pointbox.click(_render_video, inputs=[GLOBAL_STATE], outputs=[playback_video_pointbox])
-    render_btn_text.click(_render_video, inputs=[GLOBAL_STATE], outputs=[playback_video_text])
+    render_btn_pointbox.click(_render_video, inputs=[app_state], outputs=[playback_video_pointbox])
+    render_btn_text.click(_render_video, inputs=[app_state], outputs=[playback_video_text])
 
     propagate_btn_pointbox.click(
         propagate_masks,
-        inputs=[GLOBAL_STATE],
-        outputs=[GLOBAL_STATE, propagate_status_pointbox, frame_slider_pointbox],
+        inputs=[app_state],
+        outputs=[app_state, propagate_status_pointbox, frame_slider_pointbox],
     )
 
     propagate_btn_text.click(
         propagate_masks,
-        inputs=[GLOBAL_STATE],
-        outputs=[GLOBAL_STATE, propagate_status_text, frame_slider_text],
+        inputs=[app_state],
+        outputs=[app_state, propagate_status_text, frame_slider_text],
     )
 
     reset_btn_pointbox.click(
         reset_session,
-        inputs=GLOBAL_STATE,
-        outputs=[GLOBAL_STATE, preview_pointbox, frame_slider_pointbox, frame_slider_pointbox, load_status_pointbox],
+        inputs=app_state,
+        outputs=[app_state, preview_pointbox, frame_slider_pointbox, frame_slider_pointbox, load_status_pointbox],
     )
 
     reset_btn_text.click(
         reset_session,
-        inputs=GLOBAL_STATE,
+        inputs=app_state,
         outputs=[
-            GLOBAL_STATE,
+            app_state,
             preview_text,
             frame_slider_text,
             frame_slider_text,
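Two wiring details in this final hunk are easy to miss. First, `inputs=app_state` passes a single component where the other bindings pass a list; Gradio accepts either form. Second, `frame_slider_pointbox` (and `frame_slider_text`) appears twice in `reset_session`'s outputs on purpose: the handler returns two separate updates for the same slider, one setting its min/max range and one setting its value.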