Spaces:

dwellbot
/

dwellbot_stream3r

Configuration error

App Files Files Community

brian4dwell committed on Sep 17, 2025

Commit

4c075ec

1 Parent(s): 6805b8e

add saving and reloading of session

Browse files

Files changed (4) hide show

app.py +229 -50
configs/stream_session.json +3 -0
stream3r/stream_session.py +33 -4
tests/test_stream_session_cache.py +17 -0

app.py CHANGED Viewed

@@ -64,7 +64,13 @@ def extract_images_from_zip(zip_path: str, outdir: str) -> list[str]:
             if ext not in ALLOWED_IMG_EXT:
                 continue
             # Construct final path safely
-            dest_path = os.path.join(outdir, os.path.basename(name))
             # Zip-slip guard (in case filename has ../ etc.)
             if not _is_within_dir(outdir, dest_path):
                 continue
@@ -74,19 +80,82 @@ def extract_images_from_zip(zip_path: str, outdir: str) -> list[str]:
     return extracted
 # -------------------------------------------------------------------------
 # 1) Core model inference
 # -------------------------------------------------------------------------
 @spaces.GPU(duration=180)   # triggers ZeroGPU allocation for this call
-def run_model(target_dir: str, model: STream3R, mode: str="causal", streaming: bool=False) -> dict:
     """
-    Run the STream3R model on images in the 'target_dir/images' folder and return predictions.
     Args:
         target_dir: Directory containing the images subfolder
         model: STream3R model instance
         mode: Processing mode ("causal", "window", or "full")
         streaming: If True, use StreamSession for sequential processing; if False, use batch processing
     """
     print(f"Processing images from {target_dir}")
@@ -113,6 +182,8 @@ def run_model(target_dir: str, model: STream3R, mode: str="causal", streaming: b
     print(f"Running inference in {'streaming' if streaming else 'batch'} mode...")
     dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
     with torch.no_grad():
         with torch.amp.autocast(dtype=dtype, device_type=device):
             if streaming:
@@ -123,12 +194,34 @@ def run_model(target_dir: str, model: STream3R, mode: str="causal", streaming: b
                 session = StreamSession(model, mode=mode)
-                # Process images one by one to simulate streaming inference
-                for i in range(images.shape[0]):
-                    image = images[i : i + 1]
-                    predictions = session.forward_stream(image)
-                session.clear()
             else:
                 # Use batch processing (original behavior)
                 predictions = model(images, mode=mode)
@@ -153,19 +246,20 @@ def run_model(target_dir: str, model: STream3R, mode: str="causal", streaming: b
     # Clean up
     torch.cuda.empty_cache()
-    return predictions
 # -------------------------------------------------------------------------
 # 2) Handle uploaded video/images --> produce target_dir + images
 # -------------------------------------------------------------------------
-def handle_uploads(input_video, input_images, input_zip=None):
     """
     Create a new 'target_dir' + 'images' subfolder.
     - Copies uploaded images
     - Optionally extracts images from a ZIP
     - Optionally extracts frames from a video (1 fps)
-    Returns (target_dir, image_paths).
     """
     start_time = time.time()
     gc.collect()
@@ -173,11 +267,23 @@ def handle_uploads(input_video, input_images, input_zip=None):
     # Create a unique folder name
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-    target_dir = os.path.join("demo_cache", f"input_images_{timestamp}")
-    target_dir_images = os.path.join(target_dir, "images")
-    if os.path.exists(target_dir):
-        shutil.rmtree(target_dir)
     os.makedirs(target_dir_images, exist_ok=True)
     image_paths: list[str] = []
@@ -186,9 +292,8 @@ def handle_uploads(input_video, input_images, input_zip=None):
     if input_images:
         for file_data in input_images:
             file_path = file_data["name"] if isinstance(file_data, dict) and "name" in file_data else file_data
-            dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
-            shutil.copy(file_path, dst_path)
-            image_paths.append(dst_path)
     # --- Handle ZIP (extract images) ---
     if input_zip:
@@ -203,7 +308,7 @@ def handle_uploads(input_video, input_images, input_zip=None):
         fps = vs.get(cv2.CAP_PROP_FPS) or 30.0
         frame_interval = max(1, int(fps * 1))  # 1 frame/sec
         count = 0
-        video_frame_num = 0
         while True:
             gotit, frame = vs.read()
             if not gotit:
@@ -218,23 +323,44 @@ def handle_uploads(input_video, input_images, input_zip=None):
     image_paths = sorted(set(image_paths))  # de-dupe + sort
     end_time = time.time()
     print(f"Prepared {len(image_paths)} files in {target_dir_images}; took {end_time - start_time:.3f}s")
-    return target_dir, image_paths
 # -------------------------------------------------------------------------
 # 3) Update gallery on upload
 # -------------------------------------------------------------------------
-def update_gallery_on_upload(input_video, input_images, input_zip):
     """
     Handle any new uploads (video, images, or zip) and render preview.
     """
-    if not input_video and not input_images and not input_zip:
-        return None, None, None, None
-    target_dir, image_paths = handle_uploads(input_video, input_images, input_zip)
-    return None, target_dir, image_paths, "Upload complete. Click 'Reconstruct' to begin 3D processing."
@@ -271,12 +397,19 @@ def gradio_demo(
     print("Running run_model...")
     with torch.no_grad():
-        predictions = run_model(target_dir, model, mode=mode, streaming=streaming)
     # Save predictions
     prediction_save_path = os.path.join(target_dir, "predictions.npz")
     np.savez(prediction_save_path, **predictions)
     # Handle None frame_filter
     if frame_filter is None:
         frame_filter = "All"
@@ -310,7 +443,12 @@ def gradio_demo(
     print(f"Total time: {end_time - start_time:.2f} seconds (including IO)")
     log_msg = f"Reconstruction Success ({len(all_files)} frames). Waiting for visualization."
-    return glbfile, log_msg, gr.Dropdown(choices=frame_filter_choices, value=frame_filter, interactive=True)
 # -------------------------------------------------------------------------
@@ -331,7 +469,16 @@ def update_log():
 def update_visualization(
-    target_dir, conf_thres, frame_filter, mask_black_bg, mask_white_bg, show_cam, mask_sky, prediction_mode, is_example
 ):
     """
     Reload saved predictions from npz, create (or reuse) the GLB for new parameters,
@@ -364,9 +511,10 @@ def update_visualization(
     loaded = np.load(predictions_path)
     predictions = {key: np.array(loaded[key]) for key in key_list}
     glbfile = os.path.join(
         target_dir,
-        f"glbscene_{conf_thres}_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_maskb{mask_black_bg}_maskw{mask_white_bg}_cam{show_cam}_sky{mask_sky}_pred{prediction_mode.replace(' ', '_')}_mode{mode}.glb",
     )
     if not os.path.exists(glbfile):
@@ -504,6 +652,7 @@ with gr.Blocks(
             input_video = gr.Video(label="Upload Video", interactive=True)
             input_images = gr.File(file_count="multiple", label="Upload Images", interactive=True)
             input_zip = gr.File(file_types=[".zip"], label="Upload ZIP of Images", interactive=True)
             image_gallery = gr.Gallery(
                 label="Preview",
@@ -521,11 +670,22 @@ with gr.Blocks(
                     "Please upload a video or images, then click Reconstruct.", elem_classes=["custom-log"]
                 )
                 reconstruction_output = gr.Model3D(height=520, zoom_speed=0.5, pan_speed=0.5)
             with gr.Row():
                 submit_btn = gr.Button("Reconstruct", scale=1, variant="primary")
                 clear_btn = gr.ClearButton(
-                    [input_video, input_images, reconstruction_output, log_output, target_dir_output, image_gallery],
                     scale=1,
                 )
@@ -626,13 +786,22 @@ with gr.Blocks(
         3) Return model3D + logs + new_dir + updated dropdown + gallery
         We do NOT return is_example. It's just an input.
         """
-        target_dir, image_paths = handle_uploads(input_video, input_images)
         # Always use "All" for frame_filter in examples
         frame_filter = "All"
-        glbfile, log_msg, dropdown = gradio_demo(
-            target_dir, conf_thres, frame_filter, mask_black_bg, mask_white_bg, show_cam, mask_sky, prediction_mode, mode
         )
-        return glbfile, log_msg, target_dir, dropdown, image_paths
     gr.Markdown("Click any row to load an example.", elem_classes=["example-log"])
@@ -652,7 +821,14 @@ with gr.Blocks(
             is_example,
             mode,
         ],
-        outputs=[reconstruction_output, log_output, target_dir_output, frame_filter, image_gallery],
         fn=example_pipeline,
         cache_examples=False,
         examples_per_page=50,
@@ -681,7 +857,7 @@ with gr.Blocks(
             mode,
             streaming,
         ],
-        outputs=[reconstruction_output, log_output, frame_filter],
     ).then(
         fn=lambda: "False", inputs=[], outputs=[is_example]  # set is_example to "False"
     )
@@ -700,6 +876,7 @@ with gr.Blocks(
             show_cam,
             mask_sky,
             prediction_mode,
             is_example,
         ],
         [reconstruction_output, log_output],
@@ -715,6 +892,7 @@ with gr.Blocks(
             show_cam,
             mask_sky,
             prediction_mode,
             is_example,
         ],
         [reconstruction_output, log_output],
@@ -730,6 +908,7 @@ with gr.Blocks(
             show_cam,
             mask_sky,
             prediction_mode,
             is_example,
         ],
         [reconstruction_output, log_output],
@@ -745,6 +924,7 @@ with gr.Blocks(
             show_cam,
             mask_sky,
             prediction_mode,
             is_example,
         ],
         [reconstruction_output, log_output],
@@ -760,6 +940,7 @@ with gr.Blocks(
             show_cam,
             mask_sky,
             prediction_mode,
             is_example,
         ],
         [reconstruction_output, log_output],
@@ -775,6 +956,7 @@ with gr.Blocks(
             show_cam,
             mask_sky,
             prediction_mode,
             is_example,
         ],
         [reconstruction_output, log_output],
@@ -790,6 +972,7 @@ with gr.Blocks(
             show_cam,
             mask_sky,
             prediction_mode,
             is_example,
         ],
         [reconstruction_output, log_output],
@@ -798,20 +981,16 @@ with gr.Blocks(
     # -------------------------------------------------------------------------
     # Auto-update gallery whenever user uploads or changes their files
     # -------------------------------------------------------------------------
-    input_video.change(
-        fn=update_gallery_on_upload,
-        inputs=[input_video, input_images, input_zip],
-        outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
-    )
-    input_images.change(
-        fn=update_gallery_on_upload,
-        inputs=[input_video, input_images, input_zip],
-        outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
-    )
-    input_zip.change(
         fn=update_gallery_on_upload,
-        inputs=[input_video, input_images, input_zip],
-        outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
     )
     demo.queue(max_size=20).launch(show_error=True, share=False)

             if ext not in ALLOWED_IMG_EXT:
                 continue
             # Construct final path safely
+            base_name = os.path.basename(name)
+            name_root, name_ext = os.path.splitext(base_name)
+            dest_path = os.path.join(outdir, base_name)
+            counter = 1
+            while os.path.exists(dest_path):
+                dest_path = os.path.join(outdir, f"{name_root}_{counter}{name_ext}")
+                counter += 1
             # Zip-slip guard (in case filename has ../ etc.)
             if not _is_within_dir(outdir, dest_path):
                 continue
     return extracted
def extract_session_state(zip_path: str, extract_root: str) -> str:
    """Unpack a saved session archive into a freshly emptied *extract_root*.

    Any existing *extract_root* is deleted first, then recreated, and the
    whole archive at *zip_path* is extracted into it.

    Returns:
        The single top-level directory of the extracted tree when the archive
        unpacked to exactly one directory and no loose files; otherwise
        *extract_root* itself.
    """
    # Start from an empty directory so leftovers from an earlier extraction
    # cannot mix with the restored session.
    if os.path.exists(extract_root):
        shutil.rmtree(extract_root)
    os.makedirs(extract_root, exist_ok=True)

    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extract_root)

    # Classify the top-level entries that were just extracted.
    subdirs: list[str] = []
    has_loose_files = False
    for child in os.listdir(extract_root):
        child_path = os.path.join(extract_root, child)
        if os.path.isdir(child_path):
            subdirs.append(child_path)
        elif os.path.isfile(child_path):
            has_loose_files = True

    # An archive wrapped in one folder: hand back that folder as the session dir.
    if not has_loose_files and len(subdirs) == 1:
        return subdirs[0]
    return extract_root
def package_session_state(target_dir: str) -> str:
    """Create a zip archive containing every file of the session directory.

    The archive is written to ``demo_cache/<name>_session.zip`` (relative to
    the current working directory), where ``<name>`` is the last path
    component of *target_dir*. An existing archive with that name is
    replaced. Inside the archive all files live under a single top-level
    folder called ``<name>``. Only files are stored, so empty directories
    are not recorded.

    Args:
        target_dir: Session directory to package.

    Returns:
        Path of the archive that was written.

    Raises:
        ValueError: If *target_dir* is not an existing directory.
    """
    if not os.path.isdir(target_dir):
        raise ValueError(f"Target directory does not exist: {target_dir}")

    # Normalise once and reuse: os.path.basename("demo_cache/foo/") is "",
    # which would drop the top-level folder inside the archive while the
    # archive file name (built from the normalised path) still said "foo".
    session_name = os.path.basename(os.path.normpath(target_dir))

    os.makedirs("demo_cache", exist_ok=True)
    archive_path = os.path.join("demo_cache", f"{session_name}_session.zip")
    if os.path.exists(archive_path):
        os.remove(archive_path)

    archive_abspath = os.path.abspath(archive_path)
    with zipfile.ZipFile(archive_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(target_dir):
            for fname in files:
                file_path = os.path.join(root, fname)
                # Never add the archive to itself if it sits under target_dir.
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                arcname = os.path.join(session_name, os.path.relpath(file_path, target_dir))
                zf.write(file_path, arcname)
    return archive_path
def _copy_with_unique_name(src_path: str, dst_dir: str) -> str:
    """Copy *src_path* into *dst_dir* without overwriting an existing file.

    The source's own file name is tried first. If that name is taken, the
    suffixes ``_1``, ``_2``, ... are inserted before the extension until an
    unused name is found.

    Returns:
        The path the file was copied to.
    """
    stem, suffix = os.path.splitext(os.path.basename(src_path))
    target = os.path.join(dst_dir, stem + suffix)
    attempt = 0
    while os.path.exists(target):
        attempt += 1
        target = os.path.join(dst_dir, f"{stem}_{attempt}{suffix}")
    shutil.copy(src_path, target)
    return target
 # -------------------------------------------------------------------------
 # 1) Core model inference
 # -------------------------------------------------------------------------
 @spaces.GPU(duration=180)   # triggers ZeroGPU allocation for this call
+def run_model(target_dir: str, model: STream3R, mode: str="causal", streaming: bool=False) -> tuple[dict, str | None]:
     """
+    Run the STream3R model on images in the 'target_dir/images' folder.
     Args:
         target_dir: Directory containing the images subfolder
         model: STream3R model instance
         mode: Processing mode ("causal", "window", or "full")
         streaming: If True, use StreamSession for sequential processing; if False, use batch processing
+    Returns:
+        tuple[dict, str | None]: Predictions dictionary and optional path to the saved session cache when
+        streaming mode is used.
     """
     print(f"Processing images from {target_dir}")
     print(f"Running inference in {'streaming' if streaming else 'batch'} mode...")
     dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
+    session_cache_path: str | None = None
     with torch.no_grad():
         with torch.amp.autocast(dtype=dtype, device_type=device):
             if streaming:
                 session = StreamSession(model, mode=mode)
+                kv_cache_path = os.path.join(target_dir, "kv_cache.pt")
+                if os.path.exists(kv_cache_path):
+                    print(f"Loading existing session cache from {kv_cache_path}")
+                    session.load_cache(kv_cache_path, device=images.device)
+                existing_predictions = session.get_all_predictions()
+                existing_frames = 0
+                for value in existing_predictions.values():
+                    if isinstance(value, torch.Tensor) and value.dim() >= 2:
+                        existing_frames = max(existing_frames, value.shape[1])
+                total_frames = images.shape[0]
+                if existing_frames > total_frames:
+                    raise ValueError(
+                        "Session cache contains more frames than available images. Please ensure the images folder "
+                        "matches the saved session state."
+                    )
+                if existing_frames == total_frames:
+                    print("No new frames detected; reusing cached predictions.")
+                else:
+                    for i in range(existing_frames, total_frames):
+                        image = images[i : i + 1]
+                        session.forward_stream(image)
+                predictions = session.get_all_predictions()
+                session.save_cache(kv_cache_path)
+                session_cache_path = kv_cache_path
             else:
                 # Use batch processing (original behavior)
                 predictions = model(images, mode=mode)
     # Clean up
     torch.cuda.empty_cache()
+    return predictions, session_cache_path
 # -------------------------------------------------------------------------
 # 2) Handle uploaded video/images --> produce target_dir + images
 # -------------------------------------------------------------------------
+def handle_uploads(input_video, input_images, input_zip=None, session_state=None, current_target_dir: str | None = None):
     """
     Create a new 'target_dir' + 'images' subfolder.
     - Copies uploaded images
     - Optionally extracts images from a ZIP
     - Optionally extracts frames from a video (1 fps)
+    - Optionally loads a previously saved session archive
+    Returns (target_dir, image_paths, session_loaded).
     """
     start_time = time.time()
     gc.collect()
     # Create a unique folder name
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+    session_loaded = False
+    if session_state:
+        session_path = session_state.get("name") if isinstance(session_state, dict) and "name" in session_state else getattr(session_state, "name", None)
+        session_path = session_path or session_state
+        extract_root = os.path.join("demo_cache", f"session_{timestamp}")
+        target_dir = extract_session_state(session_path, extract_root)
+        session_loaded = True
+    elif current_target_dir and os.path.isdir(current_target_dir):
+        target_dir = current_target_dir
+    else:
+        target_dir = os.path.join("demo_cache", f"input_images_{timestamp}")
+        if os.path.exists(target_dir):
+            shutil.rmtree(target_dir)
+        os.makedirs(target_dir, exist_ok=True)
+    target_dir_images = os.path.join(target_dir, "images")
     os.makedirs(target_dir_images, exist_ok=True)
     image_paths: list[str] = []
     if input_images:
         for file_data in input_images:
             file_path = file_data["name"] if isinstance(file_data, dict) and "name" in file_data else file_data
+            copied_path = _copy_with_unique_name(file_path, target_dir_images)
+            image_paths.append(copied_path)
     # --- Handle ZIP (extract images) ---
     if input_zip:
         fps = vs.get(cv2.CAP_PROP_FPS) or 30.0
         frame_interval = max(1, int(fps * 1))  # 1 frame/sec
         count = 0
+        video_frame_num = len(os.listdir(target_dir_images))
         while True:
             gotit, frame = vs.read()
             if not gotit:
     image_paths = sorted(set(image_paths))  # de-dupe + sort
+    # Ensure gallery reflects existing files in the images directory
+    existing_images = sorted(glob.glob(os.path.join(target_dir_images, "*")))
+    image_paths = existing_images
     end_time = time.time()
     print(f"Prepared {len(image_paths)} files in {target_dir_images}; took {end_time - start_time:.3f}s")
+    return target_dir, image_paths, session_loaded
 # -------------------------------------------------------------------------
 # 3) Update gallery on upload
 # -------------------------------------------------------------------------
def update_gallery_on_upload(input_video, input_images, input_zip, session_state, current_target_dir):
    """
    Process new uploads (video, images, zip, or a saved session) and build the preview.

    Returns a 5-tuple: (None, target directory, image paths, status message, None).
    When nothing at all was supplied, the target directory is passed through
    unchanged and every other slot is None.
    """
    nothing_supplied = not (input_video or input_images or input_zip or session_state)
    if nothing_supplied:
        return None, current_target_dir, None, None, None

    target_dir, image_paths, session_loaded = handle_uploads(
        input_video,
        input_images,
        input_zip,
        session_state=session_state,
        current_target_dir=current_target_dir,
    )

    # The status text depends on whether handle_uploads restored a saved session.
    message = (
        "Session state loaded. Add new frames and click 'Reconstruct' to continue."
        if session_loaded
        else "Upload complete. Click 'Reconstruct' to begin 3D processing."
    )
    return None, target_dir, image_paths, message, None
def update_gallery_without_session(input_video, input_images, input_zip, current_target_dir):
    """Run the upload handler with no session archive.

    Forwards the arguments to ``update_gallery_on_upload`` with
    ``session_state`` fixed to ``None`` and returns its result unchanged.
    """
    return update_gallery_on_upload(
        input_video,
        input_images,
        input_zip,
        session_state=None,
        current_target_dir=current_target_dir,
    )
     print("Running run_model...")
     with torch.no_grad():
+        predictions, session_cache_path = run_model(target_dir, model, mode=mode, streaming=streaming)
     # Save predictions
     prediction_save_path = os.path.join(target_dir, "predictions.npz")
     np.savez(prediction_save_path, **predictions)
+    session_state_file = None
+    if streaming:
+        if session_cache_path is None:
+            session_cache_path = os.path.join(target_dir, "kv_cache.pt")
+        if os.path.exists(session_cache_path):
+            session_state_file = package_session_state(target_dir)
     # Handle None frame_filter
     if frame_filter is None:
         frame_filter = "All"
     print(f"Total time: {end_time - start_time:.2f} seconds (including IO)")
     log_msg = f"Reconstruction Success ({len(all_files)} frames). Waiting for visualization."
+    return (
+        glbfile,
+        log_msg,
+        gr.Dropdown(choices=frame_filter_choices, value=frame_filter, interactive=True),
+        session_state_file,
+    )
 # -------------------------------------------------------------------------
 def update_visualization(
+    target_dir,
+    conf_thres,
+    frame_filter,
+    mask_black_bg,
+    mask_white_bg,
+    show_cam,
+    mask_sky,
+    prediction_mode,
+    mode_value,
+    is_example,
 ):
     """
     Reload saved predictions from npz, create (or reuse) the GLB for new parameters,
     loaded = np.load(predictions_path)
     predictions = {key: np.array(loaded[key]) for key in key_list}
+    sanitized_frame = frame_filter.replace('.', '_').replace(':', '').replace(' ', '_') if frame_filter else "All"
     glbfile = os.path.join(
         target_dir,
+        f"glbscene_{conf_thres}_{sanitized_frame}_maskb{mask_black_bg}_maskw{mask_white_bg}_cam{show_cam}_sky{mask_sky}_pred{prediction_mode.replace(' ', '_')}_mode{mode_value}.glb",
     )
     if not os.path.exists(glbfile):
             input_video = gr.Video(label="Upload Video", interactive=True)
             input_images = gr.File(file_count="multiple", label="Upload Images", interactive=True)
             input_zip = gr.File(file_types=[".zip"], label="Upload ZIP of Images", interactive=True)
+            session_state_input = gr.File(file_types=[".zip"], label="Load Session State", interactive=True)
             image_gallery = gr.Gallery(
                 label="Preview",
                     "Please upload a video or images, then click Reconstruct.", elem_classes=["custom-log"]
                 )
                 reconstruction_output = gr.Model3D(height=520, zoom_speed=0.5, pan_speed=0.5)
+                session_state_output = gr.File(label="Download Session State", interactive=False)
             with gr.Row():
                 submit_btn = gr.Button("Reconstruct", scale=1, variant="primary")
                 clear_btn = gr.ClearButton(
+                    [
+                        input_video,
+                        input_images,
+                        input_zip,
+                        session_state_input,
+                        reconstruction_output,
+                        log_output,
+                        target_dir_output,
+                        image_gallery,
+                        session_state_output,
+                    ],
                     scale=1,
                 )
         3) Return model3D + logs + new_dir + updated dropdown + gallery
         We do NOT return is_example. It's just an input.
         """
+        target_dir, image_paths, _ = handle_uploads(input_video, input_images)
         # Always use "All" for frame_filter in examples
         frame_filter = "All"
+        glbfile, log_msg, dropdown, session_file = gradio_demo(
+            target_dir,
+            conf_thres,
+            frame_filter,
+            mask_black_bg,
+            mask_white_bg,
+            show_cam,
+            mask_sky,
+            prediction_mode,
+            mode,
+            False,
         )
+        return glbfile, log_msg, target_dir, dropdown, image_paths, session_file
     gr.Markdown("Click any row to load an example.", elem_classes=["example-log"])
             is_example,
             mode,
         ],
+        outputs=[
+            reconstruction_output,
+            log_output,
+            target_dir_output,
+            frame_filter,
+            image_gallery,
+            session_state_output,
+        ],
         fn=example_pipeline,
         cache_examples=False,
         examples_per_page=50,
             mode,
             streaming,
         ],
+        outputs=[reconstruction_output, log_output, frame_filter, session_state_output],
     ).then(
         fn=lambda: "False", inputs=[], outputs=[is_example]  # set is_example to "False"
     )
             show_cam,
             mask_sky,
             prediction_mode,
+            mode,
             is_example,
         ],
         [reconstruction_output, log_output],
             show_cam,
             mask_sky,
             prediction_mode,
+            mode,
             is_example,
         ],
         [reconstruction_output, log_output],
             show_cam,
             mask_sky,
             prediction_mode,
+            mode,
             is_example,
         ],
         [reconstruction_output, log_output],
             show_cam,
             mask_sky,
             prediction_mode,
+            mode,
             is_example,
         ],
         [reconstruction_output, log_output],
             show_cam,
             mask_sky,
             prediction_mode,
+            mode,
             is_example,
         ],
         [reconstruction_output, log_output],
             show_cam,
             mask_sky,
             prediction_mode,
+            mode,
             is_example,
         ],
         [reconstruction_output, log_output],
             show_cam,
             mask_sky,
             prediction_mode,
+            mode,
             is_example,
         ],
         [reconstruction_output, log_output],
     # -------------------------------------------------------------------------
     # Auto-update gallery whenever user uploads or changes their files
     # -------------------------------------------------------------------------
+    upload_outputs = [reconstruction_output, target_dir_output, image_gallery, log_output, session_state_output]
+    no_session_inputs = [input_video, input_images, input_zip, target_dir_output]
+    input_video.change(fn=update_gallery_without_session, inputs=no_session_inputs, outputs=upload_outputs)
+    input_images.change(fn=update_gallery_without_session, inputs=no_session_inputs, outputs=upload_outputs)
+    input_zip.change(fn=update_gallery_without_session, inputs=no_session_inputs, outputs=upload_outputs)
+    session_state_input.change(
         fn=update_gallery_on_upload,
+        inputs=[input_video, input_images, input_zip, session_state_input, target_dir_output],
+        outputs=upload_outputs,
     )
     demo.queue(max_size=20).launch(show_error=True, share=False)

configs/stream_session.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "window_size": 25
+}

stream3r/stream_session.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 from typing import Any, Dict, Optional
@@ -9,12 +10,13 @@ class StreamSession:
     """
     A causal streaming inference session with KV cache management for STream3R.
     """
-    def __init__(self, model: STream3R, mode: str):
         self.model = model
         self.mode = mode
         self.aggregator_kv_cache_depth = model.aggregator.depth
         self.camera_head_kv_cache_depth = model.camera_head.trunk_depth
         self.camera_head_iterations = 4
         if self.mode not in ["causal", "window"]:
             raise ValueError(f"Unsupported attention mode when using kv_cache: {self.mode}")
@@ -41,13 +43,12 @@ class StreamSession:
             self.aggregator_kv_cache_list = aggregator_kv_cache_list
             self.camera_head_kv_cache_list = camera_head_kv_cache_list
         elif self.mode == "window":
-            window_size = 25
             for k in range(2):
                 for i in range(self.aggregator_kv_cache_depth):
                     h, w = self.predictions["depth"].shape[2], self.predictions["depth"].shape[3]
                     P = h * w // self.model.aggregator.patch_size // self.model.aggregator.patch_size + self.model.aggregator.patch_start_idx
                     anchor_token = aggregator_kv_cache_list[i][k][:, :, :P]
-                    window_tokens = aggregator_kv_cache_list[i][k][:, :, max(P, aggregator_kv_cache_list[i][k].size(2)-window_size*P):]
                     self.aggregator_kv_cache_list[i][k] = torch.cat(
                         [
                             anchor_token,
@@ -58,7 +59,7 @@ class StreamSession:
                 for i in range(self.camera_head_iterations):
                     for j in range(self.camera_head_kv_cache_depth):
                         anchor_token = camera_head_kv_cache_list[i][j][k][:, :, :1]
-                        window_tokens = camera_head_kv_cache_list[i][j][k][:, :, max(1, camera_head_kv_cache_list[i][j][k].size(2)-window_size):]
                         self.camera_head_kv_cache_list[i][j][k] = torch.cat(
                             [
                                 anchor_token,
@@ -112,6 +113,32 @@ class StreamSession:
         except StopIteration:
             return torch.device("cpu")
     def save_cache(self, file_path: str) -> None:
         aggregator_cache, camera_cache = self._get_cache()
@@ -121,6 +148,7 @@ class StreamSession:
                 "aggregator_depth": self.aggregator_kv_cache_depth,
                 "camera_head_depth": self.camera_head_kv_cache_depth,
                 "camera_head_iterations": self.camera_head_iterations,
                 "patch_size": getattr(self.model.aggregator, "patch_size", None),
                 "patch_start_idx": getattr(self.model.aggregator, "patch_start_idx", None),
             },
@@ -148,6 +176,7 @@ class StreamSession:
             "aggregator_depth": self.aggregator_kv_cache_depth,
             "camera_head_depth": self.camera_head_kv_cache_depth,
             "camera_head_iterations": self.camera_head_iterations,
         }
         for key, expected_value in expected_metadata.items():

+import json
 import os
 from typing import Any, Dict, Optional
     """
     A causal streaming inference session with KV cache management for STream3R.
     """
+    def __init__(self, model: STream3R, mode: str, *, window_size: Optional[int] = None, config_path: Optional[str] = None):
         self.model = model
         self.mode = mode
         self.aggregator_kv_cache_depth = model.aggregator.depth
         self.camera_head_kv_cache_depth = model.camera_head.trunk_depth
         self.camera_head_iterations = 4
+        self.window_size = self._resolve_window_size(window_size, config_path)
         if self.mode not in ["causal", "window"]:
             raise ValueError(f"Unsupported attention mode when using kv_cache: {self.mode}")
             self.aggregator_kv_cache_list = aggregator_kv_cache_list
             self.camera_head_kv_cache_list = camera_head_kv_cache_list
         elif self.mode == "window":
             for k in range(2):
                 for i in range(self.aggregator_kv_cache_depth):
                     h, w = self.predictions["depth"].shape[2], self.predictions["depth"].shape[3]
                     P = h * w // self.model.aggregator.patch_size // self.model.aggregator.patch_size + self.model.aggregator.patch_start_idx
                     anchor_token = aggregator_kv_cache_list[i][k][:, :, :P]
+                    window_tokens = aggregator_kv_cache_list[i][k][:, :, max(P, aggregator_kv_cache_list[i][k].size(2)-self.window_size*P):]
                     self.aggregator_kv_cache_list[i][k] = torch.cat(
                         [
                             anchor_token,
                 for i in range(self.camera_head_iterations):
                     for j in range(self.camera_head_kv_cache_depth):
                         anchor_token = camera_head_kv_cache_list[i][j][k][:, :, :1]
+                        window_tokens = camera_head_kv_cache_list[i][j][k][:, :, max(1, camera_head_kv_cache_list[i][j][k].size(2)-self.window_size):]
                         self.camera_head_kv_cache_list[i][j][k] = torch.cat(
                             [
                                 anchor_token,
         except StopIteration:
             return torch.device("cpu")
+    def _resolve_window_size(self, override: Optional[int], config_path: Optional[str]) -> int:
+        """Resolve the window size used when trimming the KV caches in "window" mode.
+
+        Resolution order:
+          1. ``override`` when it is not None (returned unchanged, not validated).
+          2. The ``"window_size"`` entry of the JSON config file, when it is a
+             positive integer.
+          3. The built-in default of 25.
+
+        Args:
+            override: Explicit window size supplied by the caller, or None.
+            config_path: Path of a JSON config file. When empty or None, the
+                file ``../configs/stream_session.json`` relative to this
+                module is used.
+
+        Returns:
+            The resolved window size. Any problem with the config file
+            (missing, unreadable, not UTF-8, invalid JSON, top level not an
+            object, missing or non-positive value) falls back to the default
+            instead of raising.
+        """
+        if override is not None:
+            return override
+        default_window_size = 25
+        config_path = config_path or os.path.abspath(
+            os.path.join(os.path.dirname(__file__), "..", "configs", "stream_session.json")
+        )
+        try:
+            with open(config_path, "r", encoding="utf-8") as handle:
+                data = json.load(handle)
+        except (OSError, ValueError):
+            # OSError covers a missing/unreadable file or a directory path;
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+            return default_window_size
+        # Valid JSON is not necessarily an object (could be a list, number, ...).
+        if not isinstance(data, dict):
+            return default_window_size
+        window_size = data.get("window_size")
+        # bool is a subclass of int, so reject JSON true/false explicitly.
+        if isinstance(window_size, int) and not isinstance(window_size, bool) and window_size > 0:
+            return window_size
+        return default_window_size
     def save_cache(self, file_path: str) -> None:
         aggregator_cache, camera_cache = self._get_cache()
                 "aggregator_depth": self.aggregator_kv_cache_depth,
                 "camera_head_depth": self.camera_head_kv_cache_depth,
                 "camera_head_iterations": self.camera_head_iterations,
+                "window_size": self.window_size,
                 "patch_size": getattr(self.model.aggregator, "patch_size", None),
                 "patch_start_idx": getattr(self.model.aggregator, "patch_start_idx", None),
             },
             "aggregator_depth": self.aggregator_kv_cache_depth,
             "camera_head_depth": self.camera_head_kv_cache_depth,
             "camera_head_iterations": self.camera_head_iterations,
+            "window_size": self.window_size,
         }
         for key, expected_value in expected_metadata.items():

tests/test_stream_session_cache.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import tempfile
 import unittest
@@ -101,6 +102,22 @@ else:
                 restored_tensor = restored_session.predictions[key]
                 self.assertTrue(torch.equal(original_tensor, restored_tensor))
 if __name__ == "__main__":  # pragma: no cover - manual execution
     unittest.main()

+import json
 import os
 import tempfile
 import unittest
                 restored_tensor = restored_session.predictions[key]
                 self.assertTrue(torch.equal(original_tensor, restored_tensor))
+        def test_window_size_from_config(self):
+            """A "window_size" entry in the JSON config file is picked up by the session."""
+            dummy = _DummyModel()
+            with tempfile.TemporaryDirectory() as scratch_dir:
+                cfg_file = os.path.join(scratch_dir, "stream_session.json")
+                with open(cfg_file, "w", encoding="utf-8") as fh:
+                    fh.write(json.dumps({"window_size": 7}))
+                # Build the session while the temporary config file still exists.
+                streamer = StreamSession(dummy, mode="window", config_path=cfg_file)
+            resolved = streamer.window_size
+            self.assertEqual(resolved, 7)
+        def test_window_size_override(self):
+            """An explicit ``window_size`` argument is stored on the session as given."""
+            streamer = StreamSession(_DummyModel(), mode="window", window_size=11)
+            resolved = streamer.window_size
+            self.assertEqual(resolved, 11)
 if __name__ == "__main__":  # pragma: no cover - manual execution
     unittest.main()