Improve Forward Warp

app.py (CHANGED)

@@ -1,8 +1,10 @@
 import gradio as gr
 import torch
+import torch.nn as nn
 import numpy as np
 import cv2
 from PIL import Image
+from torch.autograd import Function
 from transformers import AutoModelForDepthEstimation, AutoImageProcessor
 from huggingface_hub import hf_hub_download
 import os
@@ -11,11 +13,172 @@ import os
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Running on device: {device}")

+# ==============================================================================
+# 1. FORWARD WARP IMPLEMENTATION (From forward_warp_pytorch.py)
+# ==============================================================================
+class ForwardWarpFunction(Function):
+    @staticmethod
+    def forward(ctx, im0, flow, interpolation_mode_int):
+        # Input validation
+        assert(len(im0.shape) == len(flow.shape) == 4)
+        assert(interpolation_mode_int == 0 or interpolation_mode_int == 1)
+        assert(im0.shape[0] == flow.shape[0])
+        assert(im0.shape[-2:] == flow.shape[1:3])
+        assert(flow.shape[3] == 2)
+
+        B, C, H, W = im0.shape
+        im1 = torch.zeros_like(im0, device=im0.device, dtype=im0.dtype)
+
+        # Grid creation
+        grid_x, grid_y = torch.meshgrid(
+            torch.arange(W, device=im0.device, dtype=im0.dtype),
+            torch.arange(H, device=im0.device, dtype=im0.dtype),
+            indexing='xy'
+        )
+        grid_x = grid_x.unsqueeze(0).expand(B, -1, -1)
+        grid_y = grid_y.unsqueeze(0).expand(B, -1, -1)
+
+        # Destination coordinates
+        x_dest = grid_x + flow[:, :, :, 0]
+        y_dest = grid_y + flow[:, :, :, 1]
+
+        if interpolation_mode_int == 0:  # Bilinear Splatting
+            x_f = torch.floor(x_dest).long()
+            y_f = torch.floor(y_dest).long()
+            x_c = x_f + 1
+            y_c = y_f + 1
+
+            # Weights
+            nw_k = (x_c.float() - x_dest) * (y_c.float() - y_dest)
+            ne_k = (x_dest - x_f.float()) * (y_c.float() - y_dest)
+            sw_k = (x_c.float() - x_dest) * (y_dest - y_f.float())
+            se_k = (x_dest - x_f.float()) * (y_dest - y_f.float())
+
+            # Clamp coords
+            x_f_clamped = torch.clamp(x_f, 0, W - 1)
+            y_f_clamped = torch.clamp(y_f, 0, H - 1)
+            x_c_clamped = torch.clamp(x_c, 0, W - 1)
+            y_c_clamped = torch.clamp(y_c, 0, H - 1)
+
+            # Valid mask (source pixels that land inside canvas)
+            valid_mask = (x_f >= 0) & (x_c < W) & (y_f >= 0) & (y_c < H)
+
+            # Reshape for broadcasting
+            nw_k = nw_k.unsqueeze(1)
+            ne_k = ne_k.unsqueeze(1)
+            sw_k = sw_k.unsqueeze(1)
+            se_k = se_k.unsqueeze(1)
+            valid_mask = valid_mask.unsqueeze(1)
+
+            # Flatten indices for scatter_add
+            b_indices = torch.arange(B, device=im0.device).view(B, 1, 1, 1).expand(-1, C, H, W)
+            c_indices = torch.arange(C, device=im0.device).view(1, C, 1, 1).expand(B, -1, H, W)
+            base_idx = b_indices * (C * H * W) + c_indices * (H * W)
+
+            # Scatter to 4 neighbors (Accumulate/Splat)
+            def scatter_corner(y_idx, x_idx, weights):
+                flat_idx = base_idx + y_idx.unsqueeze(1) * W + x_idx.unsqueeze(1)
+                values = (im0 * weights) * valid_mask.float()
+                im1.view(-1).scatter_add_(0, flat_idx.view(-1), values.view(-1))
+
+            scatter_corner(y_f_clamped, x_f_clamped, nw_k)  # NW
+            scatter_corner(y_f_clamped, x_c_clamped, ne_k)  # NE
+            scatter_corner(y_c_clamped, x_f_clamped, sw_k)  # SW
+            scatter_corner(y_c_clamped, x_c_clamped, se_k)  # SE
+
+        else:  # Nearest Neighbor (Legacy fallback)
+            x_nearest = torch.round(x_dest).long()
+            y_nearest = torch.round(y_dest).long()
+            valid_mask = (x_nearest >= 0) & (x_nearest < W) & (y_nearest >= 0) & (y_nearest < H)
+            valid_mask = valid_mask.unsqueeze(1)
+
+            x_clamped = torch.clamp(x_nearest, 0, W - 1)
+            y_clamped = torch.clamp(y_nearest, 0, H - 1)
+
+            b_indices = torch.arange(B, device=im0.device).view(B, 1, 1, 1).expand(-1, C, H, W)
+            c_indices = torch.arange(C, device=im0.device).view(1, C, 1, 1).expand(B, -1, H, W)
+            dest_idx = b_indices*(C*H*W) + c_indices*(H*W) + y_clamped.unsqueeze(1)*W + x_clamped.unsqueeze(1)
+
+            source_values = im0 * valid_mask.float()
+            im1.view(-1).scatter_(0, dest_idx.view(-1), source_values.view(-1))
+
+        return im1
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Backward is not needed for inference, so it is left unimplemented
+        return None, None, None
+
+class forward_warp(nn.Module):
+    def __init__(self, interpolation_mode="Bilinear"):
+        super(forward_warp, self).__init__()
+        self.interpolation_mode_int = 0 if interpolation_mode == "Bilinear" else 1
+
+    def forward(self, im0, flow):
+        return ForwardWarpFunction.apply(im0, flow, self.interpolation_mode_int)
+
+# ==============================================================================
+# 2. STEREO WARPER (From splatting_gui.py)
+# ==============================================================================
+class ForwardWarpStereo(nn.Module):
+    """
+    Weighted Splatting wrapper.
+    Handles occlusions using exponential depth weights (Soft Z-Buffering).
+    """
+    def __init__(self, eps=1e-6):
+        super(ForwardWarpStereo, self).__init__()
+        self.eps = eps
+        self.fw = forward_warp(interpolation_mode="Bilinear")
+
+    def forward(self, im, disp, convergence, divergence):
+        # Create flow from disparity.
+        # Shift = (Depth - Convergence) * Divergence
+        # For the right-eye view, Target = Source - Shift, so Flow = -Shift.
+        shift = (disp - convergence) * divergence
+        flow_x = -shift
+
+        # Assemble flow (x, y=0): concatenate on the channel dim, then
+        # permute (B, 2, H, W) -> (B, H, W, 2) as ForwardWarpFunction expects.
+        flow_y = torch.zeros_like(flow_x)
+        flow = torch.cat((flow_x, flow_y), dim=1).permute(0, 2, 3, 1)  # (B, H, W, 2)
+
+        # 1. Calculate Weights (Soft Z-Buffer)
+        # Closer objects (higher disparity) get exponentially higher weight,
+        # so the foreground dominates the background during accumulation.
+        # An exponential base around 1.414-1.5 is a common heuristic.
+        weights_map = disp - disp.min()
+        weights_map = (1.5) ** weights_map  # Tuned base for separation
+
+        # 2. Warp Image * Weights (Accumulate Weighted Color)
+        # im is (B, C, H, W), weights_map is (B, 1, H, W)
+        res_accum = self.fw(im * weights_map, flow)
+
+        # 3. Warp Weights (Accumulate Weights)
+        mask_accum = self.fw(weights_map, flow)
+
+        # 4. Normalize (Color / TotalWeight)
+        # Clamp by epsilon to avoid divide-by-zero in empty regions
+        mask_accum.clamp_(min=self.eps)
+        res = res_accum / mask_accum
+
+        # 5. Generate Binary Occlusion Mask (for Inpainting)
+        # Splat a grid of ones; where the sum stays ~0, there is a hole.
+        ones = torch.ones_like(disp)
+        occupancy = self.fw(ones, flow)
+
+        # Valid pixels have occupancy > 0.
+        # We want holes = 1.0, filled = 0.0
+        occlusion_mask = (occupancy < self.eps).float()
+
+        return res, occlusion_mask
+
+# ==============================================================================
+# 3. APP LOGIC
+# ==============================================================================
+
 # === LOAD MODELS ===
 def load_models():
     print("Loading Depth Anything V2 Large...")
-    # 1. Depth Model (Depth Anything V2 Large)
-    # We use AutoModel to automatically load the correct architecture
     depth_model = AutoModelForDepthEstimation.from_pretrained(
         "depth-anything/Depth-Anything-V2-Large-hf"
     ).to(device)
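
Review note on the block above: ForwardWarpFunction scatter-adds every source pixel into the four integer neighbors of its destination with bilinear weights, so in-bounds splats conserve mass. A minimal smoke test, assuming app.py is importable (the `from app import forward_warp` line is an assumption, not part of the commit):

import torch
from app import forward_warp  # assumption: running next to app.py

fw = forward_warp(interpolation_mode="Bilinear")
im = torch.zeros(1, 1, 4, 4)
im[0, 0, 1, 1] = 1.0                       # single bright source pixel
flow = torch.zeros(1, 4, 4, 2)
flow[..., 0] = 1.0                         # shift every pixel +1 in x
out = fw(im, flow)

assert out[0, 0, 1, 2] == 1.0              # lands exactly on (y=1, x=2)
assert torch.isclose(out.sum(), im.sum())  # bilinear weights sum to 1
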
@@ -24,173 +187,61 @@ def load_models():
     )

     print("Loading LaMa Inpainting Model...")
-    # 2. LaMa Inpainting Model (TorchScript)
     try:
         model_path = hf_hub_download(repo_id="fashn-ai/LaMa", filename="big-lama.pt")
-        print(f"Loading LaMa from: {model_path}")
         lama_model = torch.jit.load(model_path, map_location=device)
         lama_model.eval()
     except Exception as e:
         print(f"Error loading LaMa model: {e}")
         raise e

-    return depth_model, depth_processor, lama_model
+    # Initialize the new Stereo Warper
+    stereo_warper = ForwardWarpStereo().to(device)
+
+    return depth_model, depth_processor, lama_model, stereo_warper

 # Load models once at startup
-depth_model, depth_processor, lama_model = load_models()
+depth_model, depth_processor, lama_model, stereo_warper = load_models()

 # === DEPTH ESTIMATION ===
 @torch.no_grad()
 def estimate_depth(image_pil, model, processor):
     original_size = image_pil.size
-
-    # Preprocess image
     inputs = processor(images=image_pil, return_tensors="pt").to(device)
-
-    # Inference
     depth = model(**inputs).predicted_depth

-    # Interpolate depth back to ORIGINAL image size
     depth = torch.nn.functional.interpolate(
         depth.unsqueeze(1),
         size=(original_size[1], original_size[0]),
         mode="bicubic",
         align_corners=False,
     ).squeeze()

-    # Normalize depth to 0-1 range
     depth_min, depth_max = depth.min(), depth.max()
     if depth_max - depth_min > 0:
         depth = (depth - depth_min) / (depth_max - depth_min)
     else:
         depth = torch.zeros_like(depth)
-
     return depth

 # === DEPTH MANIPULATION ===
 def erode_depth(depth_tensor, kernel_size):
-    """
-    Shrinks the foreground (bright areas) of the depth map to reduce halos.
-    Uses -MaxPool2d(-x) to simulate Erosion on GPU.
-    """
-    if kernel_size <= 0:
-        return depth_tensor
-
-    # Ensure odd kernel size for symmetry
+    if kernel_size <= 0: return depth_tensor
     k = kernel_size if kernel_size % 2 == 1 else kernel_size + 1
-
-    # Reshape for pooling: (H, W) -> (1, 1, H, W)
     x = depth_tensor.unsqueeze(0).unsqueeze(0)
-
-    # Erosion = -MaxPool(-x)
-    # Padding = k // 2 ensures output size matches input size
     padding = k // 2
     x_eroded = -torch.nn.functional.max_pool2d(-x, kernel_size=k, stride=1, padding=padding)
-
     return x_eroded.squeeze()

-# === PYTORCH FORWARD WARP ===
-@torch.no_grad()
-def generate_right_and_mask_torch(image_pil, depth_tensor, divergence, convergence):
-    """
-    High-performance PyTorch Forward Warp implementation.
-    Mimics the behavior of custom CUDA forward warp kernels but uses standard PyTorch.
-
-    Args:
-        image_pil: Input PIL image
-        depth_tensor: Normalized depth tensor (H, W) on GPU
-        divergence: float (pixels)
-        convergence: float (0-1)
-    """
-    # 1. Prepare Data
-    w, h = image_pil.size
-
-    # Convert image to tensor (H, W, 3) -> (N, 3)
-    # We do this on GPU to stay fast
-    image_tensor = torch.from_numpy(np.array(image_pil)).to(device).float()
-
-    # Calculate Shift Map (N,)
-    # Shift = (Depth - Convergence) * Divergence
-    # Positive shift = Leftwards (Pop-out)
-    shift = (depth_tensor - convergence) * divergence
-
-    # 2. Create Grid Coordinates
-    y_coords, x_coords = torch.meshgrid(
-        torch.arange(h, device=device),
-        torch.arange(w, device=device),
-        indexing='ij'
-    )
-
-    # 3. Calculate Target Coordinates
-    # Target X = Source X - Shift
-    target_x = x_coords - shift.round()  # Round to nearest pixel for sharp mapping
-
-    # 4. Flatten for advanced indexing
-    flat_y = y_coords.reshape(-1).long()
-    flat_x_target = target_x.reshape(-1).long()
-    flat_x_source = x_coords.reshape(-1).long()
-
-    # 5. Filter Invalid Points (Out of bounds)
-    valid_mask = (flat_x_target >= 0) & (flat_x_target < w)
-
-    flat_y = flat_y[valid_mask]
-    flat_x_target = flat_x_target[valid_mask]
-    flat_x_source = flat_x_source[valid_mask]
-    flat_shift = shift.reshape(-1)[valid_mask]
-
-    # 6. Z-BUFFERING / PAINTER'S ALGORITHM (Crucial for correct occlusion)
-    # We sort pixels by shift (depth).
-    # Less shift = Background (draw first)
-    # More shift = Foreground (draw last)
-    # This ensures foreground objects overwrite background objects at collision points.
-    sort_idx = torch.argsort(flat_shift)
-
-    flat_y = flat_y[sort_idx]
-    flat_x_target = flat_x_target[sort_idx]
-    flat_x_source = flat_x_source[sort_idx]
-
-    # 7. Write to Output
-    # Create output canvas (Black)
-    right_tensor = torch.zeros_like(image_tensor)
-
-    # Create mask (1.0 = hole, 0.0 = filled)
-    mask_tensor = torch.ones((h, w), device=device, dtype=torch.float32)
-
-    # Compute linear indices for target positions
-    # target_idx = y * w + x
-    target_indices = flat_y * w + flat_x_target
-    source_indices = flat_y * w + flat_x_source
-
-    # Flatten image for indexing
-    image_flat = image_tensor.reshape(-1, 3)
-    right_flat = right_tensor.reshape(-1, 3)
-    mask_flat = mask_tensor.reshape(-1)
-
-    # Perform the Warp
-    # Since we sorted by depth, the last write to any index wins (Foreground wins)
-    right_flat[target_indices] = image_flat[source_indices]
-    mask_flat[target_indices] = 0.0
-
-    # Reshape back
-    right_img = right_flat.reshape(h, w, 3).cpu().numpy().astype(np.uint8)
-    mask_img = mask_flat.reshape(h, w).cpu().numpy()
-
-    return right_img, mask_img
-
 # === LOCAL INPAINTING ===
 @torch.no_grad()
 def run_local_lama(image_bgr, mask_float):
-    """
-    image_bgr: HxWx3 uint8 numpy array
-    mask_float: HxW float32 numpy array (1.0 = hole, 0.0 = valid)
-    """
-    # 0. Dilate Mask (Fixes smearing/streaking)
-    kernel = np.ones((5, 5), np.uint8)
+    # 0. Dilate Mask slightly to catch edge artifacts from splatting
+    kernel = np.ones((3, 3), np.uint8)
     mask_uint8 = (mask_float * 255).astype(np.uint8)
     mask_dilated = cv2.dilate(mask_uint8, kernel, iterations=1)

     # 1. Resize to be divisible by 8
     h, w = image_bgr.shape[:2]
     new_h = (h // 8) * 8
     new_w = (w // 8) * 8
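
For reviewers comparing old and new: the deleted generate_right_and_mask_torch rounded shifts to whole pixels and resolved collisions by sort order (hard last-write-wins), whereas ForwardWarpStereo blends colliding splats with exponential depth weights. A tiny numeric sketch of that soft z-buffer, with made-up values:

# Two pixels splat onto the same target; weight = 1.5 ** normalized_disparity.
fg_color, fg_disp = 0.9, 1.0   # near
bg_color, bg_disp = 0.1, 0.0   # far

w_fg, w_bg = 1.5 ** fg_disp, 1.5 ** bg_disp
blended = (fg_color * w_fg + bg_color * w_bg) / (w_fg + w_bg)
print(blended)  # ~0.58: foreground dominates, but it is a blend, not a hard overwrite

With depth normalized to [0, 1], the weight ratio between nearest and farthest is at most 1.5, so occlusion boundaries blend rather than hard-overwrite; a larger base would sharpen the separation.
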
@@ -198,7 +249,7 @@ def run_local_lama(image_bgr, mask_float):
     img_resized = cv2.resize(image_bgr, (new_w, new_h))
     mask_resized = cv2.resize(mask_dilated, (new_w, new_h), interpolation=cv2.INTER_NEAREST)

     # 2. Convert to Torch
     img_t = torch.from_numpy(img_resized).float().permute(2, 0, 1).unsqueeze(0) / 255.0
     img_t = img_t[:, [2, 1, 0], :, :]  # BGR to RGB

@@ -209,17 +260,14 @@ def run_local_lama(image_bgr, mask_float):
     mask_t = mask_t.to(device)

     # 3. Inference
     img_t = img_t * (1 - mask_t)
     inpainted_t = lama_model(img_t, mask_t)

     # 4. Post-process
     inpainted = inpainted_t[0].permute(1, 2, 0).cpu().numpy()
     inpainted = np.clip(inpainted * 255, 0, 255).astype(np.uint8)
-
-    # Swap back RGB to BGR
     inpainted = cv2.cvtColor(inpainted, cv2.COLOR_RGB2BGR)

-    # Resize back to original
     if new_h != h or new_w != w:
         inpainted = cv2.resize(inpainted, (w, h))

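
Mask convention reminder for run_local_lama, per the hunk above and the deleted docstring: mask values of 1.0 mark holes, 0.0 marks valid pixels, and the image is zeroed under the mask before the TorchScript call. A hedged call sketch with dummy inputs (the call itself is commented out because it needs the loaded lama_model):

import numpy as np

image_bgr = np.zeros((256, 256, 3), dtype=np.uint8)  # dummy BGR frame
mask = np.zeros((256, 256), dtype=np.float32)
mask[100:140, 100:140] = 1.0                         # region to inpaint

# filled_bgr = run_local_lama(image_bgr, mask)       # returns HxWx3 uint8 BGR
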
@@ -237,41 +285,60 @@ def make_anaglyph(left, right):
 # === PIPELINE ===
 def stereo_pipeline(image_pil, divergence, convergence, edge_erosion):
     if image_pil is None:
-        return None, None
+        return None, None, None, None

     # Resize input if too large
     w, h = image_pil.size
     if w > 1920:
         ratio = 1920 / w
         new_h = int(h * ratio)
-        print(f"Resizing input from {w}x{h} to 1920x{new_h}")
         image_pil = image_pil.resize((1920, new_h), Image.LANCZOS)

-    # 1. Depth
-    # Now returns a Tensor on GPU
+    # 1. Depth Estimation
     depth_tensor = estimate_depth(image_pil, depth_model, depth_processor)

-    # 2. Depth
-    # This shrinks the foreground depth mask slightly to prevent "halo" pixels
-    # from being pulled along with the object.
+    # 2. Depth Erosion (optional halo reduction)
     if edge_erosion > 0:
         depth_tensor = erode_depth(depth_tensor, int(edge_erosion))

-    # 3. Forward Warp
-    right_img_rgb, mask_img = generate_right_and_mask_torch(image_pil, depth_tensor, divergence, convergence)
+    # Visualize Depth
+    depth_vis = (depth_tensor.cpu().numpy() * 255).astype(np.uint8)
+    depth_image = Image.fromarray(depth_vis)
+
+    # 3. Forward Warp (Weighted Bilinear Splatting)
+    # Convert image to tensor (B, C, H, W)
+    image_tensor = torch.from_numpy(np.array(image_pil)).float().to(device).permute(2, 0, 1).unsqueeze(0) / 255.0
+
+    # Prepare depth tensor (B, 1, H, W)
+    depth_input = depth_tensor.unsqueeze(0).unsqueeze(0)
+
+    # Run the new Stereo Warper.
+    # Divergence is applied as raw pixels of separation (no width scaling).
+    with torch.no_grad():
+        right_img_tensor, mask_tensor = stereo_warper(
+            image_tensor,
+            depth_input,
+            float(convergence),
+            float(divergence)
+        )
+
+    # Convert results back to CPU/NumPy
+    right_img_rgb = (right_img_tensor.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
+    mask_vis = (mask_tensor.squeeze(0).squeeze(0).cpu().numpy() * 255).astype(np.uint8)
+
+    mask_image = Image.fromarray(mask_vis)
+
+    # 4. Inpainting
     right_img_bgr = cv2.cvtColor(right_img_rgb, cv2.COLOR_RGB2BGR)
+    mask_float = mask_tensor.squeeze().cpu().numpy()

-    right_filled_bgr = run_local_lama(right_img_bgr, mask_img)
+    right_filled_bgr = run_local_lama(right_img_bgr, mask_float)

-    # 5. Final Processing
+    # 5. Finalize
     left = image_pil
     right = Image.fromarray(cv2.cvtColor(right_filled_bgr, cv2.COLOR_BGR2RGB))
-
-    # 6. Composition
+
     width, height = left.size
     combined_image = Image.new('RGB', (width * 2, height))
     combined_image.paste(left, (0, 0))
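
Quick check of the slider semantics that stereo_pipeline now passes through to stereo_warper: the horizontal shift is (depth - convergence) * divergence, in raw pixels. A back-of-envelope sketch with arbitrary slider values:

divergence, convergence = 30.0, 0.5
for depth in (0.0, 0.5, 1.0):
    shift = (depth - convergence) * divergence
    print(f"depth={depth:.1f} -> shift={shift:+.1f} px")
# depth=0.0 -> -15.0 px, depth=0.5 -> +0.0 px, depth=1.0 -> +15.0 px
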
@@ -279,11 +346,9 @@ def stereo_pipeline(image_pil, divergence, convergence, edge_erosion):

     anaglyph_image = make_anaglyph(left, right)

-    return combined_image, anaglyph_image
+    return combined_image, anaglyph_image, depth_image, mask_image

 # === GRADIO UI ===
-
-# Custom CSS to limit width on large screens
 css = """
 .gradio-container {
     max-width: 1400px !important;
@@ -291,15 +356,13 @@ css = """
 }
 """

-with gr.Blocks(title="2D to 3D Stereo") as demo:
-    # WORKAROUND: Inject CSS via HTML to avoid "unexpected keyword argument" error
+with gr.Blocks(title="2D to 3D Stereo", css=css) as demo:
     gr.HTML(f"<style>{css}</style>")

-    gr.Markdown("## 2D to 3D Stereo Generator (
-    gr.Markdown("
+    gr.Markdown("## 2D to 3D Stereo Generator (High-Quality Splatting)")
+    gr.Markdown("Uses **Depth Anything V2**, **Bilinear Weighted Splatting** (Soft Z-Buffer), and **LaMa Inpainting**.")

     with gr.Row():
-        # --- LEFT COLUMN: INPUT & CONTROLS ---
         with gr.Column(scale=1):
             input_img = gr.Image(type="pil", label="Input Image", height=320)

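Small follow-up on the hunk above: now that the stylesheet is passed directly via css=css, the gr.HTML(f"<style>{css}</style>") injection kept from the old workaround is redundant; both apply the same rules, so it could be dropped in a later commit.
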
@@ -308,30 +371,33 @@ with gr.Blocks(title="2D to 3D Stereo") as demo:
             divergence_slider = gr.Slider(
                 minimum=0, maximum=100, value=30, step=1,
                 label="3D Strength (Divergence)",
-                info="Max
+                info="Max separation in pixels."
             )
             convergence_slider = gr.Slider(
-                minimum=0.0, maximum=1.0, value=0.
+                minimum=0.0, maximum=1.0, value=0.5, step=0.05,
                 label="Focus Plane (Convergence)",
                 info="0.0 = Background at screen. 1.0 = Foreground at screen."
             )
             erosion_slider = gr.Slider(
-                minimum=0, maximum=20, value=
+                minimum=0, maximum=20, value=2, step=1,
                 label="Edge Masking (Erosion)",
-                info="
+                info="Cleanup edges. Set to 0 for raw splatting."
             )

             btn = gr.Button("Generate 3D", variant="primary")

-        # --- RIGHT COLUMN: OUTPUTS ---
         with gr.Column(scale=1):
             out_anaglyph = gr.Image(label="Anaglyph (Red/Cyan)", height=320)
             out_stereo = gr.Image(label="Side-by-Side Stereo Pair", height=320)
+
+            with gr.Row():
+                out_depth = gr.Image(label="Depth Map", height=200)
+                out_mask = gr.Image(label="Inpainting Mask (Holes)", height=200)

     btn.click(
         fn=stereo_pipeline,
         inputs=[input_img, divergence_slider, convergence_slider, erosion_slider],
-        outputs=[out_stereo, out_anaglyph]
+        outputs=[out_stereo, out_anaglyph, out_depth, out_mask]
     )

 if __name__ == "__main__":