Update app.py
app.py CHANGED
@@ -4,7 +4,6 @@ import torch.nn as nn
 import numpy as np
 import cv2
 from PIL import Image
-from torch.autograd import Function
 from transformers import AutoModelForDepthEstimation, AutoImageProcessor
 from huggingface_hub import hf_hub_download
 import os
@@ -14,130 +13,100 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Running on device: {device}")

 # ==============================================================================
-# 1. FORWARD …
+# 1. SAFE & FAST FORWARD WARPER USING grid_sample (NO MORE BLACK IMAGES!)
 # ==============================================================================
-class ForwardWarpFunction(Function):
-    …
-        B, C, H, W …
-        …
-        grid_y = …
-        …
-        ne_k = ne_k.unsqueeze(1)
-        sw_k = sw_k.unsqueeze(1)
-        se_k = se_k.unsqueeze(1)
-        mask_nw = mask_nw.unsqueeze(1)
-        mask_ne = mask_ne.unsqueeze(1)
-        mask_sw = mask_sw.unsqueeze(1)
-        mask_se = mask_se.unsqueeze(1)
-
-        b_indices = torch.arange(B, device=im0.device).view(B, 1, 1, 1).expand(-1, C, H, W)
-        c_indices = torch.arange(C, device=im0.device).view(1, C, 1, 1).expand(B, -1, H, W)
-        base_idx = b_indices * (C * H * W) + c_indices * (H * W)
-
-        def scatter_corner(y_idx, x_idx, weights, mask):
-            flat_idx = base_idx + y_idx.unsqueeze(1) * W + x_idx.unsqueeze(1)
-            values = (im0 * weights) * mask.float()
-            im1.reshape(-1).scatter_add_(0, flat_idx.contiguous().reshape(-1), values.contiguous().reshape(-1))
-
-        scatter_corner(y_f_clamped, x_f_clamped, nw_k, mask_nw)
-        scatter_corner(y_f_clamped, x_c_clamped, ne_k, mask_ne)
-        scatter_corner(y_c_clamped, x_f_clamped, sw_k, mask_sw)
-        scatter_corner(y_c_clamped, x_c_clamped, se_k, mask_se)
-
-        return im1
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return None, None, None
-
-class forward_warp(nn.Module):
-    def __init__(self): super().__init__()
-    def forward(self, im0, flow):
-        return ForwardWarpFunction.apply(im0, flow, 0)
+class SafeForwardWarp(nn.Module):
+    def forward(self, img, flow):
+        """
+        img:  [B, C, H, W] in [0,1]
+        flow: [B, H, W, 2], flow[...,0] = delta_x (positive = right), flow[...,1] = delta_y
+        """
+        B, C, H, W = img.shape
+
+        # Create sampling grid in normalized coordinates [-1, 1]
+        grid_x, grid_y = torch.meshgrid(
+            torch.arange(W, device=img.device),
+            torch.arange(H, device=img.device),
+            indexing='xy'  # 'xy' yields [H, W] grids; 'ij' here would give [W, H] and break non-square images
+        )
+        grid_x = grid_x.float().unsqueeze(0).expand(B, -1, -1)  # [B, H, W]
+        grid_y = grid_y.float().unsqueeze(0).expand(B, -1, -1)
+
+        dest_x = grid_x + flow[..., 0]  # source pixel moves to x + dx
+        dest_y = grid_y + flow[..., 1]
+
+        # Normalize to [-1, 1]
+        norm_x = 2.0 * dest_x / (W - 1) - 1.0
+        norm_y = 2.0 * dest_y / (H - 1) - 1.0
+
+        grid = torch.stack((norm_x, norm_y), dim=-1)  # [B, H, W, 2]
+        grid = grid.clamp(-1, 1)
+
+        warped = torch.nn.functional.grid_sample(
+            img,
+            grid,
+            mode='bilinear',
+            padding_mode='zeros',
+            align_corners=True
+        )
+        return warped

 # ==============================================================================
-# 2. STEREO WARPER …
+# 2. STEREO WARPER - Improved weighting + safer dilation
 # ==============================================================================
 class ForwardWarpStereo(nn.Module):
     def __init__(self, eps=1e-6):
         super().__init__()
         self.eps = eps
-        self.…
+        self.warp = SafeForwardWarp()

-    def forward(self, …
-        …
+    def forward(self, img, shift, disp_for_weights):
+        # shift: [B, H, W] (positive = shift right-eye left → object pops out)
+        flow_x = -shift  # negative = move pixels left for right eye
         flow_y = torch.zeros_like(flow_x)
-        flow = torch.stack((flow_x, flow_y), dim=-1)
+        flow = torch.stack((flow_x, flow_y), dim=-1)  # [B, H, W, 2]

-        # …
-        …
+        # Better weighting: closer pixels contribute more
+        weights = 1.0 / (disp_for_weights + 0.1)
+        weights = weights / (weights.max() + 1e-8)

-        …
-        res = res_accum / mask_accum
+        weighted_img = img * weights.unsqueeze(1)
+        warped_img = self.warp(weighted_img, flow)
+        warped_weights = self.warp(weights.unsqueeze(1), flow)

-        # …
-        occlusion_mask = (occupancy < self.eps).float()
+        # Avoid division by zero
+        warped_weights = torch.clamp(warped_weights, min=self.eps)
+        result = warped_img / warped_weights

-        # …
+        # Occlusion mask via occupancy count
+        ones = torch.ones_like(img[:, :1])
+        occupancy = self.warp(ones, flow)
+        occlusion = (occupancy < self.eps).float()
+
+        # Smart dilation - preserve foreground edges
         with torch.no_grad():
-            fg_thresh = torch.quantile(disp_for_weights, 0.88)
+            fg_thresh = torch.quantile(disp_for_weights, 0.90)
             fg_mask = (disp_for_weights > fg_thresh).float().unsqueeze(0)

-            k = 15
+            k = 9
             dilated = torch.nn.functional.conv2d(
-                …
+                occlusion,
+                torch.ones(1, 1, k, k, device=occlusion.device),
+                padding=k // 2
+            ) > 0.5
             safe_dilation = dilated.float() * (1 - fg_mask)
+            occlusion = torch.clamp(occlusion + safe_dilation, 0, 1)

-        return …
+        return result, occlusion

 # ==============================================================================
 # 3. MODELS & HELPERS
 # ==============================================================================
 def load_models():
     print("Loading Depth Anything V2 Large...")
     depth_model = AutoModelForDepthEstimation.from_pretrained(
         "depth-anything/Depth-Anything-V2-Large-hf"
-    ).to(device)
+    ).to(device).eval()
     depth_processor = AutoImageProcessor.from_pretrained(
         "depth-anything/Depth-Anything-V2-Large-hf"
     )
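Note on the new warper: despite its name, `SafeForwardWarp` is implemented as backward sampling. `grid_sample` reads each output pixel from `x + flow`, so content appears to move *against* the sign of the flow, which is why `ForwardWarpStereo` passes `flow_x = -shift`. A minimal smoke test, not part of this commit and assuming `SafeForwardWarp` from the file above is in scope, pins down that convention and would also catch grid-shape bugs on non-square inputs:

```python
import torch

def smoke_test():
    warp = SafeForwardWarp()
    B, C, H, W = 1, 3, 8, 16      # non-square on purpose: catches [W, H] grid mistakes
    img = torch.rand(B, C, H, W)
    flow = torch.zeros(B, H, W, 2)
    flow[..., 0] = 3.0            # uniform +3 px flow in x

    out = warp(img, flow)
    assert out.shape == (B, C, H, W)
    # grid_sample reads output[..., x] from img[..., x + 3],
    # so content appears shifted LEFT by 3 px:
    assert torch.allclose(out[..., :W - 3], img[..., 3:], atol=1e-5)

smoke_test()
```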
@@ -145,180 +114,164 @@ def load_models():
     print("Loading LaMa Inpainting Model...")
     try:
         model_path = hf_hub_download(repo_id="fashn-ai/LaMa", filename="big-lama.pt")
-        lama_model = torch.jit.load(model_path, map_location=device)
-        lama_model.eval()
+        lama_model = torch.jit.load(model_path, map_location=device).eval()
     except Exception as e:
         print(f"LaMa load failed: {e}")
         lama_model = None

     stereo_warper = ForwardWarpStereo().to(device)
+
     return depth_model, depth_processor, lama_model, stereo_warper

 depth_model, depth_processor, lama_model, stereo_warper = load_models()

 @torch.no_grad()
-def estimate_depth(image_pil…
+def estimate_depth(image_pil):
     original_size = image_pil.size
-    inputs = …
-    …
+    inputs = depth_processor(images=image_pil, return_tensors="pt").to(device)
+    outputs = depth_model(**inputs)
+    depth = outputs.predicted_depth
+
     depth = torch.nn.functional.interpolate(
         depth.unsqueeze(1),
         size=(original_size[1], original_size[0]),
         mode="bicubic",
         align_corners=False,
-    ).squeeze()
+    ).squeeze(0).squeeze(0)

+    # Normalize to [0,1]
     d_min, d_max = depth.min(), depth.max()
-    if d_max …
+    if d_max > d_min:
         depth = (depth - d_min) / (d_max - d_min)
-    else:
-        depth = torch.zeros_like(depth)
     return depth

 @torch.no_grad()
-def …
+def run_lama(image_bgr, mask_float):
     if lama_model is None:
         return image_bgr

-    # First pass
-    img1 = run_local_lama(image_bgr, mask_float)
-
-    # Second pass with slightly larger mask
-    kernel = np.ones((9,9), np.uint8)
-    mask_dilated = cv2.dilate(mask_float, kernel, iterations=2)
-    img2 = run_local_lama(img1, mask_dilated)
-
-    return img2
-
-def run_local_lama(image_bgr, mask_float):
-    if lama_model is None:
-        return image_bgr
-
-    kernel = np.ones((5,5), np.uint8)
     mask_uint8 = (mask_float * 255).astype(np.uint8)
+    kernel = np.ones((7, 7), np.uint8)
     mask_dilated = cv2.dilate(mask_uint8, kernel, iterations=2)

     h, w = image_bgr.shape[:2]
     new_h = (h // 8) * 8
     new_w = (w // 8) * 8
-
     img_resized = cv2.resize(image_bgr, (new_w, new_h))
     mask_resized = cv2.resize(mask_dilated, (new_w, new_h), interpolation=cv2.INTER_NEAREST)

-    img_t = torch.from_numpy(img_resized).float().permute(2,0,1).unsqueeze(0)/255.0
-    img_t = img_t[:,[2,1,0]…
-    mask_t = torch.from_numpy(mask_resized).float().unsqueeze(0).unsqueeze(0)/255.0
+    img_t = torch.from_numpy(img_resized).float().permute(2, 0, 1).unsqueeze(0) / 255.0
+    img_t = img_t[:, [2, 1, 0]].to(device)  # BGR → RGB
+    mask_t = torch.from_numpy(mask_resized).float().unsqueeze(0).unsqueeze(0) / 255.0
     mask_t = (mask_t > 0.5).float().to(device)

     img_t = img_t * (1 - mask_t)
-    …
-    inpainted = cv2.resize(inpainted, (w, h))
-    return inpainted
+    inpainted = lama_model(img_t, mask_t)
+    result = (inpainted[0].permute(1, 2, 0).cpu().numpy() * 255).clip(0, 255).astype(np.uint8)
+    result = cv2.cvtColor(result, cv2.COLOR_RGB2BGR)
+    if (new_h, new_w) != (h, w):
+        result = cv2.resize(result, (w, h))
+    return result

 def make_anaglyph(left, right):
     l = np.array(left)
     r = np.array(right)
     ana = np.zeros_like(l)
-    ana[…
-    ana[…
-    ana[…
+    ana[:, :, 0] = l[:, :, 0]  # Red ← Left
+    ana[:, :, 1] = r[:, :, 1]  # Green ← Right
+    ana[:, :, 2] = r[:, :, 2]  # Blue ← Right
     return Image.fromarray(ana)

 # ==============================================================================
 # 4. MAIN PIPELINE
 # ==============================================================================
 @torch.no_grad()
-def stereo_pipeline(image_pil, divergence_percent, convergence_plane):
+def stereo_pipeline(image_pil, divergence_percent=3.2, convergence_plane=0.08):
     if image_pil is None:
         return None, None, None, None

     w, h = image_pil.size
     if w > 1920:
         ratio = 1920 / w
-        image_pil = image_pil.resize((int(w*ratio), int(h*ratio)), Image.LANCZOS)
+        image_pil = image_pil.resize((int(w * ratio), int(h * ratio)), Image.LANCZOS)
         w, h = image_pil.size

     # 1. Depth
-    …
-    depth_vis = (…
-    depth_image = Image.fromarray(depth_vis)
-
-    # 2. Disparity (square for better volume)
-    disp_raw = depth_tensor ** 2
-    disp_max = torch.quantile(disp_raw, 0.995)
-    disp_clipped = torch.clamp(disp_raw, max=disp_max)
+    depth = estimate_depth(image_pil)  # [H, W] in [0,1]
+    depth_vis = Image.fromarray((depth.cpu().numpy() * 255).astype(np.uint8))
+
+    # 2. Disparity (stronger volume with square)
+    disp_raw = depth ** 2
+    disp_clipped = torch.clamp(disp_raw, max=torch.quantile(disp_raw, 0.995))

-    # …
-    …
+    # 3. Shift
+    max_shift = w * (divergence_percent / 100.0)
+    shift_raw = disp_clipped * max_shift
+    shift_min, shift_max = shift_raw.min(), shift_raw.max()
     convergence_offset = shift_min + convergence_plane * (shift_max - shift_min)
-    …
+    final_shift = shift_raw - convergence_offset

-    print(f"…
+    print(f"Final shift range: {final_shift.min():.1f} to {final_shift.max():.1f} px")

-    # 4. Warp
-    …
+    # 4. Warp right eye
+    img_tensor = torch.from_numpy(np.array(image_pil)).float().to(device) / 255.0
+    img_tensor = img_tensor.permute(2, 0, 1).unsqueeze(0)  # [1,3,H,W]
+
+    shift_tensor = final_shift.unsqueeze(0).to(device)  # [1,H,W]
+    disp_tensor = disp_clipped.unsqueeze(0).to(device)

-    right_tensor, occlusion_mask = stereo_warper(…
+    right_tensor, occlusion_mask = stereo_warper(img_tensor, shift_tensor, disp_tensor)

-    # 5. …
-    right_bgr = cv2.cvtColor(…
-    mask_np = occlusion_mask.squeeze().cpu().numpy()
+    # 5. To numpy
+    right_np = (right_tensor.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
+    right_bgr = cv2.cvtColor(right_np, cv2.COLOR_RGB2BGR)
+    mask_np = occlusion_mask.squeeze().cpu().numpy()  # [H, W] for cv2 / PIL

-    # 6. …
-    right_filled_bgr = …
+    # 6. Inpaint occlusions
+    right_filled_bgr = run_lama(right_bgr, mask_np)
     right_filled = Image.fromarray(cv2.cvtColor(right_filled_bgr, cv2.COLOR_BGR2RGB))

     # 7. Outputs
     mask_vis = Image.fromarray((mask_np * 255).astype(np.uint8))
-    …
+
+    sbs = Image.new('RGB', (w * 2, h))
+    sbs.paste(image_pil, (0, 0))
+    sbs.paste(right_filled, (w, 0))
+
     anaglyph = make_anaglyph(image_pil, right_filled)

-    return …
+    return sbs, anaglyph, depth_vis, mask_vis

 # ==============================================================================
 # 5. GRADIO UI
 # ==============================================================================
-with gr.Blocks(title="2D → 3D Stereo …
-    gr.HTML("<h1 style='text-align:center;'>2D to 3D Stereo - Pro Quality</h1>")
-    gr.Markdown("Depth Anything V2 + Forward …
+with gr.Blocks(title="2D → 3D Stereo - Pro & Stable") as demo:
+    gr.HTML("<h1 style='text-align:center;'>2D to 3D Stereo - Pro Quality (Fixed & Stable)</h1>")
+    gr.Markdown("Depth Anything V2 + Safe Forward Warping + LaMa Inpainting")

     with gr.Row():
         with gr.Column(scale=1):
-            input_img = gr.Image(type="pil", label="Upload Image", height=…
+            input_img = gr.Image(type="pil", label="Upload Image", height=520)
             with gr.Accordion("Settings", open=True):
-                divergence = gr.Slider(0.5, 8.0, value=3.…
-                                       label="3D Strength (%)")
+                divergence = gr.Slider(0.5, 8.0, value=3.5, step=0.1, label="3D Strength (%)")
                 convergence = gr.Slider(0.0, 1.0, value=0.08, step=0.01,
-                                        label="Convergence Plane (0 = pop-out, 1 = deep…
+                                        label="Convergence Plane (0 = pop-out, 1 = deep)")
             btn = gr.Button("Generate 3D", variant="primary", size="lg")

         with gr.Column(scale=1):
-            out_anaglyph = gr.Image(label="Anaglyph (Red/Cyan Glasses)", height=…
-            out_sbs = gr.Image(label="Side-by-Side…
+            out_anaglyph = gr.Image(label="Anaglyph (Red/Cyan Glasses)", height=520)
+            out_sbs = gr.Image(label="Side-by-Side (Cross-eye / Parallel)", height=300)
             with gr.Row():
                 out_depth = gr.Image(label="Depth Map", height=200)
-                out_mask…
+                out_mask = gr.Image(label="Occlusion Mask", height=200)

-    btn.click(
-        …
+    btn.click(
+        fn=stereo_pipeline,
+        inputs=[input_img, divergence, convergence],
+        outputs=[out_sbs, out_anaglyph, out_depth, out_mask]
+    )

-    gr.Markdown("**Tip:** Red/Cyan glasses …
+    gr.Markdown("**Tip:** Use Red/Cyan glasses for anaglyph • Cross-eye or parallel view for SBS")

 if __name__ == "__main__":
     demo.launch(share=True)
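For reference, the shift math in `stereo_pipeline` maps normalized disparity to pixel offsets around a zero-parallax plane. A worked sketch with the slider's default 3.5 % strength (illustrative numbers, not from the commit):

```python
# Assumes disp_clipped = depth**2 clipped at its 99.5th percentile, spanning [0, 1].
w = 1600                       # image width after the 1920 px cap
divergence_percent = 3.5
convergence_plane = 0.08

max_shift = w * (divergence_percent / 100.0)                       # 56.0 px
shift_min, shift_max = 0.0, max_shift                              # full disparity range
offset = shift_min + convergence_plane * (shift_max - shift_min)   # 4.48 px

# final_shift = shift_raw - offset spans [-4.48, +51.52] px:
# disparities below the convergence plane recede behind the screen,
# everything nearer pops out (slider convention: 0 = pop-out, 1 = deep).
print(offset, max_shift - offset)
```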
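The pipeline can also be exercised without the Gradio UI. A hypothetical headless driver, assuming this file is importable as `app` and using a placeholder input path:

```python
from PIL import Image
import app  # model loading happens at import via load_models()

img = Image.open("input.jpg").convert("RGB")   # placeholder path
sbs, anaglyph, depth_vis, mask_vis = app.stereo_pipeline(
    img, divergence_percent=3.5, convergence_plane=0.08
)
sbs.save("sbs.png")
anaglyph.save("anaglyph.png")
depth_vis.save("depth.png")
mask_vis.save("occlusion_mask.png")
```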