Spaces:

enoky
/

2D-to-Stereo-3D

Running

App Files Files Community

enoky committed 14 days ago

Commit

be50bae

verified ·

1 Parent(s): b5cd334

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -135

app.py CHANGED Viewed

@@ -7,14 +7,13 @@ from PIL import Image
 from torch.autograd import Function
 from transformers import AutoModelForDepthEstimation, AutoImageProcessor
 from huggingface_hub import hf_hub_download
-import os
 # === DEVICE ===
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Running on device: {device}")
 # ==============================================================================
-# 1. FIXED FORWARD WARP WITH BILINEAR SPLATTING (Contiguous & Stable)
 # ==============================================================================
 class ForwardWarpFunction(Function):
     @staticmethod
@@ -26,7 +25,6 @@ class ForwardWarpFunction(Function):
         B, C, H, W = im0.shape
         im1 = torch.zeros_like(im0)
-        # Grid: [B, H, W]
         grid_y, grid_x = torch.meshgrid(
             torch.arange(H, device=im0.device, dtype=torch.float32),
             torch.arange(W, device=im0.device, dtype=torch.float32),
@@ -44,51 +42,44 @@ class ForwardWarpFunction(Function):
             x1 = x0 + 1
             y1 = y0 + 1
-            # Bilinear weights
-            w00 = (x1.float() - x_dest) * (y1.float() - y_dest)  # top-left
-            w10 = (x_dest - x0.float()) * (y1.float() - y_dest)  # top-right
-            w01 = (x1.float() - x_dest) * (y_dest - y0.float())  # bottom-left
-            w11 = (x_dest - x0.float()) * (y_dest - y0.float())  # bottom-right
-            # Clamp coordinates
             x0c = x0.clamp(0, W - 1)
             y0c = y0.clamp(0, H - 1)
             x1c = x1.clamp(0, W - 1)
             y1c = y1.clamp(0, H - 1)
-            valid = (x0 >= 0) & (x1 < W) & (y0 >= 0) & (y1 < H)  # [B, H, W]
-            # Ensure contiguous
             im0 = im0.contiguous()
-            valid = valid.unsqueeze(1).float()  # [B, 1, H, W]
             def splat(y_idx, x_idx, weight):
-                weight = (weight.unsqueeze(1) * valid).contiguous()          # [B,1,H,W]
-                values = (im0 * weight).reshape(B * C, -1)                    # [B*C, H*W]
-                # Compute flat indices: B,C,H,W → global index
-                b_idx = torch.arange(B, device=im0.device).view(B, 1, 1, 1)
-                c_idx = torch.arange(C, device=im0.device).view(1, C, 1, 1)
-                base = (b_idx * C * H * W + c_idx * H * W).expand(-1, -1, H, W)
-                idx = base + y_idx.unsqueeze(1) * W + x_idx.unsqueeze(1)
-                idx = idx.reshape(B * C, -1).contiguous()
-                im1.view(-1).scatter_add_(0, idx.view(-1), values.view(-1))
             splat(y0c, x0c, w00)
             splat(y0c, x1c, w10)
             splat(y1c, x0c, w01)
             splat(y1c, x1c, w11)
-        else:  # Nearest neighbor (fallback)
             x_nn = torch.round(x_dest).long().clamp(0, W - 1)
             y_nn = torch.round(y_dest).long().clamp(0, H - 1)
             b_idx = torch.arange(B, device=im0.device)[:, None, None, None]
             c_idx = torch.arange(C, device=im0.device)[None, :, None, None]
-            idx = (b_idx * C * H * W + c_idx * H * W + y_nn.unsqueeze(1) * W + x_nn.unsqueeze(1))
-            idx = idx.reshape(-1)
             valid = ((x_nn >= 0) & (x_nn < W) & (y_nn >= 0) & (y_nn < H)).unsqueeze(1)
             values = (im0 * valid.float()).reshape(-1)
@@ -112,7 +103,7 @@ class forward_warp(nn.Module):
 # ==============================================================================
-# 2. STEREO WARPER (Soft Z-Buffer Splatting)
 # ==============================================================================
 class ForwardWarpStereo(nn.Module):
     def __init__(self, eps=1e-6):
@@ -121,23 +112,19 @@ class ForwardWarpStereo(nn.Module):
         self.fw = forward_warp(interpolation_mode="Bilinear")
     def forward(self, im, disp, convergence, divergence):
-        disp = disp.squeeze(1)  # [B, H, W]
         shift = (disp - convergence) * divergence
         flow_x = -shift
-        flow = torch.zeros_like(flow_x)
-        flow = torch.stack([flow_x, flow_y], dim=-1)  # [B, H, W, 2]
-        # Soft Z-buffer weights (closer = higher weight)
         weights = (1.5) ** (disp - disp.min())
-        # Warp color * weight
         accum_color = self.fw(im * weights.unsqueeze(1), flow)
         accum_weight = self.fw(weights.unsqueeze(1), flow)
-        # Normalize
         result = accum_color / (accum_weight + self.eps)
-        # Occlusion mask (holes)
         ones = torch.ones_like(disp)
         occupancy = self.fw(ones.unsqueeze(1), flow)
         occlusion_mask = (occupancy < self.eps).float()
@@ -146,7 +133,7 @@ class ForwardWarpStereo(nn.Module):
 # ==============================================================================
-# 3. MODELS & PIPELINE
 # ==============================================================================
 def load_models():
     print("Loading Depth Anything V2 Large...")
@@ -158,30 +145,28 @@ def load_models():
         "depth-anything/Depth-Anything-V2-Large-hf"
     )
-    print("Loading LaMa Inpainting Model...")
     model_path = hf_hub_download(repo_id="fashn-ai/LaMa", filename="big-lama.pt")
     lama_model = torch.jit.load(model_path, map_location=device).eval()
     stereo_warper = ForwardWarpStereo().to(device)
     return depth_model, depth_processor, lama_model, stereo_warper
-# Load once at startup
 depth_model, depth_processor, lama_model, stereo_warper = load_models()
 @torch.inference_mode()
 def estimate_depth(image_pil):
-    original_size = image_pil.size
     inputs = depth_processor(images=image_pil, return_tensors="pt").to(device)
-    depth = depth_model(**inputs).predicted_depth  # [1, H, W]
     depth = torch.nn.functional.interpolate(
-        depth.unsqueeze(1),
-        size=(original_size[1], original_size[0]),
-        mode="bicubic",
-        align_corners=False,
     ).squeeze()
     depth = (depth - depth.min()) / (depth.max() - depth.min() + 1e-8)
@@ -193,148 +178,112 @@ def erode_depth(depth_tensor, kernel_size):
         return depth_tensor
     k = kernel_size if kernel_size % 2 == 1 else kernel_size + 1
     x = depth_tensor.unsqueeze(0).unsqueeze(0)
-    padding = k // 2
-    eroded = -torch.nn.functional.max_pool2d(-x, kernel_size=k, stride=1, padding=padding)
-    return eroded.squeeze()
 @torch.inference_mode()
 def run_local_lama(image_bgr, mask_float):
-    kernel = np.ones((3, 3), np.uint8)
-    mask_uint8 = (mask_float * 255).astype(np.uint8)
-    mask_dilated = cv2.dilate(mask_uint8, kernel, iterations=1)
     h, w = image_bgr.shape[:2]
-    new_h = (h // 8) * 8
-    new_w = (w // 8) * 8
     img_resized = cv2.resize(image_bgr, (new_w, new_h))
     mask_resized = cv2.resize(mask_dilated, (new_w, new_h), interpolation=cv2.INTER_NEAREST)
-    img_t = torch.from_numpy(img_resized).float().permute(2, 0, 1).unsqueeze(0) / 255.0
-    img_t = img_t[:, [2, 1, 0], :, :]  # BGR → RGB
-    mask_t = torch.from_numpy(mask_resized).float().unsqueeze(0).unsqueeze(0) / 255.0
-    mask_t = (mask_t > 0.5).float()
-    img_t = img_t.to(device)
-    mask_t = mask_t.to(device)
-    img_t = img_t * (1 - mask_t)
-    with torch.no_grad():
-        inpainted_t = lama_model(img_t, mask_t)
-    inpainted = inpainted_t[0].permute(1, 2, 0).cpu().numpy()
-    inpainted = np.clip(inpainted * 255, 0, 255).astype(np.uint8)
-    inpainted = cv2.cvtColor(inpainted, cv2.COLOR_RGB2BGR)
-    if (new_h != h) or (new_w != w):
-        inpainted = cv2.resize(inpainted, (w, h), interpolation=cv2.INTER_LANCZOS4)
-    return inpainted
 def make_anaglyph(left, right):
     l = np.array(left)
     r = np.array(right)
-    anaglyph = np.zeros_like(l)
-    anaglyph[:, :, 0] = l[:, :, 0]  # Red ← Left
-    anaglyph[:, :, 1] = r[:, :, 1]  # Green ← Right
-    anaglyph[:, :, 2] = r[:, :, 2]  # Blue ← Right
-    return Image.fromarray(anaglyph)
-# ==============================================================================
-# MAIN PIPELINE
-# ==============================================================================
 def stereo_pipeline(image_pil, divergence, convergence, edge_erosion):
     if image_pil is None:
         return None, None, None, None
-    # Resize if too large (HF Spaces limit)
-    w, h = image_pil.size
-    if w > 1920:
-        ratio = 1920 / w
-        image_pil = image_pil.resize((1920, int(h * ratio)), Image.LANCZOS)
-    # 1. Depth
     depth = estimate_depth(image_pil)
     if edge_erosion > 0:
         depth = erode_depth(depth, int(edge_erosion))
-    depth_vis = Image.fromarray((depth.cpu().numpy() * 255).astype(np.uint8))
-    # 2. Prepare tensors
-    img_tensor = torch.from_numpy(np.array(image_pil)).float().to(device)
-    img_tensor = img_tensor.permute(2, 0, 1).unsqueeze(0) / 255.0  # [1,3,H,W]
-    depth_tensor = depth.unsqueeze(0).unsqueeze(0)  # [1,1,H,W]
-    # 3. Stereo warp
     with torch.inference_mode():
-        right_tensor, mask_tensor = stereo_warper(
-            img_tensor, depth_tensor, float(convergence), float(divergence)
-        )
-    right_np = (right_tensor.squeeze(0).permute(1,2,0).cpu().numpy() * 255).astype(np.uint8)
-    mask_np = (mask_tensor.squeeze().cpu().numpy() * 255).astype(np.uint8)
-    # 4. Inpaint holes
     right_bgr = cv2.cvtColor(right_np, cv2.COLOR_RGB2BGR)
-    mask_float = mask_tensor.squeeze().cpu().numpy()
-    right_filled_bgr = run_local_lama(right_bgr, mask_float)
-    right_filled = Image.fromarray(cv2.cvtColor(right_filled_bgr, cv2.COLOR_BGR2RGB))
-    # 5. Outputs
     w, h = image_pil.size
-    sbs = Image.new("RGB", (w * 2, h))
-    sbs.paste(image_pil, (0, 0))
-    sbs.paste(right_filled, (w, 0))
-    anaglyph = make_anaglyph(image_pil, right_filled)
     return sbs, anaglyph, depth_vis, Image.fromarray(mask_np)
 # ==============================================================================
-# GRADIO UI
 # ==============================================================================
-css = ".gradio-container {max-width: 1400px !important; margin: auto !important;}"
-with gr.Blocks(css=css, title="2D → 3D Stereo (Depth Anything + Splatting)") as demo:
     gr.Markdown("# 2D to 3D Stereo Generator")
-    gr.Markdown("High-quality automatic stereo conversion using **Depth Anything V2**, **bilinear splatting with soft Z-buffer**, and **LaMa inpainting**.")
     with gr.Row():
-        with gr.Column(scale=1):
-            input_img = gr.Image(type="pil", label="Upload Image", height=400)
-            with gr.Accordion("3D Settings", open=True):
-                divergence_slider = gr.Slider(0, 100, value=30, step=1,
-                                              label="3D Strength (Divergence)",
-                                              info="Higher = stronger 3D pop-out")
-                convergence_slider = gr.Slider(0.0, 1.0, value=0.5, step=0.05,
-                                               label="Focus Plane (Convergence)",
-                                               info="0 = background at screen, 1 = foreground at screen")
-                erosion_slider = gr.Slider(0, 20, value=3, step=1,
-                                           label="Edge Cleanup (Depth Erosion)",
-                                           info="Reduces halos, 0 = raw")
-            btn = gr.Button("Generate 3D Stereo", variant="primary", size="lg")
-        with gr.Column(scale=1):
-            out_stereo = gr.Image(label="Side-by-Side Stereo Pair", height=400)
-            out_anaglyph = gr.Image(label="Anaglyph (Red/Cyan Glasses)", height=400)
-            with gr.Row():
-                out_depth = gr.Image(label="Estimated Depth Map", height=200)
-                out_mask = gr.Image(label="Inpainting Mask (Holes)", height=200)
-    btn.click(
-        fn=stereo_pipeline,
-        inputs=[input_img, divergence_slider, convergence_slider, erosion_slider],
-        outputs=[out_stereo, out_anaglyph, out_depth, out_mask]
-    )
-    gr.Markdown("Made with Depth Anything V2 • Bilinear Splatting • LaMa • Gradio")
 if __name__ == "__main__":
     demo.launch()

 from torch.autograd import Function
 from transformers import AutoModelForDepthEstimation, AutoImageProcessor
 from huggingface_hub import hf_hub_download
 # === DEVICE ===
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(f"Running on device: {device}")
 # ==============================================================================
+# 1. FIXED FORWARD WARP (Bilinear Splatting – fully contiguous)
 # ==============================================================================
 class ForwardWarpFunction(Function):
     @staticmethod
         B, C, H, W = im0.shape
         im1 = torch.zeros_like(im0)
         grid_y, grid_x = torch.meshgrid(
             torch.arange(H, device=im0.device, dtype=torch.float32),
             torch.arange(W, device=im0.device, dtype=torch.float32),
             x1 = x0 + 1
             y1 = y0 + 1
+            w00 = (x1.float() - x_dest) * (y1.float() - y_dest)
+            w10 = (x_dest - x0.float()) * (y1.float() - y_dest)
+            w01 = (x1.float() - x_dest) * (y_dest - y0.float())
+            w11 = (x_dest - x0.float()) * (y_dest - y0.float())
             x0c = x0.clamp(0, W - 1)
             y0c = y0.clamp(0, H - 1)
             x1c = x1.clamp(0, W - 1)
             y1c = y1.clamp(0, H - 1)
+            valid = (x0 >= 0) & (x1 < W) & (y0 >= 0) & (y1 < H)
             im0 = im0.contiguous()
+            valid = valid.unsqueeze(1).float()
             def splat(y_idx, x_idx, weight):
+                weight = (weight.unsqueeze(1) * valid).contiguous()
+                values = (im0 * weight).reshape(B * C, -1)
+                b_idx = torch.arange(B, device=im0.device)[:, None, None, None]
+                c_idx = torch.arange(C, device=im0.device)[None, :, None, None]
+                base = b_idx * C * H * W + c_idx * H * W
+                idx = (base + y_idx.unsqueeze(1) * W + x_idx.unsqueeze(1)).reshape(B * C, -1)
+                im1.view(-1).scatter_add_(0, idx.reshape(-1), values.reshape(-1))
             splat(y0c, x0c, w00)
             splat(y0c, x1c, w10)
             splat(y1c, x0c, w01)
             splat(y1c, x1c, w11)
+        else:  # Nearest neighbor fallback
             x_nn = torch.round(x_dest).long().clamp(0, W - 1)
             y_nn = torch.round(y_dest).long().clamp(0, H - 1)
             b_idx = torch.arange(B, device=im0.device)[:, None, None, None]
             c_idx = torch.arange(C, device=im0.device)[None, :, None, None]
+            idx = (b_idx * C * H * W + c_idx * H * W + y_nn.unsqueeze(1) * W + x_nn.unsqueeze(1)).reshape(-1)
             valid = ((x_nn >= 0) & (x_nn < W) & (y_nn >= 0) & (y_nn < H)).unsqueeze(1)
             values = (im0 * valid.float()).reshape(-1)
 # ==============================================================================
+# 2. STEREO WARPER
 # ==============================================================================
 class ForwardWarpStereo(nn.Module):
     def __init__(self, eps=1e-6):
         self.fw = forward_warp(interpolation_mode="Bilinear")
     def forward(self, im, disp, convergence, divergence):
+        disp = disp.squeeze(1)
         shift = (disp - convergence) * divergence
         flow_x = -shift
+        flow_y = torch.zeros_like(flow_x)
+        flow = torch.stack([flow_x, flow_y], dim=-1)
         weights = (1.5) ** (disp - disp.min())
         accum_color = self.fw(im * weights.unsqueeze(1), flow)
         accum_weight = self.fw(weights.unsqueeze(1), flow)
         result = accum_color / (accum_weight + self.eps)
         ones = torch.ones_like(disp)
         occupancy = self.fw(ones.unsqueeze(1), flow)
         occlusion_mask = (occupancy < self.eps).float()
 # ==============================================================================
+# 3. MODELS
 # ==============================================================================
 def load_models():
     print("Loading Depth Anything V2 Large...")
         "depth-anything/Depth-Anything-V2-Large-hf"
     )
+    print("Loading LaMa Inpainting...")
     model_path = hf_hub_download(repo_id="fashn-ai/LaMa", filename="big-lama.pt")
     lama_model = torch.jit.load(model_path, map_location=device).eval()
     stereo_warper = ForwardWarpStereo().to(device)
     return depth_model, depth_processor, lama_model, stereo_warper
 depth_model, depth_processor, lama_model, stereo_warper = load_models()
+# ==============================================================================
+# 4. PIPELINE
+# ==============================================================================
 @torch.inference_mode()
 def estimate_depth(image_pil):
+    w, h = image_pil.size
     inputs = depth_processor(images=image_pil, return_tensors="pt").to(device)
+    depth = depth_model(**inputs).predicted_depth
     depth = torch.nn.functional.interpolate(
+        depth.unsqueeze(1), size=(h, w), mode="bicubic", align_corners=False
     ).squeeze()
     depth = (depth - depth.min()) / (depth.max() - depth.min() + 1e-8)
         return depth_tensor
     k = kernel_size if kernel_size % 2 == 1 else kernel_size + 1
     x = depth_tensor.unsqueeze(0).unsqueeze(0)
+    return -torch.nn.functional.max_pool2d(-x, kernel_size=k, stride=1, padding=k//2).squeeze()
 @torch.inference_mode()
 def run_local_lama(image_bgr, mask_float):
     """Inpaint the masked regions of a BGR image with the module-level LaMa model.

     Args:
         image_bgr: H x W x 3 image array in BGR channel order (expected to be
             uint8; it is divided by 255 before being fed to the model).
         mask_float: H x W array marking the pixels to fill. Assumed to lie in
             [0, 1], since it is scaled by 255 and cast to uint8 -- confirm
             against the caller.

     Returns:
         A uint8 BGR array with the same height and width as ``image_bgr``.
     """
     kernel = np.ones((3, 3), np.uint8)
     # Dilate the hole mask once with a 3x3 kernel before inpainting.
     mask_dilated = cv2.dilate((mask_float * 255).astype(np.uint8), kernel, iterations=1)

     h, w = image_bgr.shape[:2]
     # Work at a size whose sides are multiples of 8. Clamp to at least 8 so a
     # side never collapses to 0 for tiny inputs, which cv2.resize rejects.
     new_h = max(8, (h // 8) * 8)
     new_w = max(8, (w // 8) * 8)
     img_resized = cv2.resize(image_bgr, (new_w, new_h))
     mask_resized = cv2.resize(mask_dilated, (new_w, new_h), interpolation=cv2.INTER_NEAREST)

     img_t = torch.from_numpy(img_resized).float().permute(2, 0, 1).unsqueeze(0) / 255.0
     img_t = img_t[:, [2, 1, 0]].to(device)  # BGR→RGB

     # Binarize, then cast back to float: the comparison yields a bool tensor,
     # and `1 - mask` raises a RuntimeError on bool tensors in PyTorch.
     mask_t = torch.from_numpy(mask_resized).float().unsqueeze(0).unsqueeze(0) / 255.0 > 0.5
     mask_t = mask_t.float().to(device)

     # Blank out the holes so the model only sees the known pixels.
     img_t = img_t * (1 - mask_t)
     inpainted = lama_model(img_t, mask_t)

     out = (inpainted[0].permute(1, 2, 0).cpu().numpy() * 255).clip(0, 255).astype(np.uint8)
     out = cv2.cvtColor(out, cv2.COLOR_RGB2BGR)
     # Restore the caller's resolution if the multiple-of-8 resize changed it.
     if (new_h, new_w) != (h, w):
         out = cv2.resize(out, (w, h), interpolation=cv2.INTER_LANCZOS4)
     return out
 def make_anaglyph(left, right):
     """Combine a left and a right view into a single red/cyan anaglyph.

     Channel 0 is copied from ``left`` and channels 1 and 2 from ``right``;
     the output has the shape and dtype of the left view's array.

     Returns:
         The merged picture as a PIL image.
     """
     left_px = np.array(left)
     right_px = np.array(right)
     merged = np.zeros_like(left_px)
     # (output channel, view it is taken from)
     for channel, source in ((0, left_px), (1, right_px), (2, right_px)):
         merged[:, :, channel] = source[:, :, channel]
     return Image.fromarray(merged)
 def stereo_pipeline(image_pil, divergence, convergence, edge_erosion):
     """Turn one PIL image into stereo outputs.

     Steps: optional downscale, depth estimation (optionally eroded), forward
     warp to a right-eye view, hole inpainting, then composition.

     Returns:
         A 4-tuple ``(side_by_side, anaglyph, depth_map, hole_mask)`` of PIL
         images, or four ``None`` values when no image was supplied.
     """
     if image_pil is None:
         return None, None, None, None

     # Downscale huge images: cap the width at 1920 px, keeping the aspect ratio.
     if image_pil.width > 1920:
         scale = 1920 / image_pil.width
         target_size = (1920, int(image_pil.height * scale))
         image_pil = image_pil.resize(target_size, Image.LANCZOS)

     depth = estimate_depth(image_pil)
     if edge_erosion > 0:
         depth = erode_depth(depth, int(edge_erosion))
     depth_vis = Image.fromarray((depth.cpu().numpy() * 255).astype(np.uint8))

     # Image array -> [1, C, H, W] float tensor scaled by 1/255; depth -> [1, 1, H, W].
     pixels = torch.from_numpy(np.array(image_pil)).float().to(device)
     img_t = pixels.permute(2, 0, 1).unsqueeze(0) / 255.0
     depth_t = depth.unsqueeze(0).unsqueeze(0)

     with torch.inference_mode():
         right_t, mask_t = stereo_warper(img_t, depth_t, float(convergence), float(divergence))

     right_np = (right_t[0].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
     holes = mask_t[0, 0].cpu().numpy()
     mask_np = (holes * 255).astype(np.uint8)

     # Inpainting works on BGR arrays; convert there and back.
     right_bgr = cv2.cvtColor(right_np, cv2.COLOR_RGB2BGR)
     right_filled = run_local_lama(right_bgr, holes)
     right_pil = Image.fromarray(cv2.cvtColor(right_filled, cv2.COLOR_BGR2RGB))

     # Side-by-side: original on the left half, generated view on the right half.
     w, h = image_pil.size
     sbs = Image.new("RGB", (w * 2, h))
     sbs.paste(image_pil, (0, 0))
     sbs.paste(right_pil, (w, 0))

     anaglyph = make_anaglyph(image_pil, right_pil)
     return sbs, anaglyph, depth_vis, Image.fromarray(mask_np)
 # ==============================================================================
+# GRADIO UI (compatible with latest Gradio)
 # ==============================================================================
+with gr.Blocks(title="2D to 3D Stereo – Depth Anything + Splatting") as demo:
+    gr.HTML("<style>.gradio-container {max-width: 1400px !important; margin: auto !important;}</style>")
     gr.Markdown("# 2D to 3D Stereo Generator")
+    gr.Markdown("Depth Anything V2 + Bilinear Splatting + LaMa Inpainting → beautiful 3D")
     with gr.Row():
+        with gr.Column():
+            inp = gr.Image(type="pil", label="Input Image", height=400)
+            with gr.Accordion("Settings", open=True):
+                div = gr.Slider(0, 100, 30, step=1, label="3D Strength (Divergence)")
+                conv = gr.Slider(0.0, 1.0, 0.5, step=0.05, label="Focus Plane (Convergence)")
+                ero = gr.Slider(0, 20, 3, step=1, label="Edge Cleanup (Erosion)")
+            btn = gr.Button("Generate 3D", variant="primary")
+        with gr.Column():
+            out_sbs = gr.Image(label="Side-by-Side", height=400)
+            out_ana = gr.Image(label="Anaglyph (Red/Cyan)", height=400)
+            with gr.Row():
+                out_depth = gr.Image(label="Depth Map")
+                out_mask  = gr.Image(label="Holes Mask")
+    btn.click(stereo_pipeline, [inp, div, conv, ero], [out_sbs, out_ana, out_depth, out_mask])
 if __name__ == "__main__":
     demo.launch()