Update app.py
app.py CHANGED
@@ -7,15 +7,11 @@ from PIL import Image
 from torch.autograd import Function
 from transformers import AutoModelForDepthEstimation, AutoImageProcessor
 from huggingface_hub import hf_hub_download
-import os

-# === DEVICE ===
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print(f"Running on
+print(f"Running on {device}")

-# ==============================================================================
-# 1. FORWARD WARP (unchanged)
-# ==============================================================================
+# ==================== 1. FORWARD WARP (unchanged) ====================
 class ForwardWarpFunction(Function):
     @staticmethod
     def forward(ctx, im0, flow, interpolation_mode_int):
@@ -30,270 +26,202 @@ class ForwardWarpFunction(Function):
         x_dest = grid_x + flow[:, :, :, 0]
         y_dest = grid_y + flow[:, :, :, 1]

-        x_f = torch.floor(x_dest).long()
-        y_f = torch.floor(y_dest).long()
-        x_c = x_f + 1
-        y_c = y_f + 1
+        x_f = torch.floor(x_dest).long(); x_c = x_f + 1
+        y_f = torch.floor(y_dest).long(); y_c = y_f + 1

         nw_k = (x_c.float() - x_dest) * (y_c.float() - y_dest)
         ne_k = (x_dest - x_f.float()) * (y_c.float() - y_dest)
         sw_k = (x_c.float() - x_dest) * (y_dest - y_f.float())
         se_k = (x_dest - x_f.float()) * (y_dest - y_f.float())

-        x_f_clamped = torch.clamp(x_f, 0, W - 1)
-        y_f_clamped = torch.clamp(y_f, 0, H - 1)
-        x_c_clamped = torch.clamp(x_c, 0, W - 1)
-        y_c_clamped = torch.clamp(y_c, 0, H - 1)
+        x_f_clamped = torch.clamp(x_f, 0, W - 1); x_c_clamped = torch.clamp(x_c, 0, W - 1)
+        y_f_clamped = torch.clamp(y_f, 0, H - 1); y_c_clamped = torch.clamp(y_c, 0, H - 1)

         mask_nw = (x_f >= 0) & (x_f < W) & (y_f >= 0) & (y_f < H)
         mask_ne = (x_c >= 0) & (x_c < W) & (y_f >= 0) & (y_f < H)
         mask_sw = (x_f >= 0) & (x_f < W) & (y_c >= 0) & (y_c < H)
         mask_se = (x_c >= 0) & (x_c < W) & (y_c >= 0) & (y_c < H)

-        mask_nw = mask_nw.unsqueeze(1); mask_ne = mask_ne.unsqueeze(1)
-        mask_sw = mask_sw.unsqueeze(1); mask_se = mask_se.unsqueeze(1)
-        im1.reshape(-1).scatter_add_(0,
+        for wk, m in [(nw_k, mask_nw), (ne_k, mask_ne), (sw_k, mask_sw), (se_k, mask_se)]:
+            wk.unsqueeze_(1); m.unsqueeze_(1)
+
+        b_idx = torch.arange(B, device=im0.device).view(B, 1, 1, 1).expand(-1, C, H, W)
+        c_idx = torch.arange(C, device=im0.device).view(1, C, 1, 1).expand(B, -1, H, W)
+        base = b_idx * (C * H * W) + c_idx * (H * W)
+
+        def scatter(y_idx, x_idx, weights, mask):
+            flat = base + y_idx.unsqueeze(1) * W + x_idx.unsqueeze(1)
+            val = (im0 * weights) * mask.float()
+            im1.reshape(-1).scatter_add_(0, flat.reshape(-1), val.reshape(-1))
+
+        scatter(y_f_clamped, x_f_clamped, nw_k, mask_nw)
+        scatter(y_f_clamped, x_c_clamped, ne_k, mask_ne)
+        scatter(y_c_clamped, x_f_clamped, sw_k, mask_sw)
+        scatter(y_c_clamped, x_c_clamped, se_k, mask_se)

         return im1

     @staticmethod
-    def backward(ctx, grad_output):
-        return None, None, None
+    def backward(ctx, grad_output): return None, None, None

 class forward_warp(nn.Module):
-    def forward(self, im0, flow):
-        return ForwardWarpFunction.apply(im0, flow, 0)
+    def forward(self, im0, flow): return ForwardWarpFunction.apply(im0, flow, 0)

-# ==============================================================================
-# 2. STEREO WARPER - FIXED + SMART DILATION
-# ==============================================================================
+# ==================== 2. STEREO WARPER (fixed + safe dilation) ====================
 class ForwardWarpStereo(nn.Module):
-    def __init__(self
-        flow_x = -shift
-        flow_y = torch.zeros_like(flow_x)
-        flow = torch.stack((flow_x, flow_y), dim=-1)
-
-        # Fixed z-buffer weights (no detached limbs)
-        disp_norm = disp_for_weights / (disp_for_weights.max() + 1e-8)
-        weights_map = disp_norm + 0.05
-
-        res_accum = self.fw(im * weights_map.unsqueeze(1), flow)
-        mask_accum = self.fw(weights_map.unsqueeze(1), flow)
-        mask_accum.clamp_(min=self.eps)
-        res = res_accum / mask_accum
-
-        ones = torch.ones_like(im[:, 0:1, :, :])
-        occupancy = self.fw(ones, flow)
-        occlusion_mask = (occupancy < self.eps).float()
-
-        # Smart foreground-preserving dilation
-        with torch.no_grad():
-            fg_thresh = torch.quantile(disp_for_weights, 0.88)
-            fg_mask = (disp_for_weights > fg_thresh).float().unsqueeze(0)
-            k = 15
-            dilated = torch.nn.functional.conv2d(
-                occlusion_mask, torch.ones(1, 1, k, k, device=occlusion_mask.device),
-                padding=k // 2) > 0.1
-            safe_dilation = dilated.float() * (1 - fg_mask)
-            occlusion_mask = torch.clamp(occlusion_mask + safe_dilation, 0, 1)
-
-        return res, occlusion_mask
+    def __init__(self): super().__init__()
+    def forward(self, im, shift, disp):
+        flow = torch.stack((-shift, torch.zeros_like(shift)), dim=-1)
+
+        # Fixed linear weights - no more detached arms
+        weights = disp / (disp.max() + 1e-8) + 0.05
+
+        warped = forward_warp()(im * weights.unsqueeze(1), flow)
+        wmap = forward_warp()(weights.unsqueeze(1), flow)
+        wmap.clamp_(min=1e-6)
+        res = warped / wmap
+
+        occ = forward_warp()(torch.ones_like(im[:, :1]), flow) < 1e-6
+
+        # Smart dilation that never eats foreground
+        with torch.no_grad():
+            fg = (disp > disp.quantile(0.88)).float().unsqueeze(0)
+            dilated = torch.nn.functional.conv2d(
+                occ.float(), torch.ones(1, 1, 15, 15, device=device), padding=7) > 0.1
+            occ = torch.clamp(occ.float() + dilated * (1 - fg), 0, 1)
+
+        return res, occ

-# ==============================================================================
-# 3. MODELS & HELPERS
-# ==============================================================================
-def load_models():
-    print("Loading Depth Anything V2 Large...")
-    depth_model = AutoModelForDepthEstimation.from_pretrained(
-        "depth-anything/Depth-Anything-V2-Large-hf"
-    ).to(device)
-    depth_processor = AutoImageProcessor.from_pretrained(
-        "depth-anything/Depth-Anything-V2-Large-hf"
-    )
-
-    print("Loading LaMa Inpainting Model...")
-    try:
-        model_path = hf_hub_download(repo_id="fashn-ai/LaMa", filename="big-lama.pt")
-        lama_model = torch.jit.load(model_path, map_location=device)
-        lama_model.eval()
-    except Exception as e:
-        print(f"LaMa load failed: {e}")
-        lama_model = None
-
-    stereo_warper = ForwardWarpStereo().to(device)
-    return depth_model, depth_processor, lama_model, stereo_warper
-
-depth_model, depth_processor, lama_model, stereo_warper = load_models()
+stereo_warper = ForwardWarpStereo().to(device)
+
+# ==================== 3. MODELS ====================
+print("Loading Depth Anything V2 Large...")
+depth_model = AutoModelForDepthEstimation.from_pretrained(
+    "depth-anything/Depth-Anything-V2-Large-hf").to(device)
+processor = AutoImageProcessor.from_pretrained(
+    "depth-anything/Depth-Anything-V2-Large-hf")
+
+print("Loading LaMa...")
+try:
+    lama_path = hf_hub_download("fashn-ai/LaMa", "big-lama.pt")
+    lama_model = torch.jit.load(lama_path, map_location=device).eval()
+except Exception:
+    lama_model = None
+    print("LaMa not available - inpainting will be skipped")

+# ==================== 4. HELPERS ====================
 @torch.no_grad()
-def estimate_depth(
-    depth = (depth - d_min) / (d_max - d_min + 1e-8) if d_max > d_min else torch.zeros_like(depth)
-    return depth
+def estimate_depth(img_pil):
+    inputs = processor(images=img_pil, return_tensors="pt").to(device)
+    d = depth_model(**inputs).predicted_depth
+    d = torch.nn.functional.interpolate(d.unsqueeze(1), size=img_pil.size[::-1],
+                                        mode="bicubic", align_corners=False).squeeze()
+    d = (d - d.min()) / (d.max() - d.min() + 1e-8)
+    return d
+
+def safe_dilate(mask_np, k=5, it=2):
+    if mask_np.sum() == 0: return mask_np
+    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (k, k))
+    return cv2.dilate(mask_np, kernel, iterations=it)

 @torch.no_grad()
-    if lama_model is None
-    mask_uint8 = (mask_float * 255).astype(np.uint8)
-    mask_dilated = cv2.dilate(mask_uint8, kernel, iterations=2)
-
-    h, w = image_bgr.shape[:2]
-    new_h, new_w = (h // 8) * 8, (w // 8) * 8
-    img_resized = cv2.resize(image_bgr, (new_w, new_h))
-    mask_resized = cv2.resize(mask_dilated, (new_w, new_h), interpolation=cv2.INTER_NEAREST)
-
-    img_t = torch.from_numpy(img_resized).float().permute(2,0,1).unsqueeze(0)/255.0
-    img_t = img_t[:,[2,1,0],:,:].to(device)
-    mask_t = torch.from_numpy(mask_resized).float().unsqueeze(0).unsqueeze(0)/255.0
-    mask_t = (mask_t > 0.5).float().to(device)
-
-    inpainted_t = lama_model(img_t, mask_t)
-
-    inpainted = cv2.resize(inpainted, (w, h))
-    return inpainted
+def lama_inpaint(img_bgr, mask_np):
+    if lama_model is None or mask_np.sum() == 0:
+        return img_bgr
+
+    mask_dil = safe_dilate((mask_np * 255).astype(np.uint8), k=7, it=3) / 255.0
+
+    h, w = img_bgr.shape[:2]
+    nh, nw = (h // 8) * 8, (w // 8) * 8
+    img_res = cv2.resize(img_bgr, (nw, nh))
+    mask_res = cv2.resize(mask_dil, (nw, nh), interpolation=cv2.INTER_NEAREST)
+
+    img_t = torch.from_numpy(img_res).float().permute(2, 0, 1).unsqueeze(0) / 255.0
+    img_t = img_t[:, [2, 1, 0]].to(device)
+    mask_t = torch.from_numpy(mask_res > 0.5).float().unsqueeze(0).unsqueeze(0).to(device)
+
+    img_t = img_t * (1 - mask_t)
+    out = lama_model(img_t, mask_t)[0].permute(1, 2, 0).cpu().numpy()
+    out = np.clip(out * 255, 0, 255).astype(np.uint8)
+    out = cv2.cvtColor(out, cv2.COLOR_RGB2BGR)
+    if (nh, nw) != (h, w):
+        out = cv2.resize(out, (w, h))
+    return out
+
+def make_anaglyph(l, r):
+    l = np.array(l); r = np.array(r)
     a = np.zeros_like(l)
-    a[:, :, 0] = l[:, :, 0]
-    a[:, :, 1] = r[:, :, 1]
-    a[:, :, 2] = r[:, :, 2]
+    a[..., 0] = l[..., 0]
+    a[..., 1] = r[..., 1]
+    a[..., 2] = r[..., 2]
     return Image.fromarray(a)

-# ==============================================================================
-# 4. MAIN PIPELINE
-# ==============================================================================
+# ==================== 5. MAIN PIPELINE ====================
 @torch.no_grad()
-def stereo_pipeline(
-        return None, None, None, None
+def stereo_pipeline(img_pil, strength=3.2, convergence=0.08):
+    if img_pil is None: return None, None, None, None

-    w, h = image_pil.size
+    w, h = img_pil.size
     if w > 1920:
-        ratio = 1920
-        w, h = image_pil.size
+        ratio = 1920 / w
+        img_pil = img_pil.resize((int(w * ratio), int(h * ratio)), Image.LANCZOS)
+        w, h = img_pil.size

-    depth_tensor = estimate_depth(image_pil, depth_model, depth_processor)
-    depth_vis = Image.fromarray((depth_tensor.cpu().numpy() * 255).astype(np.uint8))
-    disp_clipped = torch.clamp(disp_raw, max=disp_max)
-    convergence_offset = shift_min + convergence_plane * (shift_max - shift_min)
-    final_shift_pixels = shift_pixels_raw - convergence_offset
+    depth = estimate_depth(img_pil)
+    disp = torch.clamp(depth**2, max=torch.quantile(depth**2, 0.995))
+
+    max_shift = w * strength / 100.0
+    shift = disp * max_shift
+    shift = shift - shift.min() - convergence * (shift.max() - shift.min())
+
+    tensor = torch.from_numpy(np.array(img_pil)).float().to(device) / 255.0
+    tensor = tensor.permute(2, 0, 1).unsqueeze(0)

-        image_tensor,
-        final_shift_pixels.unsqueeze(0).to(device),
-        disp_clipped.unsqueeze(0).to(device)
-    )
+    right, occ = stereo_warper(tensor, shift.unsqueeze(0), disp.unsqueeze(0))

-    right_bgr = cv2.cvtColor(
-    mask_np =
+    right_np = (right.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
+    right_bgr = cv2.cvtColor(right_np, cv2.COLOR_RGB2BGR)
+    mask_np = occ.squeeze().cpu().numpy()

-    right_filled =
+    # Two-pass LaMa (safe + perfect edges)
+    right_filled = lama_inpaint(right_bgr, mask_np)
+    right_filled = lama_inpaint(right_filled, mask_np)  # second pass
+
+    right_pil = Image.fromarray(cv2.cvtColor(right_filled, cv2.COLOR_BGR2RGB))
+
+    sbs = Image.new("RGB", (w * 2, h))
+    sbs.paste(img_pil, (0, 0))
+    sbs.paste(right_pil, (w, 0))
+
+    ana = make_anaglyph(img_pil, right_pil)
+    depth_vis = Image.fromarray((depth.cpu().numpy() * 255).astype(np.uint8))
+    mask_vis = Image.fromarray((mask_np * 255).astype(np.uint8))

-    return
+    return sbs, ana, depth_vis, mask_vis

-# ==============================================================================
-css = """
-.gradio-container {max-width: 1450px !important; margin: auto !important;}
-"""
-
-with gr.Blocks() as demo:  # ← removed css= argument
-    gr.HTML(f"<style>{css}</style>")  # ← inject CSS here instead
-    gr.HTML("<h1 style='text-align:center;'>2D → 3D Stereo - Pro Quality</h1>")
-    gr.Markdown("Depth Anything V2 + Forward Warp + Smart LaMa Inpainting")
+# ==================== 6. GRADIO UI ====================
+css = ".gradio-container {max-width: 1450px !important; margin: auto !important;}"
+with gr.Blocks() as demo:
+    gr.HTML(f"<style>{css}</style>")
+    gr.Markdown("# 2D → 3D Stereo - Pro Quality\nDepth Anything V2 + Forward Warp + Smart LaMa")

     with gr.Row():
-        with gr.Column(
+        with gr.Column():
+            inp = gr.Image(type="pil", label="Upload Image", height=520)
             with gr.Accordion("Settings", open=True):
-                label="Convergence Plane (0 = pop-out, 1 = deep-in)")
-            btn = gr.Button("Generate 3D", variant="primary", size="lg")
+                strength = gr.Slider(0.5, 8, 3.2, step=0.1, label="3D Strength (%)")
+                conv = gr.Slider(0, 1, 0.08, step=0.01, label="Convergence (0=pop-out)")
+            btn = gr.Button("Generate 3D", variant="primary")

-        with gr.Column(
-            out_sbs = gr.Image(label="Side-by-Side
+        with gr.Column():
+            out_ana = gr.Image(label="Anaglyph (Red/Cyan)", height=520)
+            out_sbs = gr.Image(label="Side-by-Side", height=320)
             with gr.Row():
+                out_depth = gr.Image(label="Depth Map", height=200)
+                out_mask = gr.Image(label="Mask", height=200)

-    btn.click(
-              outputs=[out_sbs, out_anaglyph, out_depth, out_mask])
+    btn.click(stereo_pipeline, [inp, strength, conv],
+              [out_sbs, out_ana, out_depth, out_mask])

-    gr.Markdown("**
+    gr.Markdown("**Red/Cyan glasses** → anaglyph • **Cross-eye/parallel** → side-by-side")

-demo.launch(share=True)
+demo.launch(share=True)
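The rewritten ForwardWarpFunction.forward splats each source pixel into its four destination neighbours through flattened indices and scatter_add_. A minimal, self-contained sketch of that splatting technique (toy 4x4 shapes, single channel, NW corner only; names and sizes are illustrative, not the app's exact code):

import torch

# Bilinear forward-splat, NW corner only, on a 1x1x4x4 toy image.
# Single channel keeps the flat indexing trivial; the app adds
# batch/channel offsets via its "base" tensor.
B, C, H, W = 1, 1, 4, 4
im0 = torch.arange(H * W, dtype=torch.float32).reshape(B, C, H, W)
flow = torch.zeros(B, H, W, 2)
flow[..., 0] = 1.5                                     # push every pixel 1.5 px right

grid_y, grid_x = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
x_dest = grid_x + flow[..., 0]
y_dest = grid_y + flow[..., 1]
x_f = torch.floor(x_dest).long()
y_f = torch.floor(y_dest).long()

nw_k = (x_f + 1 - x_dest) * (y_f + 1 - y_dest)         # weight for the NW neighbour
mask = (x_f >= 0) & (x_f < W) & (y_f >= 0) & (y_f < H) # drop out-of-frame targets
flat = (y_f.clamp(0, H - 1) * W + x_f.clamp(0, W - 1)).reshape(-1)

im1 = torch.zeros_like(im0)
vals = im0.reshape(-1) * nw_k.reshape(-1) * mask.reshape(-1).float()
im1.reshape(-1).scatter_add_(0, flat, vals)            # accumulate, don't overwrite
print(im1.reshape(H, W))

The full version repeats this for all four corners and, as ForwardWarpStereo does, divides the splatted image by a splatted weight map so overlapping contributions renormalise.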
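The convergence control in stereo_pipeline is a single re-centring step: shift - shift.min() - convergence * (shift.max() - shift.min()). A quick numeric check of that formula with made-up disparities (values are illustrative only):

import torch

disp = torch.tensor([0.0, 0.25, 0.6, 1.0])      # toy normalised disparities
max_shift = 1920 * 3.2 / 100.0                  # w * strength / 100, as in the app
shift = disp * max_shift                        # [0.00, 15.36, 36.86, 61.44]
convergence = 0.08
shift = shift - shift.min() - convergence * (shift.max() - shift.min())
print(shift)                                    # [-4.92, 10.44, 31.95, 56.52]

Whatever disparity lands at zero sits on the screen plane; the sign of the re-centred shift decides whether a region reads as in front of the screen or behind it.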
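Because btn.click wires stereo_pipeline straight into the UI, the same function can be exercised headlessly once the module has loaded its models; a sketch, where "input.jpg" is a stand-in path:

from PIL import Image

# Hypothetical headless call to the app's pipeline; assumes this file has
# been imported (models already loaded) and an input image exists on disk.
img = Image.open("input.jpg").convert("RGB")
sbs, ana, depth_vis, mask_vis = stereo_pipeline(img, strength=3.2, convergence=0.08)
sbs.save("side_by_side.png")
ana.save("anaglyph.png")

For that use, the trailing demo.launch(share=True) would need to be skipped or guarded.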