switch to Depth Anything V2 Large

app.py CHANGED
@@ -3,7 +3,7 @@ import torch
 import numpy as np
 import cv2
 from PIL import Image
-from transformers import
+from transformers import AutoModelForDepthEstimation, AutoImageProcessor
 from huggingface_hub import hf_hub_download
 import os
 
@@ -13,22 +13,23 @@ print(f"Running on device: {device}")
 
 # === LOAD MODELS ===
 def load_models():
-    print("Loading Depth
-    # 1. Depth Model
-
-
+    print("Loading Depth Anything V2 Large...")
+    # 1. Depth Model (Depth Anything V2 Large)
+    # We use AutoModel to automatically load the correct architecture
+    depth_model = AutoModelForDepthEstimation.from_pretrained(
+        "depth-anything/Depth-Anything-V2-Large-hf"
+    ).to(device)
+    depth_processor = AutoImageProcessor.from_pretrained(
+        "depth-anything/Depth-Anything-V2-Large-hf"
+    )
 
     print("Loading LaMa Inpainting Model...")
     # 2. LaMa Inpainting Model (TorchScript)
-    # We download the .pt file directly from a repository that hosts the compiled JIT version.
    try:
        model_path = hf_hub_download(repo_id="fashn-ai/LaMa", filename="big-lama.pt")
-
        print(f"Loading LaMa from: {model_path}")
-        # Load the TorchScript model
        lama_model = torch.jit.load(model_path, map_location=device)
        lama_model.eval()
-
    except Exception as e:
        print(f"Error loading LaMa model: {e}")
        raise e
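Note: the new loader can be smoke-tested in isolation. A minimal sketch using only the classes and checkpoint named in this hunk (the dummy image and variable names are illustrative):

import torch
from PIL import Image
from transformers import AutoModelForDepthEstimation, AutoImageProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForDepthEstimation.from_pretrained(
    "depth-anything/Depth-Anything-V2-Large-hf"
).to(device)
processor = AutoImageProcessor.from_pretrained("depth-anything/Depth-Anything-V2-Large-hf")

img = Image.new("RGB", (640, 480))  # any PIL image works here
inputs = processor(images=img, return_tensors="pt").to(device)
with torch.no_grad():
    # predicted_depth comes back at the model's working resolution, not 640x480
    print(model(**inputs).predicted_depth.shape)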
@@ -42,9 +43,14 @@ depth_model, depth_processor, lama_model = load_models()
 @torch.no_grad()
 def estimate_depth(image_pil, model, processor):
     original_size = image_pil.size
+
+    # Preprocess image
     inputs = processor(images=image_pil, return_tensors="pt").to(device)
+
+    # Inference
     depth = model(**inputs).predicted_depth
 
+    # Interpolate depth back to ORIGINAL image size
     depth = torch.nn.functional.interpolate(
         depth.unsqueeze(1),
         size=(original_size[1], original_size[0]),
@@ -52,6 +58,7 @@ def estimate_depth(image_pil, model, processor):
         align_corners=False,
     ).squeeze().detach().cpu().numpy()
 
+    # Normalize depth to 0-1 range
     depth_min, depth_max = depth.min(), depth.max()
     if depth_max - depth_min > 0:
         return (depth - depth_min) / (depth_max - depth_min)
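Note: the swapped indices in size=(original_size[1], original_size[0]) are deliberate: PIL's .size is (width, height) while torch interpolation expects (height, width). A minimal sketch of that round trip plus the 0-1 normalization (the 518x518 working resolution and bicubic mode are assumptions, not shown in this diff):

import torch
import torch.nn.functional as F

w, h = 640, 480                    # PIL convention: image.size == (width, height)
pred = torch.rand(1, 1, 518, 518)  # stand-in for predicted_depth.unsqueeze(1)
depth = F.interpolate(pred, size=(h, w), mode="bicubic", align_corners=False)
depth = depth.squeeze().numpy()    # shape (480, 640), matches the input image

dmin, dmax = depth.min(), depth.max()
if dmax - dmin > 0:
    depth = (depth - dmin) / (dmax - dmin)  # 0-1, ready for the shift map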
@@ -65,8 +72,7 @@ def generate_right_and_mask(image, shift_map):
     target_x = x_coords - shift
 
     right = np.zeros_like(image)
-    # Mask: 1
-    # Initialize as all holes (255)
+    # Mask: 1.0 means HOLE/MISSING info
     mask = np.ones((height, width), dtype=np.float32)
 
     valid_mask = (target_x >= 0) & (target_x < width)
@@ -75,7 +81,7 @@ def generate_right_and_mask(image, shift_map):
     flat_x_source = x_coords[valid_mask]
 
     right[flat_y, flat_x_target] = image[flat_y, flat_x_source]
-    # Mark written pixels as valid (0)
+    # Mark written pixels as valid (0.0)
     mask[flat_y, flat_x_target] = 0.0
 
     return right, mask
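Note: the warp is a plain scatter: each source column lands at x - shift, the mask starts as all holes (1.0), and only written targets are cleared to 0.0, so disoccluded columns stay marked for inpainting. A tiny sketch of the same fancy-indexing scheme on a 1x4 single-channel row (values illustrative):

import numpy as np

image = np.array([[10, 20, 30, 40]])
shift = np.array([[0, 1, 1, 2]])     # per-pixel disparity in columns
y, x = np.indices(image.shape)
target_x = x - shift                 # [[0, 0, 1, 1]]

right = np.zeros_like(image)
mask = np.ones(image.shape, dtype=np.float32)  # 1.0 = hole

valid = (target_x >= 0) & (target_x < image.shape[1])
right[y[valid], target_x[valid]] = image[y[valid], x[valid]]
mask[y[valid], target_x[valid]] = 0.0

print(right)  # [[20 40  0  0]] -- columns 2 and 3 are holes for LaMa
print(mask)   # [[0. 0. 1. 1.]]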
@@ -89,8 +95,7 @@ def run_local_lama(image_bgr, mask_float):
         mask_float: HxW float32 numpy array (1.0 = hole, 0.0 = valid)
     """
     # 0. Dilate Mask (Fixes smearing/streaking)
-    # We expand the "hole" area
-    # created by the pixel shift. This forces LaMa to regenerate the boundary.
+    # We expand the "hole" area to cover jagged edges
     kernel = np.ones((5, 5), np.uint8)
     mask_uint8 = (mask_float * 255).astype(np.uint8)
     mask_dilated = cv2.dilate(mask_uint8, kernel, iterations=1)
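Note: with a 5x5 kernel and one iteration, cv2.dilate grows every hole by 2 px in each direction, which is what pushes the inpainting boundary past the jagged warp edge. A minimal sketch:

import numpy as np
import cv2

mask = np.zeros((7, 7), np.uint8)
mask[3, 3] = 255                           # a single hole pixel
kernel = np.ones((5, 5), np.uint8)
dilated = cv2.dilate(mask, kernel, iterations=1)
print((dilated > 0).sum())                 # 25: the hole now covers a 5x5 patch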
@@ -104,23 +109,18 @@ def run_local_lama(image_bgr, mask_float):
     mask_resized = cv2.resize(mask_dilated, (new_w, new_h), interpolation=cv2.INTER_NEAREST)
 
     # 2. Convert to Torch Tensors
-    # Image: (1, 3, H, W), RGB, 0-1
     img_t = torch.from_numpy(img_resized).float().permute(2, 0, 1).unsqueeze(0) / 255.0
     # Swap BGR to RGB
     img_t = img_t[:, [2, 1, 0], :, :]
 
-    # Mask: (1, 1, H, W), 0-1
     mask_t = torch.from_numpy(mask_resized).float().unsqueeze(0).unsqueeze(0) / 255.0
-    # Binary threshold just in case
     mask_t = (mask_t > 0.5).float()
 
     img_t = img_t.to(device)
     mask_t = mask_t.to(device)
 
     # 3. Inference
-
-    img_t = img_t * (1 - mask_t)
-
+    img_t = img_t * (1 - mask_t)  # Zero out holes
     inpainted_t = lama_model(img_t, mask_t)
 
     # 4. Post-process
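Note: the calling convention here is an image as a 0-1 RGB tensor of shape (1, 3, H, W) and a binary (1, 1, H, W) mask with 1.0 marking pixels to fill, with holes zeroed in the image before the forward pass so stale warp pixels cannot leak through. A shape walkthrough with dummy data (the actual model call is commented out; H and W being multiples of 8 is an assumption carried over from the resize step not shown in this hunk):

import torch

h, w = 512, 512
img_bgr = torch.rand(1, 3, h, w)                # stand-in for the resized 0-1 tensor
mask = (torch.rand(1, 1, h, w) > 0.9).float()   # 1.0 = hole, 0.0 = keep

img_rgb = img_bgr[:, [2, 1, 0], :, :]  # BGR -> RGB by reordering the channel axis
img_rgb = img_rgb * (1 - mask)         # zero out holes before inference
# inpainted = lama_model(img_rgb, mask)              # TorchScript call from the app
# out = inpainted[0].permute(1, 2, 0).cpu().numpy()  # back to HWC for OpenCV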
@@ -130,7 +130,7 @@ def run_local_lama(image_bgr, mask_float):
     # Swap back RGB to BGR
     inpainted = cv2.cvtColor(inpainted, cv2.COLOR_RGB2BGR)
 
-    # Resize back to original
+    # Resize back to original
     if new_h != h or new_w != w:
         inpainted = cv2.resize(inpainted, (w, h))
 
@@ -150,10 +150,9 @@ def stereo_pipeline(image_pil, divergence, convergence):
     if image_pil is None:
         return None, None
 
-    # Convert to BGR for OpenCV processing
     image_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
 
-    # 1. Depth
+    # 1. Depth (Using Depth Anything V2)
     depth = estimate_depth(image_pil, depth_model, depth_processor)
 
     # 2. Shift Map
@@ -162,7 +161,7 @@ def stereo_pipeline(image_pil, divergence, convergence):
     # 3. Warping
     right_img, mask = generate_right_and_mask(image_cv, shift)
 
-    # 4. Inpainting (Local)
+    # 4. Inpainting (Local LaMa)
     right_filled = run_local_lama(right_img, mask)
 
     left = image_pil
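Note: the # 2. Shift Map step itself is outside this diff. One plausible formulation, given that estimate_depth returns 0-1 values and stereo_pipeline takes divergence and convergence (the function below is an assumption for illustration, not the app's code):

import numpy as np

def make_shift_map(depth, divergence, convergence, width):
    # ASSUMPTION: divergence scales disparity as a percentage of image width;
    # convergence picks the 0-1 depth value that gets zero shift (the screen plane).
    disparity = (depth - convergence) * (divergence / 100.0) * width
    return np.round(disparity).astype(np.int64)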
@@ -180,8 +179,8 @@ def stereo_pipeline(image_pil, divergence, convergence):
 
 # === GRADIO UI ===
 with gr.Blocks(title="2D to 3D Stereo") as demo:
-    gr.Markdown("## 2D to 3D Stereo Generator (
-    gr.Markdown("Generates stereo pairs using Depth
+    gr.Markdown("## 2D to 3D Stereo Generator (Depth Anything V2)")
+    gr.Markdown("Generates stereo pairs using **Depth Anything V2 Large** and Local LaMa Inpainting.")
 
     with gr.Row():
         with gr.Column(scale=1):
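Note: the hunk shows only the top of the Blocks layout. A sketch of how such a UI is typically wired to stereo_pipeline (all component names and slider ranges below are assumptions; only gr.Blocks, gr.Markdown, gr.Row, and gr.Column appear in the diff):

import gradio as gr

with gr.Blocks(title="2D to 3D Stereo") as demo:
    gr.Markdown("## 2D to 3D Stereo Generator (Depth Anything V2)")
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Image(type="pil", label="Input Image")
            divergence = gr.Slider(0.5, 5.0, value=2.0, label="Divergence")
            convergence = gr.Slider(0.0, 1.0, value=0.5, label="Convergence")
            btn = gr.Button("Generate")
        with gr.Column(scale=1):
            out_left = gr.Image(label="Left Eye")
            out_right = gr.Image(label="Right Eye")
    btn.click(stereo_pipeline, [inp, divergence, convergence], [out_left, out_right])

demo.launch()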