enoky committed
Commit 53f760e · verified · 1 Parent(s): f98a0fe

run the LaMa model locally

Files changed (2)
  1. app.py +199 -223
  2. requirements.txt +2 -2
app.py CHANGED
@@ -1,224 +1,200 @@
- import gradio as gr
- import torch
- import numpy as np
- import cv2
- from PIL import Image
- from transformers import DPTForDepthEstimation, DPTImageProcessor
- from gradio_client import Client, handle_file
- import tempfile
- import os
-
- # === DEVICE ===
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- # === DEPTH MODEL ===
- def load_depth_model():
-     # DPTImageProcessor is the modern replacement for FeatureExtractor
-     model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
-     processor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
-     return model, processor
-
- @torch.no_grad()
- def estimate_depth(image_pil, model, processor):
-     # Keep original size for restoration later
-     original_size = image_pil.size  # (width, height)
-
-     # Preprocess (processor handles resizing internally for the model)
-     inputs = processor(images=image_pil, return_tensors="pt").to(device)
-
-     depth = model(**inputs).predicted_depth
-
-     # Interpolate depth back to ORIGINAL image size
-     depth = torch.nn.functional.interpolate(
-         depth.unsqueeze(1),
-         size=(original_size[1], original_size[0]),  # torch expects (H, W)
-         mode="bicubic",
-         align_corners=False,
-     ).squeeze().detach().cpu().numpy()
-
-     # Normalize
-     depth_min, depth_max = depth.min(), depth.max()
-     if depth_max - depth_min > 0:
-         return (depth - depth_min) / (depth_max - depth_min)
-     return depth
-
- def generate_right_and_mask(image, shift_map):
-     """
-     Vectorized shift operation.
-     shift_map: 2D array indicating how many pixels to shift left (positive) or right (negative).
-     """
-     height, width = image.shape[:2]
-
-     # Create a grid of coordinates
-     x_coords, y_coords = np.meshgrid(np.arange(width), np.arange(height))
-
-     # Calculate target coordinates (shift pixels to the left for right eye)
-     shift = shift_map.astype(int)
-     target_x = x_coords - shift
-
-     # Initialize output and mask
-     right = np.zeros_like(image)
-     mask = np.ones((height, width), dtype=np.uint8) * 255  # 255 = hole/inpainting area
-
-     # Valid indices mask (ensure pixels land within image bounds)
-     valid_mask = (target_x >= 0) & (target_x < width)
-
-     # Flatten arrays for advanced indexing
-     flat_y = y_coords[valid_mask]
-     flat_x_target = target_x[valid_mask]
-     flat_x_source = x_coords[valid_mask]
-
-     # Assign pixels
-     # Note: simple overwriting handles occlusions naively but effectively for this use case
-     right[flat_y, flat_x_target] = image[flat_y, flat_x_source]
-
-     # Update Mask: Areas that were written to are NOT holes (0)
-     mask[flat_y, flat_x_target] = 0
-
-     return right, mask
-
- def make_anaglyph(left, right):
-     """
-     Creates a Red-Cyan anaglyph.
-     Left image provides the Red channel.
-     Right image provides the Green and Blue channels.
-     """
-     # Convert to arrays
-     l_arr = np.array(left)
-     r_arr = np.array(right)
-
-     # Create output array (same shape)
-     anaglyph = np.zeros_like(l_arr)
-
-     # Red channel from Left
-     anaglyph[:, :, 0] = l_arr[:, :, 0]
-
-     # Green and Blue channels from Right
-     anaglyph[:, :, 1] = r_arr[:, :, 1]
-     anaglyph[:, :, 2] = r_arr[:, :, 2]
-
-     return Image.fromarray(anaglyph)
-
- # === LAMA INPAINTING (Via Gradio Client) ===
- # Note: You need a valid Space that accepts image + mask.
- try:
-     lama_client = Client("asif-k/LaMa-Inpainting")
- except Exception as e:
-     print(f"Could not connect to external LaMa client: {e}")
-     lama_client = None
-
- def run_lama_inpainting(image_bgr, mask):
-     if lama_client is None:
-         print("LaMa client unavailable, returning unfilled image.")
-         return image_bgr
-
-     # Prepare files for Gradio Client
-     # Convert BGR (OpenCV) to RGB for PIL
-     img_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
-
-     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f_img, \
-          tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f_mask:
-
-         Image.fromarray(img_rgb).save(f_img.name)
-         Image.fromarray(mask).save(f_mask.name)
-
-     try:
-         # Predict using the external space
-         result_path = lama_client.predict(
-             image=handle_file(f_img.name),
-             mask=handle_file(f_mask.name),
-             api_name="/predict"
-         )
-
-         # Result is a filepath
-         res_img = Image.open(result_path)
-         return cv2.cvtColor(np.array(res_img), cv2.COLOR_RGB2BGR)
-
-     except Exception as e:
-         print(f"Inpainting failed: {e}")
-         return image_bgr  # Return original with holes if fail
-     finally:
-         # Cleanup
-         os.remove(f_img.name)
-         os.remove(f_mask.name)
-
- # === APP LOGIC ===
- depth_model, depth_processor = load_depth_model()
-
- def stereo_pipeline(image_pil, divergence, convergence):
-     if image_pil is None:
-         return None, None
-
-     image_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
-
-     # 1. Estimate Depth (0.0 far to 1.0 near)
-     depth = estimate_depth(image_pil, depth_model, depth_processor)
-
-     # 2. Calculate Shift Map
-     # Divergence: Overall separation strength (pixels)
-     # Convergence: The depth plane that stays still (0.0 - 1.0)
-     # Result:
-     #   Positive shift (Leftwards) = Pop out of screen (Near objects)
-     #   Negative shift (Rightwards) = Go into screen (Far objects)
-     shift = (depth - convergence) * divergence
-
-     # 3. Shift Pixels
-     right_img, mask = generate_right_and_mask(image_cv, shift)
-
-     # 4. Inpaint Holes
-     # Pass the mask where 255 indicates holes to be filled
-     right_filled = run_lama_inpainting(right_img, mask)
-
-     left = image_pil
-     right = Image.fromarray(cv2.cvtColor(right_filled, cv2.COLOR_BGR2RGB))
-
-     # === Combine into Side-by-Side ===
-     width, height = left.size
-     combined_image = Image.new('RGB', (width * 2, height))
-     combined_image.paste(left, (0, 0))
-     combined_image.paste(right, (width, 0))
-
-     # === Create Anaglyph ===
-     anaglyph_image = make_anaglyph(left, right)
-
-     return combined_image, anaglyph_image
-
- # === GRADIO UI ===
- with gr.Blocks(title="2D to 3D Stereo") as demo:
-     gr.Markdown("## 2D to 3D Stereo Generator")
-     gr.Markdown("Generates a side-by-side stereo pair and anaglyph using Depth Estimation and LaMa Inpainting.")
-
-     with gr.Row():
-         with gr.Column(scale=1):
-             input_img = gr.Image(type="pil", label="Input Image", height=480)
-
-             # === Controls ===
-             with gr.Group():
-                 gr.Markdown("### 3D Controls")
-                 divergence_slider = gr.Slider(
-                     minimum=0, maximum=100, value=30, step=1,
-                     label="3D Strength (Divergence)",
-                     info="Max pixel separation. Higher = Deeper 3D effect."
-                 )
-                 convergence_slider = gr.Slider(
-                     minimum=0.0, maximum=1.0, value=0.1, step=0.05,
-                     label="Focus Plane (Convergence)",
-                     info="0.0 = Background at screen depth. 0.5 = Mid-range at screen. 1.0 = Foreground at screen."
-                 )
-
-             btn = gr.Button("Generate 3D", variant="primary")
-
-         with gr.Column(scale=1):
-             out_anaglyph = gr.Image(label="Anaglyph (Red/Cyan)", height=480)
-
-     with gr.Row():
-         out_stereo = gr.Image(label="Side-by-Side Stereo Pair", height=400)
-
-     btn.click(
-         fn=stereo_pipeline,
-         inputs=[input_img, divergence_slider, convergence_slider],
-         outputs=[out_stereo, out_anaglyph]
-     )
-
- if __name__ == "__main__":
 
+ import gradio as gr
+ import torch
+ import numpy as np
+ import cv2
+ from PIL import Image
+ from transformers import DPTForDepthEstimation, DPTImageProcessor
+ from huggingface_hub import hf_hub_download
+ import os
+
+ # === DEVICE ===
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"Running on device: {device}")
+
+ # === LOAD MODELS ===
+ def load_models():
+     print("Loading Depth Model...")
+     # 1. Depth Model
+     depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
+     depth_processor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
+
+     print("Loading LaMa Inpainting Model...")
+     # 2. LaMa Inpainting Model (TorchScript)
+     # We download the JIT-traced model, which is self-contained
+     model_path = hf_hub_download(repo_id="smartywu/big-lama", filename="big-lama.pt")
+     lama_model = torch.jit.load(model_path).to(device)
+     lama_model.eval()
+
+     return depth_model, depth_processor, lama_model
+
+ # Load models once at startup
+ depth_model, depth_processor, lama_model = load_models()
+
+ # === DEPTH ESTIMATION ===
+ @torch.no_grad()
+ def estimate_depth(image_pil, model, processor):
+     original_size = image_pil.size
+     inputs = processor(images=image_pil, return_tensors="pt").to(device)
+     depth = model(**inputs).predicted_depth
+
+     depth = torch.nn.functional.interpolate(
+         depth.unsqueeze(1),
+         size=(original_size[1], original_size[0]),
+         mode="bicubic",
+         align_corners=False,
+     ).squeeze().detach().cpu().numpy()
+
+     depth_min, depth_max = depth.min(), depth.max()
+     if depth_max - depth_min > 0:
+         return (depth - depth_min) / (depth_max - depth_min)
+     return depth
+
+ # === STEREO GENERATION LOGIC ===
+ def generate_right_and_mask(image, shift_map):
+     height, width = image.shape[:2]
+     x_coords, y_coords = np.meshgrid(np.arange(width), np.arange(height))
+     shift = shift_map.astype(int)
+     target_x = x_coords - shift
+
+     right = np.zeros_like(image)
+     # Mask convention: 1.0 means HOLE/MISSING info, 0.0 means valid.
+     # Initialize as all holes (1.0)
+     mask = np.ones((height, width), dtype=np.float32)
+
+     valid_mask = (target_x >= 0) & (target_x < width)
+     flat_y = y_coords[valid_mask]
+     flat_x_target = target_x[valid_mask]
+     flat_x_source = x_coords[valid_mask]
+
+     right[flat_y, flat_x_target] = image[flat_y, flat_x_source]
+     # Mark written pixels as valid (0.0)
+     mask[flat_y, flat_x_target] = 0.0
+
+     return right, mask
+
+ # === LOCAL INPAINTING ===
+ @torch.no_grad()
+ def run_local_lama(image_bgr, mask_float):
+     """
+     Runs LaMa locally.
+     image_bgr: HxWx3 uint8 numpy array
+     mask_float: HxW float32 numpy array (1.0 = hole, 0.0 = valid)
+     """
+     # 1. Resize so both sides are divisible by 8 (LaMa requirement)
+     h, w = image_bgr.shape[:2]
+     new_h = (h // 8) * 8
+     new_w = (w // 8) * 8
+
+     img_resized = cv2.resize(image_bgr, (new_w, new_h))
+     mask_resized = cv2.resize(mask_float, (new_w, new_h), interpolation=cv2.INTER_NEAREST)
+
+     # 2. Convert to Torch tensors
+     # Image: (1, 3, H, W), RGB, 0-1
+     img_t = torch.from_numpy(img_resized).float().permute(2, 0, 1).unsqueeze(0) / 255.0
+     # Swap BGR to RGB
+     img_t = img_t[:, [2, 1, 0], :, :]
+
+     # Mask: (1, 1, H, W), 0-1
+     mask_t = torch.from_numpy(mask_resized).float().unsqueeze(0).unsqueeze(0)
+     # Binary threshold, just in case
+     mask_t = (mask_t > 0.5).float()
+
+     img_t = img_t.to(device)
+     mask_t = mask_t.to(device)
+
+     # 3. Inference
+     inpainted_t = lama_model(img_t, mask_t)
+
+     # 4. Post-process
+     inpainted = inpainted_t[0].permute(1, 2, 0).cpu().numpy()
+     inpainted = np.clip(inpainted * 255, 0, 255).astype(np.uint8)
+
+     # Swap back RGB to BGR
+     inpainted = cv2.cvtColor(inpainted, cv2.COLOR_RGB2BGR)
+
+     # Resize back to the original size if needed
+     if new_h != h or new_w != w:
+         inpainted = cv2.resize(inpainted, (w, h))
+
+     return inpainted
+
+ def make_anaglyph(left, right):
+     l_arr = np.array(left)
+     r_arr = np.array(right)
+     anaglyph = np.zeros_like(l_arr)
+     anaglyph[:, :, 0] = l_arr[:, :, 0]
+     anaglyph[:, :, 1] = r_arr[:, :, 1]
+     anaglyph[:, :, 2] = r_arr[:, :, 2]
+     return Image.fromarray(anaglyph)
+
+ # === PIPELINE ===
+ def stereo_pipeline(image_pil, divergence, convergence):
+     if image_pil is None:
+         return None, None
+
+     # Convert to BGR for OpenCV processing
+     image_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
+
+     # 1. Depth
+     depth = estimate_depth(image_pil, depth_model, depth_processor)
+
+     # 2. Shift Map
+     shift = (depth - convergence) * divergence
+
+     # 3. Warping
+     right_img, mask = generate_right_and_mask(image_cv, shift)
+
+     # 4. Inpainting (Local)
+     right_filled = run_local_lama(right_img, mask)
+
+     left = image_pil
+     right = Image.fromarray(cv2.cvtColor(right_filled, cv2.COLOR_BGR2RGB))
+
+     # 5. Composition
+     width, height = left.size
+     combined_image = Image.new('RGB', (width * 2, height))
+     combined_image.paste(left, (0, 0))
+     combined_image.paste(right, (width, 0))
+
+     anaglyph_image = make_anaglyph(left, right)
+
+     return combined_image, anaglyph_image
+
+ # === GRADIO UI ===
+ with gr.Blocks(title="2D to 3D Stereo") as demo:
+     gr.Markdown("## 2D to 3D Stereo Generator (Fully Local)")
+     gr.Markdown("Generates stereo pairs using Depth Estimation and **Local LaMa Inpainting**. No external APIs required.")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             input_img = gr.Image(type="pil", label="Input Image", height=480)
+
+             with gr.Group():
+                 gr.Markdown("### 3D Controls")
+                 divergence_slider = gr.Slider(
+                     minimum=0, maximum=100, value=30, step=1,
+                     label="3D Strength (Divergence)",
+                     info="Max pixel separation."
+                 )
+                 convergence_slider = gr.Slider(
+                     minimum=0.0, maximum=1.0, value=0.1, step=0.05,
+                     label="Focus Plane (Convergence)",
+                     info="0.0 = Background at screen. 1.0 = Foreground at screen."
+                 )
+
+             btn = gr.Button("Generate 3D", variant="primary")
+
+         with gr.Column(scale=1):
+             out_anaglyph = gr.Image(label="Anaglyph (Red/Cyan)", height=480)
+
+     with gr.Row():
+         out_stereo = gr.Image(label="Side-by-Side Stereo Pair", height=400)
+
+     btn.click(
+         fn=stereo_pipeline,
+         inputs=[input_img, divergence_slider, convergence_slider],
+         outputs=[out_stereo, out_anaglyph]
+     )
+
+ if __name__ == "__main__":
      demo.launch()
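
The new local inpainting path can be exercised on its own before running the full app. The following is a minimal smoke-test sketch, not part of the commit: the repo id, filename, tensor layout (image (1, 3, H, W) in 0-1, mask (1, 1, H, W) with 1.0 = hole), and the model(img, mask) call signature are taken from app.py above, while the synthetic 512x512 input and the smoke_test_lama name are illustrative assumptions.

import torch
from huggingface_hub import hf_hub_download

def smoke_test_lama():
    # Download and load the self-contained TorchScript LaMa model (as in load_models)
    model_path = hf_hub_download(repo_id="smartywu/big-lama", filename="big-lama.pt")
    model = torch.jit.load(model_path, map_location="cpu")
    model.eval()

    # Synthetic input: RGB noise in 0-1; 512 is already divisible by 8
    img = torch.rand(1, 3, 512, 512)
    mask = torch.zeros(1, 1, 512, 512)
    mask[:, :, 200:312, 200:312] = 1.0  # punch a square hole to fill

    with torch.no_grad():
        out = model(img, mask)

    # app.py scales the output by 255, so values are expected in roughly 0-1
    print(out.shape)  # expected: torch.Size([1, 3, 512, 512])

smoke_test_lama()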
requirements.txt CHANGED
@@ -1,8 +1,8 @@
  gradio
- gradio_client
  torch
  numpy
  opencv-python
  pillow
  transformers
- scipy
+ scipy
+ huggingface_hub
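
The dependency swap (gradio_client out, huggingface_hub in) can be verified with a one-shot import check; a sketch, noting that the distribution names opencv-python and pillow import as cv2 and PIL:

# Import check for the updated requirements.txt
import gradio, torch, numpy, cv2, PIL, transformers, scipy, huggingface_hub
print("All requirements import cleanly.")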