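"""2D-to-3D stereo demo: monocular depth estimation (DPT) -> right-eye view
warping -> local LaMa inpainting of disocclusions -> side-by-side and
red/cyan anaglyph outputs."""
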
import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from transformers import DPTForDepthEstimation, DPTImageProcessor
from huggingface_hub import hf_hub_download

# === DEVICE ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on device: {device}")

# === LOAD MODELS ===
def load_models():
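    """Load the DPT depth estimation model and the TorchScript LaMa inpainting model."""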
    print("Loading Depth Model...")
    # 1. Depth Model
    depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
    depth_processor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
    
    print("Loading LaMa Inpainting Model...")
    # 2. LaMa Inpainting Model (TorchScript)
    # We download the JIT traced model which is self-contained
    model_path = hf_hub_download(repo_id="smartywu/big-lama", filename="big-lama.pt")
    lama_model = torch.jit.load(model_path).to(device)
    lama_model.eval()
    
    return depth_model, depth_processor, lama_model

# Load models once at startup
depth_model, depth_processor, lama_model = load_models()

# === DEPTH ESTIMATION ===
@torch.no_grad()
def estimate_depth(image_pil, model, processor):
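    """Estimate a depth map for a PIL image with DPT, resize it back to the
    original (width, height), and normalize it to the [0, 1] range."""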
    original_size = image_pil.size
    inputs = processor(images=image_pil, return_tensors="pt").to(device)
    depth = model(**inputs).predicted_depth
    
    depth = torch.nn.functional.interpolate(
        depth.unsqueeze(1),
        size=(original_size[1], original_size[0]),
        mode="bicubic",
        align_corners=False,
    ).squeeze().detach().cpu().numpy()
    
    depth_min, depth_max = depth.min(), depth.max()
    if depth_max - depth_min > 0:
        return (depth - depth_min) / (depth_max - depth_min)
    return depth

# === STEREO GENERATION LOGIC ===
def generate_right_and_mask(image, shift_map):
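    """Forward-warp the left image into a right-eye view: the source pixel at
    column x is written to column x - shift. Returns the warped image and a
    float mask where 1.0 marks holes (disoccluded pixels) and 0.0 marks
    pixels that received data."""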
    height, width = image.shape[:2]
    x_coords, y_coords = np.meshgrid(np.arange(width), np.arange(height))
    shift = shift_map.astype(int)
    target_x = x_coords - shift
    
    right = np.zeros_like(image)
    # Mask: 1 (or 255) means HOLE/MISSING info. 
    # Initialize as all holes (255)
    mask = np.ones((height, width), dtype=np.float32) 
    
    valid_mask = (target_x >= 0) & (target_x < width)
    flat_y = y_coords[valid_mask]
    flat_x_target = target_x[valid_mask]
    flat_x_source = x_coords[valid_mask]
    
    right[flat_y, flat_x_target] = image[flat_y, flat_x_source]
    # Mark written pixels as valid (0)
    mask[flat_y, flat_x_target] = 0.0
    
    return right, mask

# === LOCAL INPAINTING ===
@torch.no_grad()
def run_local_lama(image_bgr, mask_float):
    """

    Runs LaMa locally.

    image_bgr: HxWx3 uint8 numpy array

    mask_float: HxW float32 numpy array (1.0 = hole, 0.0 = valid)

    """
    # 1. Resize to be divisible by 8 (LaMa requirement)
    h, w = image_bgr.shape[:2]
    new_h = (h // 8) * 8
    new_w = (w // 8) * 8
    
    img_resized = cv2.resize(image_bgr, (new_w, new_h))
    mask_resized = cv2.resize(mask_float, (new_w, new_h), interpolation=cv2.INTER_NEAREST)
    
    # 2. Convert to Torch Tensors
    # Image: (1, 3, H, W), RGB, 0-1
    img_t = torch.from_numpy(img_resized).float().permute(2, 0, 1).unsqueeze(0) / 255.0
    # Swap BGR to RGB
    img_t = img_t[:, [2, 1, 0], :, :]
    
    # Mask: (1, 1, H, W), 0-1
    mask_t = torch.from_numpy(mask_resized).float().unsqueeze(0).unsqueeze(0)
    # Binary threshold just in case
    mask_t = (mask_t > 0.5).float()
    
    img_t = img_t.to(device)
    mask_t = mask_t.to(device)
    
    # 3. Inference
    inpainted_t = lama_model(img_t, mask_t)
    
    # 4. Post-process
    inpainted = inpainted_t[0].permute(1, 2, 0).cpu().numpy()
    inpainted = np.clip(inpainted * 255, 0, 255).astype(np.uint8)
    
    # Swap back RGB to BGR
    inpainted = cv2.cvtColor(inpainted, cv2.COLOR_RGB2BGR)
    
    # Resize back to original if needed
    if new_h != h or new_w != w:
        inpainted = cv2.resize(inpainted, (w, h))
        
    return inpainted
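
# Note (alternative, not used above): instead of resizing to a multiple of 8,
# the LaMa inputs could be reflect-padded and the result cropped back, which
# avoids the slight aspect distortion introduced by cv2.resize. A minimal
# sketch (pad_h/pad_w/img_p/mask_p are illustrative names):
#   pad_h, pad_w = (8 - h % 8) % 8, (8 - w % 8) % 8
#   img_p = cv2.copyMakeBorder(image_bgr, 0, pad_h, 0, pad_w, cv2.BORDER_REFLECT)
#   mask_p = cv2.copyMakeBorder(mask_float, 0, pad_h, 0, pad_w, cv2.BORDER_REFLECT)
#   ...run the model as above, then crop the output back with result[:h, :w]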

def make_anaglyph(left, right):
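    """Build a red/cyan anaglyph: red channel from the left eye, green and
    blue channels from the right eye."""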
    l_arr = np.array(left)
    r_arr = np.array(right)
    anaglyph = np.zeros_like(l_arr)
    anaglyph[:, :, 0] = l_arr[:, :, 0]
    anaglyph[:, :, 1] = r_arr[:, :, 1]
    anaglyph[:, :, 2] = r_arr[:, :, 2]
    return Image.fromarray(anaglyph)

# === PIPELINE ===
def stereo_pipeline(image_pil, divergence, convergence):
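    """Run the full pipeline: depth estimation, disparity/shift map, right-eye
    warping, LaMa hole filling, then side-by-side and anaglyph composition."""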
    if image_pil is None:
        return None, None
        
    # Ensure a 3-channel image (Gradio may pass RGBA or grayscale PIL images),
    # then convert to BGR for OpenCV processing
    image_pil = image_pil.convert("RGB")
    image_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)

    # 1. Depth
    depth = estimate_depth(image_pil, depth_model, depth_processor)
    
    # 2. Shift (disparity) map in pixels: regions with depth above the
    #    convergence value appear in front of the screen, regions below it
    #    recede behind it; divergence scales the maximum separation.
    shift = (depth - convergence) * divergence
    
    # 3. Warping
    right_img, mask = generate_right_and_mask(image_cv, shift)
    
    # 4. Inpainting (Local)
    right_filled = run_local_lama(right_img, mask)

    left = image_pil
    right = Image.fromarray(cv2.cvtColor(right_filled, cv2.COLOR_BGR2RGB))

    # 5. Composition
    width, height = left.size
    combined_image = Image.new('RGB', (width * 2, height))
    combined_image.paste(left, (0, 0))
    combined_image.paste(right, (width, 0))
    
    anaglyph_image = make_anaglyph(left, right)
    
    return combined_image, anaglyph_image

# === GRADIO UI ===
with gr.Blocks(title="2D to 3D Stereo") as demo:
    gr.Markdown("## 2D to 3D Stereo Generator (Fully Local)")
    gr.Markdown("Generates stereo pairs using Depth Estimation and **Local LaMa Inpainting**. No external APIs required.")
    
    with gr.Row():
        with gr.Column(scale=1):
            input_img = gr.Image(type="pil", label="Input Image", height=480)
            
            with gr.Group():
                gr.Markdown("### 3D Controls")
                divergence_slider = gr.Slider(
                    minimum=0, maximum=100, value=30, step=1, 
                    label="3D Strength (Divergence)", 
                    info="Max pixel separation."
                )
                convergence_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.1, step=0.05, 
                    label="Focus Plane (Convergence)", 
                    info="0.0 = Background at screen. 1.0 = Foreground at screen."
                )
            
            btn = gr.Button("Generate 3D", variant="primary")
            
        with gr.Column(scale=1):
            out_anaglyph = gr.Image(label="Anaglyph (Red/Cyan)", height=480)
    
    with gr.Row():
        out_stereo = gr.Image(label="Side-by-Side Stereo Pair", height=400)
        
    btn.click(
        fn=stereo_pipeline, 
        inputs=[input_img, divergence_slider, convergence_slider], 
        outputs=[out_stereo, out_anaglyph]
    )

if __name__ == "__main__":
    demo.launch()