import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from transformers import DPTForDepthEstimation, DPTImageProcessor
from huggingface_hub import hf_hub_download
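# Dependencies (inferred from the imports above): gradio, torch, numpy,
# opencv-python, pillow, transformers, huggingface_hub.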
# === DEVICE ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on device: {device}")
# === LOAD MODELS ===
def load_models():
    print("Loading Depth Model...")
    # 1. Depth Model
    depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
    depth_processor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")

    print("Loading LaMa Inpainting Model...")
    # 2. LaMa Inpainting Model (TorchScript)
    # We download the JIT-traced model, which is self-contained.
    # map_location keeps the load from failing on CPU-only machines.
    model_path = hf_hub_download(repo_id="smartywu/big-lama", filename="big-lama.pt")
    lama_model = torch.jit.load(model_path, map_location=device)
    lama_model.eval()

    return depth_model, depth_processor, lama_model

# Load models once at startup
depth_model, depth_processor, lama_model = load_models()
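# Note: hf_hub_download caches checkpoints locally (by default under
# ~/.cache/huggingface), so repeated launches reuse the downloaded file.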
# === DEPTH ESTIMATION ===
def estimate_depth(image_pil, model, processor):
    original_size = image_pil.size  # (width, height)
    inputs = processor(images=image_pil, return_tensors="pt").to(device)
    with torch.no_grad():
        depth = model(**inputs).predicted_depth
    # Resize the prediction back to the input resolution
    depth = torch.nn.functional.interpolate(
        depth.unsqueeze(1),
        size=(original_size[1], original_size[0]),  # (height, width)
        mode="bicubic",
        align_corners=False,
    ).squeeze().cpu().numpy()
    # Normalize to [0, 1]
    depth_min, depth_max = depth.min(), depth.max()
    if depth_max - depth_min > 0:
        return (depth - depth_min) / (depth_max - depth_min)
    return depth
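# Reminder: DPT/MiDaS predicts relative inverse depth, so larger values mean
# nearer surfaces. After the [0, 1] normalization above, 1.0 is the closest
# point in the scene and 0.0 the farthest, which is what the shift-map math
# in the pipeline below relies on.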
# === STEREO GENERATION LOGIC ===
def generate_right_and_mask(image, shift_map):
    height, width = image.shape[:2]
    x_coords, y_coords = np.meshgrid(np.arange(width), np.arange(height))
    shift = shift_map.astype(int)
    target_x = x_coords - shift

    right = np.zeros_like(image)
    # Mask convention: 1 (or 255) means HOLE / missing info.
    # Initialize as all holes.
    mask = np.ones((height, width), dtype=np.float32)

    # Keep only pixels whose shifted position lands inside the frame
    valid_mask = (target_x >= 0) & (target_x < width)
    flat_y = y_coords[valid_mask]
    flat_x_target = target_x[valid_mask]
    flat_x_source = x_coords[valid_mask]

    # Forward-warp: copy each source pixel to its shifted position
    right[flat_y, flat_x_target] = image[flat_y, flat_x_source]
    # Mark written pixels as valid (0)
    mask[flat_y, flat_x_target] = 0.0
    return right, mask
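# Worked example (assumed values): with a uniform shift of 3 px, the source
# pixel at x=10 lands at x=7 in the right view. At depth discontinuities the
# foreground shifts farther than the background, so some target columns never
# receive any pixel; those stay at 1.0 in the mask and are filled by LaMa below.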
# === LOCAL INPAINTING ===
def run_local_lama(image_bgr, mask_float):
    """
    Runs LaMa locally.
    image_bgr: HxWx3 uint8 numpy array
    mask_float: HxW float32 numpy array (1.0 = hole, 0.0 = valid)
    """
    # 1. Resize so both sides are divisible by 8 (LaMa requirement)
    h, w = image_bgr.shape[:2]
    new_h = (h // 8) * 8
    new_w = (w // 8) * 8
    img_resized = cv2.resize(image_bgr, (new_w, new_h))
    mask_resized = cv2.resize(mask_float, (new_w, new_h), interpolation=cv2.INTER_NEAREST)

    # 2. Convert to torch tensors
    # Image: (1, 3, H, W), RGB, range 0-1
    img_t = torch.from_numpy(img_resized).float().permute(2, 0, 1).unsqueeze(0) / 255.0
    # Swap BGR to RGB
    img_t = img_t[:, [2, 1, 0], :, :]
    # Mask: (1, 1, H, W), range 0-1
    mask_t = torch.from_numpy(mask_resized).float().unsqueeze(0).unsqueeze(0)
    # Binarize, just in case resizing introduced intermediate values
    mask_t = (mask_t > 0.5).float()

    img_t = img_t.to(device)
    mask_t = mask_t.to(device)

    # 3. Inference (no_grad avoids building a graph and lets .numpy() work
    # even though the loaded parameters require grad by default)
    with torch.no_grad():
        inpainted_t = lama_model(img_t, mask_t)

    # 4. Post-process
    inpainted = inpainted_t[0].permute(1, 2, 0).cpu().numpy()
    inpainted = np.clip(inpainted * 255, 0, 255).astype(np.uint8)
    # Swap back RGB to BGR
    inpainted = cv2.cvtColor(inpainted, cv2.COLOR_RGB2BGR)

    # Resize back to the original resolution if needed
    if new_h != h or new_w != w:
        inpainted = cv2.resize(inpainted, (w, h))
    return inpainted
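# Assumption about the TorchScript interface: this big-lama export takes
# (image, mask) with 1.0 marking holes and returns a float image in [0, 1],
# hence the *255 rescale above. An export that already returns 0-255 output
# would not need that rescale.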
def make_anaglyph(left, right):
    """Compose a red/cyan anaglyph: red from the left eye, green and blue
    (cyan) from the right eye."""
    l_arr = np.array(left)
    r_arr = np.array(right)
    anaglyph = np.zeros_like(l_arr)
    anaglyph[:, :, 0] = l_arr[:, :, 0]  # Red channel from left view
    anaglyph[:, :, 1] = r_arr[:, :, 1]  # Green channel from right view
    anaglyph[:, :, 2] = r_arr[:, :, 2]  # Blue channel from right view
    return Image.fromarray(anaglyph)
# === PIPELINE ===
def stereo_pipeline(image_pil, divergence, convergence):
    if image_pil is None:
        return None, None

    # Convert to BGR for OpenCV processing
    image_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)

    # 1. Depth
    depth = estimate_depth(image_pil, depth_model, depth_processor)

    # 2. Shift map
    shift = (depth - convergence) * divergence
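    # Pixels at depth == convergence get zero shift and sit on the screen
    # plane; nearer pixels (depth > convergence) get positive shift and pop
    # out in front of the screen, while farther ones recede behind it.
    # divergence scales the maximum separation in pixels.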
    # 3. Warping
    right_img, mask = generate_right_and_mask(image_cv, shift)

    # 4. Inpainting (local)
    right_filled = run_local_lama(right_img, mask)

    left = image_pil
    right = Image.fromarray(cv2.cvtColor(right_filled, cv2.COLOR_BGR2RGB))

    # 5. Composition
    width, height = left.size
    combined_image = Image.new('RGB', (width * 2, height))
    combined_image.paste(left, (0, 0))
    combined_image.paste(right, (width, 0))

    anaglyph_image = make_anaglyph(left, right)
    return combined_image, anaglyph_image
# === GRADIO UI ===
with gr.Blocks(title="2D to 3D Stereo") as demo:
    gr.Markdown("## 2D to 3D Stereo Generator (Fully Local)")
    gr.Markdown("Generates stereo pairs using Depth Estimation and **Local LaMa Inpainting**. No external APIs required.")

    with gr.Row():
        with gr.Column(scale=1):
            input_img = gr.Image(type="pil", label="Input Image", height=480)
            with gr.Group():
                gr.Markdown("### 3D Controls")
                divergence_slider = gr.Slider(
                    minimum=0, maximum=100, value=30, step=1,
                    label="3D Strength (Divergence)",
                    info="Max pixel separation.",
                )
                convergence_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.1, step=0.05,
                    label="Focus Plane (Convergence)",
                    info="0.0 = Background at screen. 1.0 = Foreground at screen.",
                )
            btn = gr.Button("Generate 3D", variant="primary")
        with gr.Column(scale=1):
            out_anaglyph = gr.Image(label="Anaglyph (Red/Cyan)", height=480)

    with gr.Row():
        out_stereo = gr.Image(label="Side-by-Side Stereo Pair", height=400)

    btn.click(
        fn=stereo_pipeline,
        inputs=[input_img, divergence_slider, convergence_slider],
        outputs=[out_stereo, out_anaglyph],
    )

if __name__ == "__main__":
    demo.launch()