Spaces:

enoky
/

2D-to-Stereo-3D

Running

App Files Files Community

2D-to-Stereo-3D / app.py

enoky

run the LaMa model locally

53f760e verified 17 days ago

raw

history blame

6.9 kB

	import gradio as gr
	import torch
	import numpy as np
	import cv2
	from PIL import Image
	from transformers import DPTForDepthEstimation, DPTImageProcessor
	from huggingface_hub import hf_hub_download
	import os

	# === DEVICE ===
	# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")
	print(f"Running on device: {device}")

	# === LOAD MODELS ===
	def load_models():
	    """Load the depth estimator, its image processor and the LaMa inpainter.

	    The depth model and processor come from the ``Intel/dpt-hybrid-midas``
	    checkpoint via ``from_pretrained``; the inpainting model is the
	    ``big-lama.pt`` TorchScript archive fetched with ``hf_hub_download``
	    from ``smartywu/big-lama``.

	    Returns:
	        tuple: ``(depth_model, depth_processor, lama_model)``. Both models
	        are moved to the module-level ``device``; the LaMa model is put
	        in eval mode.
	    """
	    print("Loading Depth Model...")
	    # 1. Depth model plus the processor that prepares its inputs
	    depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
	    depth_processor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")

	    print("Loading LaMa Inpainting Model...")
	    # 2. LaMa inpainting model (TorchScript): the traced archive is
	    #    self-contained, so no model source code is needed to run it.
	    model_path = hf_hub_download(repo_id="smartywu/big-lama", filename="big-lama.pt")
	    # map_location remaps the stored tensors while deserializing, so an
	    # archive saved from a CUDA device still loads on a CPU-only host
	    # instead of failing inside torch.jit.load.
	    lama_model = torch.jit.load(model_path, map_location=device).to(device)
	    lama_model.eval()

	    return depth_model, depth_processor, lama_model

	# Build every model a single time at import so that each request reuses
	# the same instances instead of reloading them.
	(depth_model, depth_processor, lama_model) = load_models()

	# === DEPTH ESTIMATION ===
	@torch.no_grad()
	def estimate_depth(image_pil, model, processor):
	    """Predict a depth map for ``image_pil`` and min-max normalize it.

	    Args:
	        image_pil: Input image. Its ``size`` (width, height) fixes the
	            output resolution; the image itself is handed to ``processor``.
	        model: Callable invoked as ``model(**inputs)`` whose result exposes
	            ``predicted_depth``.
	        processor: Callable invoked as
	            ``processor(images=..., return_tensors="pt")``; its result must
	            support ``.to(device)``.

	    Returns:
	        numpy.ndarray: Array of shape ``(height, width)`` scaled so that
	        its minimum is 0 and its maximum is 1. A constant prediction
	        yields all zeros.
	    """
	    width, height = image_pil.size
	    inputs = processor(images=image_pil, return_tensors="pt").to(device)
	    predicted = model(**inputs).predicted_depth

	    # Resample the prediction to the resolution of the input image.
	    resized = torch.nn.functional.interpolate(
	        predicted.unsqueeze(1),
	        size=(height, width),
	        mode="bicubic",
	        align_corners=False,
	    )
	    # Index the batch and channel axes explicitly: a bare squeeze() would
	    # also drop a height or width of 1 and return a 1-D array.
	    # Assumes a single image per call, as the pipeline does.
	    depth = resized[0, 0].detach().cpu().numpy()

	    depth_min, depth_max = depth.min(), depth.max()
	    if depth_max - depth_min > 0:
	        return (depth - depth_min) / (depth_max - depth_min)
	    # Flat prediction: keep the [0, 1] contract instead of leaking raw
	    # model units into the pixel-shift computation downstream.
	    return np.zeros_like(depth)

	# === STEREO GENERATION LOGIC ===
	def generate_right_and_mask(image, shift_map):
	    """Forward-warp ``image`` horizontally and report the uncovered pixels.

	    Every source pixel at column ``x`` is moved to column
	    ``x - round(shift)`` on the same row. Targets outside the image are
	    dropped.

	    Args:
	        image: Array of shape ``(H, W)`` or ``(H, W, C)``.
	        shift_map: Per-pixel horizontal shift in pixels; anything that
	            broadcasts to ``(H, W)`` (including a scalar) is accepted.

	    Returns:
	        tuple: ``(right, mask)``. ``right`` has the shape and dtype of
	        ``image`` with unfilled pixels left at zero. ``mask`` is a float32
	        ``(H, W)`` array where 1.0 marks a hole and 0.0 a written pixel.

	    Raises:
	        ValueError: If ``shift_map`` cannot be broadcast to ``(H, W)``.
	    """
	    height, width = image.shape[:2]

	    # Round to the nearest pixel. Truncating with astype(int) alone pulls
	    # every shift toward zero and collapses all of (-1, 1) onto 0.
	    rounded = np.rint(np.asarray(shift_map, dtype=np.float64)).astype(np.int64)
	    shift = np.broadcast_to(rounded, (height, width))

	    y_coords, x_coords = np.indices((height, width))
	    target_x = x_coords - shift

	    in_bounds = (target_x >= 0) & (target_x < width)
	    rows = y_coords[in_bounds]
	    dst_x = target_x[in_bounds]
	    src_x = x_coords[in_bounds]

	    # Several sources may land on one target. On a given row and target,
	    # shift == src_x - dst_x, so the largest source column is also the
	    # largest shift. Keep that one explicitly rather than relying on the
	    # unspecified write order of repeated indices in fancy assignment.
	    winner_x = np.full((height, width), -1, dtype=x_coords.dtype)
	    np.maximum.at(winner_x, (rows, dst_x), src_x)

	    filled_rows, filled_cols = np.nonzero(winner_x >= 0)

	    right = np.zeros_like(image)
	    right[filled_rows, filled_cols] = image[filled_rows, winner_x[filled_rows, filled_cols]]

	    # Mask: 1.0 means HOLE / missing information, 0.0 means written pixel.
	    mask = np.ones((height, width), dtype=np.float32)
	    mask[filled_rows, filled_cols] = 0.0

	    return right, mask

	# === LOCAL INPAINTING ===
	@torch.no_grad()
	def run_local_lama(image_bgr, mask_float):
	    """Fill the masked pixels of ``image_bgr`` with the local LaMa model.

	    Args:
	        image_bgr: HxWx3 uint8 numpy array in BGR channel order.
	        mask_float: HxW float32 numpy array (1.0 = hole, 0.0 = valid);
	            values above 0.5 are treated as holes.

	    Returns:
	        numpy.ndarray: New HxWx3 uint8 BGR array. Hole pixels come from the
	        model output; every other pixel is copied unchanged from
	        ``image_bgr``.
	    """
	    height, width = image_bgr.shape[:2]
	    hole = np.asarray(mask_float) > 0.5

	    # Nothing to fill: skip inference and hand back an untouched copy.
	    if not hole.any():
	        return image_bgr.copy()

	    # 1. Pad (not resize) up to the next multiple of 8 on each axis, as
	    #    LaMa requires. Resizing resampled every pixel twice and produced
	    #    a zero-sized target for inputs smaller than 8 px.
	    pad_h = -height % 8
	    pad_w = -width % 8
	    image_rgb = image_bgr[:, :, ::-1]  # BGR -> RGB
	    image_padded = np.ascontiguousarray(
	        np.pad(image_rgb, ((0, pad_h), (0, pad_w), (0, 0)), mode="symmetric")
	    )
	    hole_padded = np.ascontiguousarray(
	        np.pad(hole, ((0, pad_h), (0, pad_w)), mode="symmetric")
	    )

	    # 2. Convert to torch tensors
	    # Image: (1, 3, H, W), RGB, 0-1
	    img_t = torch.from_numpy(image_padded).float().permute(2, 0, 1).unsqueeze(0) / 255.0
	    # Mask: (1, 1, H, W), binary 0/1
	    mask_t = torch.from_numpy(hole_padded).float().unsqueeze(0).unsqueeze(0)

	    img_t = img_t.to(device)
	    mask_t = mask_t.to(device)

	    # 3. Inference
	    inpainted_t = lama_model(img_t, mask_t)

	    # 4. Post-process: crop the padding away and go back to uint8.
	    #    Assumes the model returns (1, 3, H, W) RGB in 0-1, as the
	    #    original post-processing did.
	    inpainted = inpainted_t[0, :, :height, :width].permute(1, 2, 0).cpu().numpy()
	    # Round before casting; a plain astype truncates and darkens values
	    # such as 254.99998 by one level.
	    inpainted = np.rint(np.clip(inpainted * 255, 0, 255)).astype(np.uint8)
	    inpainted_bgr = inpainted[:, :, ::-1]  # RGB -> BGR

	    # 5. Composite: only hole pixels take the model output, so known
	    #    pixels keep their exact input values.
	    result = image_bgr.copy()
	    result[hole] = inpainted_bgr[hole]
	    return result

	def make_anaglyph(left, right):
	    """Compose a red/cyan anaglyph from a left-eye and a right-eye image.

	    Channel 0 is taken from ``left``; channels 1 and 2 are taken from
	    ``right``. Any further channel of the output stays zero.

	    Args:
	        left: Left-eye image (anything ``np.array`` turns into HxWxC).
	        right: Right-eye image with matching height and width.

	    Returns:
	        The composite wrapped with ``Image.fromarray``.
	    """
	    left_px = np.array(left)
	    right_px = np.array(right)

	    composite = np.zeros_like(left_px)
	    # (channel index, image that supplies it)
	    for channel, source in ((0, left_px), (1, right_px), (2, right_px)):
	        composite[:, :, channel] = source[:, :, channel]

	    return Image.fromarray(composite)

	# === PIPELINE ===
	def stereo_pipeline(image_pil, divergence, convergence):
	    """Turn one image into a side-by-side stereo pair and an anaglyph.

	    Args:
	        image_pil: Input PIL image, or ``None`` when nothing was supplied.
	        divergence: Factor applied to the depth offset to get the pixel shift.
	        convergence: Depth value (on the normalized depth scale) that
	            receives zero shift.

	    Returns:
	        tuple: ``(combined_image, anaglyph_image)``; ``(None, None)`` when
	        ``image_pil`` is ``None``.
	    """
	    # No input yet: return an empty value for both outputs.
	    if image_pil is None:
	        return None, None

	    # BGR copy of the input for the OpenCV-side stages.
	    source_bgr = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)

	    # Depth, then the per-pixel horizontal shift derived from it.
	    depth_map = estimate_depth(image_pil, depth_model, depth_processor)
	    shift_map = (depth_map - convergence) * divergence

	    # Warp into the right-eye view, then fill the holes the warp left.
	    warped_bgr, hole_mask = generate_right_and_mask(source_bgr, shift_map)
	    filled_bgr = run_local_lama(warped_bgr, hole_mask)

	    left_eye = image_pil
	    right_eye = Image.fromarray(cv2.cvtColor(filled_bgr, cv2.COLOR_BGR2RGB))

	    # Side-by-side canvas: left eye first, right eye directly after it.
	    eye_width, eye_height = left_eye.size
	    side_by_side = Image.new('RGB', (eye_width * 2, eye_height))
	    for offset_x, eye in ((0, left_eye), (eye_width, right_eye)):
	        side_by_side.paste(eye, (offset_x, 0))

	    return side_by_side, make_anaglyph(left_eye, right_eye)

	# === GRADIO UI ===
	# Build the Gradio interface and bind it to `demo`.
	# NOTE(review): indentation was lost in this listing, so the nesting of the
	# `with` layout blocks below cannot be read off the text; confirm it
	# against the original file before editing the layout.
	with gr.Blocks(title="2D to 3D Stereo") as demo:
	# Page heading and one-line description.
	gr.Markdown("## 2D to 3D Stereo Generator (Fully Local)")
	gr.Markdown("Generates stereo pairs using Depth Estimation and Local LaMa Inpainting. No external APIs required.")

	with gr.Row():
	with gr.Column(scale=1):
	# type="pil" is requested so stereo_pipeline receives a PIL image.
	input_img = gr.Image(type="pil", label="Input Image", height=480)

	with gr.Group():
	gr.Markdown("### 3D Controls")
	# Passed to stereo_pipeline as `divergence` (second entry of `inputs`).
	divergence_slider = gr.Slider(
	minimum=0, maximum=100, value=30, step=1,
	label="3D Strength (Divergence)",
	info="Max pixel separation."
	)
	# Passed to stereo_pipeline as `convergence` (third entry of `inputs`).
	convergence_slider = gr.Slider(
	minimum=0.0, maximum=1.0, value=0.1, step=0.05,
	label="Focus Plane (Convergence)",
	info="0.0 = Background at screen. 1.0 = Foreground at screen."
	)

	btn = gr.Button("Generate 3D", variant="primary")

	with gr.Column(scale=1):
	out_anaglyph = gr.Image(label="Anaglyph (Red/Cyan)", height=480)

	with gr.Row():
	out_stereo = gr.Image(label="Side-by-Side Stereo Pair", height=400)

	# `outputs` follows stereo_pipeline's return order:
	# (side-by-side image, anaglyph image).
	btn.click(
	fn=stereo_pipeline,
	inputs=[input_img, divergence_slider, convergence_slider],
	outputs=[out_stereo, out_anaglyph]
	)

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch()