import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
import tempfile
import os
import accelerate  # noqa: F401 -- not used directly; presumably kept so a missing install fails at startup

from diffusers import MarigoldDepthPipeline, DDIMScheduler
from huggingface_hub import login

CHECKPOINT = "prs-eth/marigold-depth-v1-1"

# Authenticate with the Hugging Face Hub if a token is provided (needed for gated checkpoints).
if "HF_TOKEN_LOGIN" in os.environ:
    login(token=os.environ["HF_TOKEN_LOGIN"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# bfloat16 roughly halves GPU memory use; CPU inference stays in float32.
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

# Load the Marigold depth pipeline once at startup.
try:
    pipe = MarigoldDepthPipeline.from_pretrained(CHECKPOINT)
    # "trailing" timestep spacing supports few-step DDIM inference.
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
    pipe = pipe.to(device=device, dtype=dtype)

    if torch.cuda.is_available():
        try:
            import xformers  # noqa: F401
            pipe.enable_xformers_memory_efficient_attention()
            print("xFormers enabled for Marigold pipeline.")
        except ImportError:
            print("xFormers not found, running without memory-efficient attention (on GPU).")
    else:
        print("Running on CPU. xFormers memory-efficient attention is not applicable.")

    print(f"MarigoldDepthPipeline loaded successfully from {CHECKPOINT} on {device}.")
except Exception as e:
    print(f"Error loading MarigoldDepthPipeline: {e}")
    pipe = None

DEFAULT_MARIGOLD_ENSEMBLE_SIZE = 1
DEFAULT_MARIGOLD_DENOISE_STEPS = 4
DEFAULT_MARIGOLD_PROCESSING_RES = 768


def process_image(image, max_disparity_ratio, inpaint_radius, ensemble_size, denoise_steps, processing_res):
    """
    Convert a 2D photo to a stereoscopic side-by-side 3D image pair using Marigold for
    depth estimation and depth-image-based rendering (DIBR), with adjustable parameters.
    """
    if pipe is None:
        print("Error: Marigold model not loaded. Cannot process image.")
        return Image.new('RGB', (200, 200), color='red')

    # cv2.inpaint below requires a 3-channel 8-bit image, so drop any alpha channel up front.
    image = image.convert("RGB")
    image_np = np.array(image)
    height, width = image_np.shape[:2]

    try:
        # A fixed seed keeps depth predictions deterministic (important across video frames).
        generator = torch.Generator(device=device).manual_seed(2024)
        marigold_output = pipe(
            image,
            ensemble_size=ensemble_size,
            num_inference_steps=denoise_steps,
            processing_resolution=processing_res,
            batch_size=1 if processing_res == 0 else 2,
            generator=generator,
        ).prediction

        # The prediction may be a NumPy array or a torch tensor depending on the
        # pipeline's output type (by default it is resized to match the input
        # resolution); coerce it to a 2D NumPy array either way.
        depth_map = marigold_output.squeeze()
        if isinstance(depth_map, torch.Tensor):
            depth_map = depth_map.cpu().numpy()

    except Exception as e:
        print(f"Error during Marigold depth estimation: {e}")
        return Image.new('RGB', (200, 200), color='orange')

    # Normalize depth to [0, 1]; guard against a constant map to avoid division by zero.
    if depth_map.max() - depth_map.min() > 0:
        depth_map = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min())
    else:
        depth_map = np.zeros_like(depth_map)

    # Light smoothing reduces stair-stepping artifacts in the disparity map.
    depth_map = cv2.GaussianBlur(depth_map, (5, 5), 0)

    # Near pixels (depth ~ 0) get the largest horizontal shift; far pixels barely move.
    max_disparity_pixels = int(max_disparity_ratio * width)
    disparity_map = max_disparity_pixels * (1 - depth_map)
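    # Worked example (illustrative numbers, not from the source): a 1920-px-wide frame
    # with max_disparity_ratio = 0.03 gives max_disparity_pixels = int(0.03 * 1920) = 57,
    # so the nearest pixels shift by up to 57 px while the farthest barely move.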

    # Allocate the two views; masks start as all-holes (True) and are cleared wherever
    # a source pixel lands during forward mapping.
    left_image = np.zeros_like(image_np)
    right_image = np.zeros_like(image_np)
    left_mask = np.ones((height, width), dtype=bool)
    right_mask = np.ones((height, width), dtype=bool)

    # Forward-map every pixel: shift right for the left-eye view and left for the
    # right-eye view. A per-pixel Python loop is simple but slow on large frames.
    for y in range(height):
        for x in range(width):
            disparity = int(disparity_map[y, x])

            new_x_left = x + disparity
            new_x_right = x - disparity

            if 0 <= new_x_left < width:
                left_image[y, new_x_left] = image_np[y, x]
                left_mask[y, new_x_left] = False

            if 0 <= new_x_right < width:
                right_image[y, new_x_right] = image_np[y, x]
                right_mask[y, new_x_right] = False
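
    # Vectorized sketch of the same forward mapping (an alternative, not in the original;
    # note the overwrite order for colliding pixels may differ from the explicit loop):
    #   ys, xs = np.indices((height, width))
    #   disp = disparity_map.astype(int)
    #   xl = xs + disp
    #   ok = (xl >= 0) & (xl < width)
    #   left_image[ys[ok], xl[ok]] = image_np[ok]
    #   left_mask[ys[ok], xl[ok]] = False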

    # cv2.inpaint expects an 8-bit mask whose non-zero pixels mark the holes to fill.
    left_mask_uint8 = left_mask.astype(np.uint8) * 255
    right_mask_uint8 = right_mask.astype(np.uint8) * 255

    # Fill disocclusion holes left behind by the pixel shifts.
    left_image_inpaint = cv2.inpaint(left_image, left_mask_uint8, inpaint_radius, cv2.INPAINT_TELEA)
    right_image_inpaint = cv2.inpaint(right_image, right_mask_uint8, inpaint_radius, cv2.INPAINT_TELEA)

    # Side-by-side layout: left-eye view on the left half, right-eye view on the right.
    stereo_image = np.hstack((left_image_inpaint, right_image_inpaint))

    return Image.fromarray(stereo_image)
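
# Minimal usage sketch for process_image outside the UI (illustrative; "photo.jpg"
# and the parameter values are assumptions, not from the original):
#   img = Image.open("photo.jpg")
#   sbs = process_image(img, max_disparity_ratio=0.03, inpaint_radius=5,
#                       ensemble_size=1, denoise_steps=4, processing_res=768)
#   sbs.save("photo_sbs.jpg")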


def process_video(video_path, max_disparity_ratio, inpaint_radius, ensemble_size, denoise_steps, processing_res):
    """
    Convert a 2D video to a stereoscopic side-by-side 3D video by processing each frame.
    """
    if pipe is None:
        print("Error: Marigold model not loaded. Cannot process video.")
        return None

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video file at {video_path}")
        return None

    fps = cap.get(cv2.CAP_PROP_FPS)
    original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Side-by-side output doubles the frame width.
    output_width = original_width * 2
    output_height = original_height

    temp_output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(temp_output_video_path, fourcc, fps, (output_width, output_height))

    if not out.isOpened():
        print(f"Error: Could not create video writer for {temp_output_video_path}")
        cap.release()
        return None
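    # Note: 'mp4v' (MPEG-4 Part 2) keeps the writer dependency-free, but some browsers
    # cannot play it inline; if the Gradio player shows a blank video, re-encoding the
    # result to H.264 (e.g. with ffmpeg) is a common workaround.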

    frame_count = 0
    while True:
        ret, frame_bgr = cap.read()
        if not ret:
            break

        # OpenCV decodes frames as BGR; Marigold and PIL expect RGB.
        frame_rgb_pil = Image.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))

        processed_frame_pil = process_image(
            frame_rgb_pil,
            max_disparity_ratio,
            inpaint_radius,
            ensemble_size,
            denoise_steps,
            processing_res
        )

        # process_image returns a small placeholder image on failure; a frame of the
        # wrong size would be silently dropped by VideoWriter, so write black instead.
        if processed_frame_pil is None or processed_frame_pil.size != (output_width, output_height):
            print(f"Skipping frame {frame_count} due to processing error.")
            processed_frame_bgr = np.zeros((output_height, output_width, 3), dtype=np.uint8)
        else:
            processed_frame_np_rgb = np.array(processed_frame_pil)
            processed_frame_bgr = cv2.cvtColor(processed_frame_np_rgb, cv2.COLOR_RGB2BGR)

        out.write(processed_frame_bgr)
        frame_count += 1
        print(f"Processed frame {frame_count}...")

    cap.release()
    out.release()
    print(f"Finished processing {frame_count} frames. Output video saved to: {temp_output_video_path}")
    return temp_output_video_path


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 2D to Stereoscopic 3D Converter (with Marigold Depth)
        Upload a 2D photo or video to generate a stereoscopic 3D image or video pair for viewing on a Quest headset.
        The output is in side-by-side format: the left half is for the left eye, the right half for the right eye.
        Adjust the sliders to fine-tune the 3D effect and Marigold's depth estimation.
        """
    )

    with gr.Row():
        max_disparity_slider = gr.Slider(
            minimum=0.01,
            maximum=0.10,
            value=0.03,
            step=0.005,
            label="Max Disparity Ratio (controls 3D intensity)",
            info="Higher values mean a stronger 3D effect, but can cause more distortion."
        )
        inpaint_radius_slider = gr.Slider(
            minimum=1,
            maximum=20,
            value=5,
            step=1,
            label="Inpainting Radius (controls hole filling)",
            info="Larger values fill holes more, but can blur details around shifted objects."
        )

    with gr.Accordion("Marigold Depth Estimation Settings", open=False):
        with gr.Row():
            ensemble_size_slider = gr.Slider(
                label="Marigold Ensemble size",
                minimum=1,
                maximum=10,
                step=1,
                value=DEFAULT_MARIGOLD_ENSEMBLE_SIZE,
                info="Higher values improve accuracy but increase processing time."
            )
            denoise_steps_slider = gr.Slider(
                label="Marigold Denoising steps",
                minimum=1,
                maximum=20,
                step=1,
                value=DEFAULT_MARIGOLD_DENOISE_STEPS,
                info="More steps improve quality but increase processing time."
            )
            processing_res_radio = gr.Radio(
                [
                    ("Native", 0),
                    ("Recommended (768)", 768),
                    ("High (1024)", 1024)
                ],
                label="Marigold Processing resolution",
                value=DEFAULT_MARIGOLD_PROCESSING_RES,
                info="Resolution for Marigold's internal processing. Native uses the original image resolution. Higher resolutions are more accurate but slower."
            )

    with gr.Tabs():
        with gr.TabItem("Image Conversion"):
            with gr.Row():
                with gr.Column():
                    image_input = gr.Image(type="pil", label="Upload a 2D Photo")
                    image_process_button = gr.Button("Convert Image to 3D")
                with gr.Column():
                    image_output = gr.Image(type="pil", label="Stereoscopic 3D Image Output (Side-by-Side)")

            image_process_button.click(
                fn=process_image,
                inputs=[
                    image_input,
                    max_disparity_slider,
                    inpaint_radius_slider,
                    ensemble_size_slider,
                    denoise_steps_slider,
                    processing_res_radio
                ],
                outputs=image_output
            )

        with gr.TabItem("Video Conversion"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="Upload a 2D MP4 Video")
                    video_process_button = gr.Button("Convert Video to 3D")
                with gr.Column():
                    video_output = gr.Video(label="Stereoscopic 3D Video Output (Side-by-Side)")

            video_process_button.click(
                fn=process_video,
                inputs=[
                    video_input,
                    max_disparity_slider,
                    inpaint_radius_slider,
                    ensemble_size_slider,
                    denoise_steps_slider,
                    processing_res_radio
                ],
                outputs=video_output
            )
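
# For long-running video conversions, enabling Gradio's request queue is a common
# pattern (an illustrative alternative, not in the original): demo.queue().launch()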


if __name__ == '__main__':
    demo.launch()