import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
import tempfile
import os
import accelerate  # noqa: F401 -- not used directly; presumably kept so a missing install fails at startup

from diffusers import MarigoldDepthPipeline, DDIMScheduler
from huggingface_hub import login

CHECKPOINT = "prs-eth/marigold-depth-v1-1"

# Authenticate with the Hugging Face Hub if a token is provided (needed for gated checkpoints).
if "HF_TOKEN_LOGIN" in os.environ:
    login(token=os.environ["HF_TOKEN_LOGIN"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# bfloat16 roughly halves GPU memory use; CPU inference stays in float32.
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

# Load the Marigold depth pipeline once at startup.
try:
    pipe = MarigoldDepthPipeline.from_pretrained(CHECKPOINT)
    # "trailing" timestep spacing supports few-step DDIM inference.
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
    pipe = pipe.to(device=device, dtype=dtype)

    if torch.cuda.is_available():
        try:
            import xformers  # noqa: F401
            pipe.enable_xformers_memory_efficient_attention()
            print("xFormers enabled for Marigold pipeline.")
        except ImportError:
            print("xFormers not found, running without memory-efficient attention (on GPU).")
    else:
        print("Running on CPU. xFormers memory-efficient attention is not applicable.")

    print(f"MarigoldDepthPipeline loaded successfully from {CHECKPOINT} on {device}.")
except Exception as e:
    print(f"Error loading MarigoldDepthPipeline: {e}")
    pipe = None

DEFAULT_MARIGOLD_ENSEMBLE_SIZE = 1
DEFAULT_MARIGOLD_DENOISE_STEPS = 4
DEFAULT_MARIGOLD_PROCESSING_RES = 768


def process_image(image, max_disparity_ratio, inpaint_radius, ensemble_size, denoise_steps, processing_res):
    """
    Convert a 2D photo to a stereoscopic side-by-side 3D image pair using Marigold for
    depth estimation and depth-image-based rendering (DIBR), with adjustable parameters.
    """
    if pipe is None:
        print("Error: Marigold model not loaded. Cannot process image.")
        return Image.new('RGB', (200, 200), color='red')

    # cv2.inpaint below requires a 3-channel 8-bit image, so drop any alpha channel up front.
    image = image.convert("RGB")
    image_np = np.array(image)
    height, width = image_np.shape[:2]

    try:
        # A fixed seed keeps depth predictions deterministic (important across video frames).
        generator = torch.Generator(device=device).manual_seed(2024)
        marigold_output = pipe(
            image,
            ensemble_size=ensemble_size,
            num_inference_steps=denoise_steps,
            processing_resolution=processing_res,
            batch_size=1 if processing_res == 0 else 2,
            generator=generator,
        ).prediction

        # The prediction may be a NumPy array or a torch tensor depending on the
        # pipeline's output type (by default it is resized to match the input
        # resolution); coerce it to a 2D NumPy array either way.
        depth_map = marigold_output.squeeze()
        if isinstance(depth_map, torch.Tensor):
            depth_map = depth_map.cpu().numpy()

    except Exception as e:
        print(f"Error during Marigold depth estimation: {e}")
        return Image.new('RGB', (200, 200), color='orange')

    # Normalize depth to [0, 1]; guard against a constant map to avoid division by zero.
    if depth_map.max() - depth_map.min() > 0:
        depth_map = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min())
    else:
        depth_map = np.zeros_like(depth_map)

    # Light smoothing reduces stair-stepping artifacts in the disparity map.
    depth_map = cv2.GaussianBlur(depth_map, (5, 5), 0)

    # Near pixels (depth ~ 0) get the largest horizontal shift; far pixels barely move.
    max_disparity_pixels = int(max_disparity_ratio * width)
    disparity_map = max_disparity_pixels * (1 - depth_map)
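    # Worked example (illustrative numbers, not from the source): a 1920-px-wide frame
    # with max_disparity_ratio = 0.03 gives max_disparity_pixels = int(0.03 * 1920) = 57,
    # so the nearest pixels shift by up to 57 px while the farthest barely move.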

    # Allocate the two views; masks start as all-holes (True) and are cleared wherever
    # a source pixel lands during forward mapping.
    left_image = np.zeros_like(image_np)
    right_image = np.zeros_like(image_np)
    left_mask = np.ones((height, width), dtype=bool)
    right_mask = np.ones((height, width), dtype=bool)

    # Forward-map every pixel: shift right for the left-eye view and left for the
    # right-eye view. A per-pixel Python loop is simple but slow on large frames.
    for y in range(height):
        for x in range(width):
            disparity = int(disparity_map[y, x])

            new_x_left = x + disparity
            new_x_right = x - disparity

            if 0 <= new_x_left < width:
                left_image[y, new_x_left] = image_np[y, x]
                left_mask[y, new_x_left] = False

            if 0 <= new_x_right < width:
                right_image[y, new_x_right] = image_np[y, x]
                right_mask[y, new_x_right] = False
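
    # Vectorized sketch of the same forward mapping (an alternative, not in the original;
    # note the overwrite order for colliding pixels may differ from the explicit loop):
    #   ys, xs = np.indices((height, width))
    #   disp = disparity_map.astype(int)
    #   xl = xs + disp
    #   ok = (xl >= 0) & (xl < width)
    #   left_image[ys[ok], xl[ok]] = image_np[ok]
    #   left_mask[ys[ok], xl[ok]] = False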

    # cv2.inpaint expects an 8-bit mask whose non-zero pixels mark the holes to fill.
    left_mask_uint8 = left_mask.astype(np.uint8) * 255
    right_mask_uint8 = right_mask.astype(np.uint8) * 255

    # Fill disocclusion holes left behind by the pixel shifts.
    left_image_inpaint = cv2.inpaint(left_image, left_mask_uint8, inpaint_radius, cv2.INPAINT_TELEA)
    right_image_inpaint = cv2.inpaint(right_image, right_mask_uint8, inpaint_radius, cv2.INPAINT_TELEA)

    # Side-by-side layout: left-eye view on the left half, right-eye view on the right.
    stereo_image = np.hstack((left_image_inpaint, right_image_inpaint))

    return Image.fromarray(stereo_image)
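
# Minimal usage sketch for process_image outside the UI (illustrative; "photo.jpg"
# and the parameter values are assumptions, not from the original):
#   img = Image.open("photo.jpg")
#   sbs = process_image(img, max_disparity_ratio=0.03, inpaint_radius=5,
#                       ensemble_size=1, denoise_steps=4, processing_res=768)
#   sbs.save("photo_sbs.jpg")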


def process_video(video_path, max_disparity_ratio, inpaint_radius, ensemble_size, denoise_steps, processing_res):
    """
    Convert a 2D video to a stereoscopic side-by-side 3D video by processing each frame.
    """
    if pipe is None:
        print("Error: Marigold model not loaded. Cannot process video.")
        return None

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video file at {video_path}")
        return None

    fps = cap.get(cv2.CAP_PROP_FPS)
    original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Side-by-side output doubles the frame width.
    output_width = original_width * 2
    output_height = original_height

    temp_output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(temp_output_video_path, fourcc, fps, (output_width, output_height))

    if not out.isOpened():
        print(f"Error: Could not create video writer for {temp_output_video_path}")
        cap.release()
        return None
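    # Note: 'mp4v' (MPEG-4 Part 2) keeps the writer dependency-free, but some browsers
    # cannot play it inline; if the Gradio player shows a blank video, re-encoding the
    # result to H.264 (e.g. with ffmpeg) is a common workaround.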

    frame_count = 0
    while True:
        ret, frame_bgr = cap.read()
        if not ret:
            break

        # OpenCV decodes frames as BGR; Marigold and PIL expect RGB.
        frame_rgb_pil = Image.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))

        processed_frame_pil = process_image(
            frame_rgb_pil,
            max_disparity_ratio,
            inpaint_radius,
            ensemble_size,
            denoise_steps,
            processing_res
        )

        # process_image returns a small placeholder image on failure; a frame of the
        # wrong size would be silently dropped by VideoWriter, so write black instead.
        if processed_frame_pil is None or processed_frame_pil.size != (output_width, output_height):
            print(f"Skipping frame {frame_count} due to processing error.")
            processed_frame_bgr = np.zeros((output_height, output_width, 3), dtype=np.uint8)
        else:
            processed_frame_np_rgb = np.array(processed_frame_pil)
            processed_frame_bgr = cv2.cvtColor(processed_frame_np_rgb, cv2.COLOR_RGB2BGR)

        out.write(processed_frame_bgr)
        frame_count += 1
        print(f"Processed frame {frame_count}...")

    cap.release()
    out.release()
    print(f"Finished processing {frame_count} frames. Output video saved to: {temp_output_video_path}")
    return temp_output_video_path


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 2D to Stereoscopic 3D Converter (with Marigold Depth)
        Upload a 2D photo or video to generate a stereoscopic 3D image or video pair for viewing on a Quest headset.
        The output is in side-by-side format: the left half is for the left eye, the right half for the right eye.
        Adjust the sliders to fine-tune the 3D effect and Marigold's depth estimation.
        """
    )

    with gr.Row():
        max_disparity_slider = gr.Slider(
            minimum=0.01,
            maximum=0.10,
            value=0.03,
            step=0.005,
            label="Max Disparity Ratio (controls 3D intensity)",
            info="Higher values mean a stronger 3D effect, but can cause more distortion."
        )
        inpaint_radius_slider = gr.Slider(
            minimum=1,
            maximum=20,
            value=5,
            step=1,
            label="Inpainting Radius (controls hole filling)",
            info="Larger values fill holes more, but can blur details around shifted objects."
        )

    with gr.Accordion("Marigold Depth Estimation Settings", open=False):
        with gr.Row():
            ensemble_size_slider = gr.Slider(
                label="Marigold Ensemble size",
                minimum=1,
                maximum=10,
                step=1,
                value=DEFAULT_MARIGOLD_ENSEMBLE_SIZE,
                info="Higher values improve accuracy but increase processing time."
            )
            denoise_steps_slider = gr.Slider(
                label="Marigold Denoising steps",
                minimum=1,
                maximum=20,
                step=1,
                value=DEFAULT_MARIGOLD_DENOISE_STEPS,
                info="More steps improve quality but increase processing time."
            )
            processing_res_radio = gr.Radio(
                [
                    ("Native", 0),
                    ("Recommended (768)", 768),
                    ("High (1024)", 1024)
                ],
                label="Marigold Processing resolution",
                value=DEFAULT_MARIGOLD_PROCESSING_RES,
                info="Resolution for Marigold's internal processing. Native uses the original image resolution. Higher resolutions are more accurate but slower."
            )

    with gr.Tabs():
        with gr.TabItem("Image Conversion"):
            with gr.Row():
                with gr.Column():
                    image_input = gr.Image(type="pil", label="Upload a 2D Photo")
                    image_process_button = gr.Button("Convert Image to 3D")
                with gr.Column():
                    image_output = gr.Image(type="pil", label="Stereoscopic 3D Image Output (Side-by-Side)")

            image_process_button.click(
                fn=process_image,
                inputs=[
                    image_input,
                    max_disparity_slider,
                    inpaint_radius_slider,
                    ensemble_size_slider,
                    denoise_steps_slider,
                    processing_res_radio
                ],
                outputs=image_output
            )

        with gr.TabItem("Video Conversion"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="Upload a 2D MP4 Video")
                    video_process_button = gr.Button("Convert Video to 3D")
                with gr.Column():
                    video_output = gr.Video(label="Stereoscopic 3D Video Output (Side-by-Side)")

            video_process_button.click(
                fn=process_video,
                inputs=[
                    video_input,
                    max_disparity_slider,
                    inpaint_radius_slider,
                    ensemble_size_slider,
                    denoise_steps_slider,
                    processing_res_radio
                ],
                outputs=video_output
            )
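
# For long-running video conversions, enabling Gradio's request queue is a common
# pattern (an illustrative alternative, not in the original): demo.queue().launch()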


if __name__ == '__main__':
    demo.launch()