Spaces:
Sleeping
Sleeping
| # -*- coding: utf-8 -*- | |
| """ | |
| CS5330-HW4: Parallax Effect Gradio App | |
| Converted from Colab notebook. | |
| (V4: Final fix for halo/border artifact. Uses correct mask.) | |
| """ | |
| import torch | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| from PIL import Image | |
| from transformers import DPTImageProcessor, DPTForDepthEstimation | |
| import cv2 | |
| import imageio.v2 as imageio | |
| import gradio as gr | |
| import time # To create unique filenames | |
| # ================================================================== | |
| # Global Transformer Setup | |
| # ================================================================== | |
# Load the DPT processor and network once at import time; the request handler
# below reads the module-level `processor`, `model` and `device` names.
print("Loading Intel DPT depth estimation model...")
processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
# Use CUDA when torch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large").to(device)
model.eval()  # inference mode
print(f"Model loaded on {device}. Gradio app is ready.")
| # ================================================================== | |
| # Helper Function 1: Get Depth Map | |
| # ================================================================== | |
def get_depth_map(pil_image, processor, model, device):
    """Predict a depth map for ``pil_image`` and min-max scale it to [0, 1].

    Args:
        pil_image: Input image. Only its ``size`` attribute, read as
            ``(width, height)``, is used here; the object itself is handed
            to ``processor``.
        processor: Callable invoked as
            ``processor(images=pil_image, return_tensors="pt")``; must return
            a mapping whose values have a ``.to(device)`` method.
        model: Callable invoked as ``model(**inputs)``; its result must expose
            a ``predicted_depth`` tensor.
        device: Torch device the processor outputs are moved to.

    Returns:
        NumPy array resized to ``(height, width)`` of the input and scaled so
        the smallest value is 0 and the largest is 1. When the prediction is
        constant (zero range) an all-zero map is returned instead of NaNs.
    """
    print("... (1/5) Extracting depth map")
    inputs = processor(images=pil_image, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        predicted_depth = outputs.predicted_depth

    # assumes predicted_depth is (batch, H, W); unsqueeze adds the channel
    # axis that interpolate() expects -- TODO confirm against the model docs.
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=pil_image.size[::-1],  # PIL size is (W, H); interpolate wants (H, W)
        mode="bicubic",
        align_corners=False,
    )
    depth_map = prediction.squeeze().cpu().numpy()

    depth_min = depth_map.min()
    depth_range = depth_map.max() - depth_min
    # A flat (or NaN) range would make the division below produce NaN/inf,
    # which then poisons every downstream threshold and percentile.
    if not depth_range > 1e-8:
        return np.zeros_like(depth_map)
    return (depth_map - depth_min) / depth_range
| # ================================================================== | |
| # Helper Function 2: Layer Separation | |
| # ================================================================== | |
| # This function returns mask_clean (hard) and mask_soft (soft/full-size) | |
def separate_foreground_background(image, depth_map, *,
                                   assume_bgr_input=True,
                                   near_is_foreground=True,
                                   foreground_depth_is_high=True):
    """Split an image into foreground and background layers via its depth map.

    The depth map is scaled to 8 bits, blurred and Otsu-thresholded; the
    result is cleaned with morphological open/close and reduced to its
    largest connected component.

    Args:
        image: Image as an array or anything ``np.array`` accepts. A
            3-channel input is converted BGR->RGB when ``assume_bgr_input``
            is truthy; a 2-D input is expanded to 3 channels.
        depth_map: Depth values; if 3-D only the first channel is used.
        assume_bgr_input: Whether a 3-channel ``image`` is in BGR order.
        near_is_foreground: Combined with ``foreground_depth_is_high`` to
            pick the threshold polarity.
        foreground_depth_is_high: When this agrees with
            ``near_is_foreground`` pixels above the Otsu threshold form the
            mask; when the two disagree, pixels below it do.

    Returns:
        ``(foreground, background, mask_clean, mask_soft)`` where the two
        layers are uint8 images weighted by the soft mask and its complement,
        ``mask_clean`` is the hard 0/255 uint8 mask and ``mask_soft`` is its
        Gaussian-blurred float32 version divided by 255.
    """
    print("... (2/5) Separating layers")

    image = image if isinstance(image, np.ndarray) else np.array(image)
    depth_map = depth_map if isinstance(depth_map, np.ndarray) else np.array(depth_map)

    # Bring the image to 3-channel RGB and the depth map to a single channel.
    if image.ndim == 3 and image.shape[2] == 3 and assume_bgr_input:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    if image.ndim == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    if depth_map.ndim == 3:
        depth_map = depth_map[:, :, 0]

    depth_u8 = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    depth_blurred = cv2.GaussianBlur(depth_u8, (5, 5), 0)

    # The two flags select the polarity together: agreeing flags keep the
    # high side of the threshold, disagreeing flags keep the low side.
    flags_agree = bool(near_is_foreground) == bool(foreground_depth_is_high)
    polarity = cv2.THRESH_BINARY if flags_agree else cv2.THRESH_BINARY_INV
    _, otsu_mask = cv2.threshold(depth_blurred, 0, 255, polarity + cv2.THRESH_OTSU)

    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
    mask_clean = cv2.morphologyEx(otsu_mask, cv2.MORPH_OPEN, ellipse, iterations=1)
    mask_clean = cv2.morphologyEx(mask_clean, cv2.MORPH_CLOSE, ellipse, iterations=2)

    # Keep only the largest non-background component (label 0 is background).
    component_count, component_ids, component_stats, _ = \
        cv2.connectedComponentsWithStats(mask_clean, 8)
    if component_count > 1:
        biggest = 1 + np.argmax(component_stats[1:, cv2.CC_STAT_AREA])
        mask_clean = (component_ids == biggest).astype(np.uint8) * 255

    # Feathered version of the hard mask, used as per-pixel blend weights.
    mask_soft = cv2.GaussianBlur(mask_clean, (9, 9), 5).astype(np.float32) / 255.0
    weights = np.dstack([mask_soft, mask_soft, mask_soft])
    rgb_unit = image.astype(np.float32) / 255.0

    foreground = (np.clip(rgb_unit * weights, 0, 1) * 255.0).astype(np.uint8)
    background = (np.clip(rgb_unit * (1.0 - weights), 0, 1) * 255.0).astype(np.uint8)
    return foreground, background, mask_clean, mask_soft
| # ================================================================== | |
| # Helper Function 3: Background Reconstruction | |
| # ================================================================== | |
| # This function returns final_bg (inpainted background) and alpha_no_halo (eroded mask) | |
| # Note: We no longer use alpha_no_halo for the animation, but the function is fine. | |
def reconstruct_background(background, mask_hard, original_image_np):
    """Fill the foreground hole in the background layer and build an eroded alpha.

    Args:
        background: Background image passed to ``cv2.inpaint``.
        mask_hard: Hard foreground mask; pixels equal to 255 after dilation
            are the ones replaced by inpainted content.
        original_image_np: Not used by this function; kept in the signature
            for existing callers.

    Returns:
        ``(final_bg, alpha_no_halo)``: the background with the dilated mask
        region replaced by smoothed inpainting, and a float32 ``HxWx1`` alpha
        in [0, 1] derived from the distance transform of the eroded mask.
    """
    print("... (3/5) Reconstructing background")

    # Grow the hole slightly so the inpainting also covers the soft mask edge.
    grown_mask = cv2.dilate(mask_hard, np.ones((7, 7), np.uint8), iterations=1)
    inpainted = cv2.inpaint(background, grown_mask, inpaintRadius=6, flags=cv2.INPAINT_TELEA)
    smoothed = cv2.bilateralFilter(inpainted, d=9, sigmaColor=75, sigmaSpace=75)
    inside_hole = grown_mask[..., None] == 255
    final_bg = np.where(inside_hole, smoothed, background)

    # Alpha ramps from 0 at the eroded mask boundary to 1 six pixels inside it.
    small_ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    shrunk_mask = cv2.erode(mask_hard, small_ellipse, iterations=1)
    edge_distance = cv2.distanceTransform(shrunk_mask, cv2.DIST_L2, 5)
    alpha_no_halo = np.clip(edge_distance / 6.0, 0, 1).astype(np.float32)[..., None]  # HxWx1

    return final_bg, alpha_no_halo
| # ================================================================== | |
| # Helper Function 4: Animation | |
| # ================================================================== | |
| # This is the animation function (from V2 logic), which is correct (uses normalization to prevent gaps). | |
def create_multi_layer_animation(
    image_original,
    background_clean,
    alpha_mask,
    depth_map,
    n_frames=60,
    parallax_strength=12,
    blur_strength=1.0,
    direction='right',
    zoom_center=1.10,
    zoom_peak=1.05
):
    """Render the frames of a three-layer (foreground / mid / far) parallax loop.

    Each frame shifts the three layers by different amounts along
    ``direction`` following ``sin`` over one full period, zooms them about the
    image centre, blurs the two background layers and blends everything with
    per-pixel masks that are re-normalised to sum to one.

    Args:
        image_original: Source image; supplies the foreground layer and the
            output height/width.
        background_clean: Inpainted background; supplies the mid and far layers.
        alpha_mask: Foreground mask, 2-D or ``HxWx1``. Values above 1 are
            treated as a 0-255 mask and divided by 255.
        depth_map: Depth values used to split the background into mid and far
            regions at the median depth of the pixels where ``alpha_mask < 0.5``.
        n_frames: Number of frames to generate.
        parallax_strength: Peak foreground shift in pixels; the mid layer moves
            half as far and the far layer one sixth as far.
        blur_strength: Multiplier on the base Gaussian kernel sizes (9 for the
            mid layer, 35 for the far layer); kernels are forced odd and >= 1.
        direction: One of ``'right'``, ``'left'``, ``'up'``, ``'down'``;
            anything else falls back to ``'right'``.
        zoom_center: Zoom factor at the rest position (``sin == 0``).
        zoom_peak: Zoom factor at maximum displacement (``|sin| == 1``).

    Returns:
        List of ``n_frames`` uint8 frames.
    """
    print(f"... (4/5) Generating {n_frames} animation frames")
    print(f" Params: Parallax={parallax_strength}px, Blur={blur_strength}x, Dir={direction}")
    h, w = image_original.shape[:2]
    size = (w, h)

    # --- 1. Prepare motion and blur settings ---
    direction_map = {'right': (1, 0), 'left': (-1, 0), 'up': (0, -1), 'down': (0, 1)}
    dx, dy = direction_map.get(direction, (1, 0))
    fg_shift = parallax_strength
    mid_shift = parallax_strength * 0.5
    far_shift = parallax_strength * (2 / 12)

    base_mid_k = 9
    base_far_k = 35
    mid_k_raw = int(base_mid_k * blur_strength)
    far_k_raw = int(base_far_k * blur_strength)
    # GaussianBlur needs odd, positive kernel sizes: bump even values by one
    # and clamp to 1 (a 1x1 kernel leaves the layer unblurred).
    mid_k = (mid_k_raw + 1) if (mid_k_raw > 0 and mid_k_raw % 2 == 0) else max(1, mid_k_raw)
    far_k = (far_k_raw + 1) if (far_k_raw > 0 and far_k_raw % 2 == 0) else max(1, far_k_raw)
    mid_blur_ksize = (mid_k, mid_k)
    far_blur_ksize = (far_k, far_k)
    print(f" ...Using blur kernels: Mid={mid_blur_ksize}, Far={far_blur_ksize}")

    # --- 2. Prepare base masks (FG vs BG) ---
    if alpha_mask.max() > 1:
        alpha_mask = alpha_mask.astype(np.float32) / 255.0
    if alpha_mask.ndim == 2:
        alpha_mask = alpha_mask[..., None]
    fg_mask_3ch = np.repeat(alpha_mask, 3, axis=2)
    bg_mask_3ch = 1.0 - fg_mask_3ch

    # --- 3. Create mid-ground / far-ground masks ---
    if depth_map.ndim == 3:
        # NOTE(review): cvtColor may reject float64 input -- confirm callers
        # only pass 3-channel depth maps in a supported dtype.
        depth_map = cv2.cvtColor(depth_map, cv2.COLOR_BGR2GRAY)
    # Split the background at the median depth of the non-foreground pixels.
    bg_depth_values = depth_map[alpha_mask[..., 0] < 0.5]
    if len(bg_depth_values) > 0:
        bg_split_threshold = np.percentile(bg_depth_values, 50)
    else:
        bg_split_threshold = 0.5
    raw_mid_mask = (depth_map > bg_split_threshold).astype(np.float32)
    raw_mid_mask_smooth = cv2.GaussianBlur(raw_mid_mask, (21, 21), 0)
    if raw_mid_mask_smooth.ndim == 2:
        raw_mid_mask_smooth = raw_mid_mask_smooth[..., None]
    raw_mid_mask_smooth_3ch = np.repeat(raw_mid_mask_smooth, 3, axis=2)

    # --- 4. Final three masks; before warping they sum to one everywhere ---
    mid_mask_3ch = raw_mid_mask_smooth_3ch * bg_mask_3ch
    far_mask_3ch = (1.0 - raw_mid_mask_smooth_3ch) * bg_mask_3ch

    # Loop-invariant zoom parameters.
    zoom_range = zoom_center - zoom_peak
    center = (w / 2, h / 2)

    frames = []
    # --- 5. Loop to generate each frame ---
    for i in range(n_frames):
        phase = (i / n_frames) * 2 * np.pi
        ease = np.sin(phase)
        scale = zoom_center - (zoom_range * abs(ease))
        M_scale = cv2.getRotationMatrix2D(center, 0, scale)
        M_fg_trans = np.float32([[1, 0, dx * ease * fg_shift], [0, 1, dy * ease * fg_shift]])
        M_mid_trans = np.float32([[1, 0, dx * ease * mid_shift], [0, 1, dy * ease * mid_shift]])
        M_far_trans = np.float32([[1, 0, dx * ease * far_shift], [0, 1, dy * ease * far_shift]])

        # --- Layer transforms ---
        fg_final = _translate_then_zoom(
            image_original, M_fg_trans, M_scale, size, cv2.BORDER_REFLECT_101
        ).astype(np.float32)
        mid_final = cv2.GaussianBlur(
            _translate_then_zoom(background_clean, M_mid_trans, M_scale, size, cv2.BORDER_REPLICATE),
            mid_blur_ksize, 0,
        ).astype(np.float32)
        far_final = cv2.GaussianBlur(
            _translate_then_zoom(background_clean, M_far_trans, M_scale, size, cv2.BORDER_REPLICATE),
            far_blur_ksize, 0,
        ).astype(np.float32)

        # --- Mask transforms ---
        # Masks are warped with a replicating border. With the default
        # zero-filled border, any pixel whose layers were all shifted in from
        # outside the image has zero total weight and renders black, because
        # the renormalisation below cannot recover coverage from nothing.
        fg_mask_warped = _translate_then_zoom(
            fg_mask_3ch, M_fg_trans, M_scale, size, cv2.BORDER_REPLICATE)
        mid_mask_warped = _translate_then_zoom(
            mid_mask_3ch, M_mid_trans, M_scale, size, cv2.BORDER_REPLICATE)
        far_mask_warped = _translate_then_zoom(
            far_mask_3ch, M_far_trans, M_scale, size, cv2.BORDER_REPLICATE)

        # --- Final composite ---
        # The layers move by different amounts, so their masks no longer sum
        # to one after warping; renormalise to avoid gaps and overlaps.
        total_mask = fg_mask_warped + mid_mask_warped + far_mask_warped + 1e-6
        fg_mask_warped /= total_mask
        mid_mask_warped /= total_mask
        far_mask_warped /= total_mask

        composite = (fg_final * fg_mask_warped) + \
                    (mid_final * mid_mask_warped) + \
                    (far_final * far_mask_warped)
        frames.append(np.clip(composite, 0, 255).astype(np.uint8))

    print(f"... (4/5) Frame generation complete.")
    return frames


def _translate_then_zoom(layer, m_translate, m_zoom, size, border_mode):
    """Warp ``layer`` by ``m_translate`` and then ``m_zoom``, using ``border_mode`` for both."""
    shifted = cv2.warpAffine(layer, m_translate, size, borderMode=border_mode)
    return cv2.warpAffine(shifted, m_zoom, size, borderMode=border_mode)
| # ================================================================== | |
| # MAIN GRADIO FUNCTION (Ties everything together) | |
| # ================================================================== | |
def generate_parallax_effect(input_image_np, parallax_strength, blur_strength, animation_direction):
    """Run the full pipeline on one uploaded image and return the GIF path.

    Steps: downscale so the longest side is at most 640 px, estimate depth,
    split foreground/background, inpaint the background, render the parallax
    frames and write them to a GIF in the current working directory.

    Args:
        input_image_np: Uploaded image as an array accepted by
            ``Image.fromarray``; ``None`` when nothing was uploaded.
        parallax_strength: Forwarded to ``create_multi_layer_animation``.
        blur_strength: Forwarded to ``create_multi_layer_animation``.
        animation_direction: Forwarded as ``direction``.

    Returns:
        Filename of the saved GIF.

    Raises:
        gr.Error: If no image was provided.
    """
    import uuid  # local import: only needed to build a collision-free filename

    print("\n--- Processing new image ---")

    # Submitting with no image hands us None, which would otherwise surface
    # as an opaque failure from Image.fromarray.
    if input_image_np is None:
        raise gr.Error("Please upload an image before clicking Submit.")

    # --- 0. Image Preparation ---
    image_pil = Image.fromarray(input_image_np).convert('RGB')
    max_size = 640
    if max(image_pil.size) > max_size:
        ratio = max_size / max(image_pil.size)
        # Clamp to 1 px: extreme aspect ratios would otherwise truncate the
        # short side to 0, which is not a valid image size.
        new_size = tuple(max(1, int(dim * ratio)) for dim in image_pil.size)
        image_pil = image_pil.resize(new_size, Image.LANCZOS)
    image_resized_np = np.array(image_pil)
    print(f"Image resized to: {image_pil.size}")

    # --- 1. Get Depth Map ---
    depth_map_0_1 = get_depth_map(image_pil, processor, model, device)

    # --- 2. Layer Separation ---
    # Only the background layer and the two masks are needed downstream.
    _foreground, background, mask_hard, mask_soft = separate_foreground_background(
        image_pil,
        depth_map_0_1,
        assume_bgr_input=False,
        near_is_foreground=True,
        foreground_depth_is_high=True
    )

    # --- 3. Background Reconstruction ---
    # The eroded alpha is returned as well but is not used for the animation.
    final_bg, _alpha_no_halo = reconstruct_background(background, mask_hard, image_resized_np)

    # --- 4. Animation ---
    # The soft mask is passed as the foreground alpha.
    multi_layer_frames = create_multi_layer_animation(
        image_original=image_resized_np,
        background_clean=final_bg,
        alpha_mask=mask_soft,
        depth_map=depth_map_0_1,
        n_frames=60,
        parallax_strength=parallax_strength,
        blur_strength=blur_strength,
        direction=animation_direction
    )

    # --- 5. Save GIF and Return Path ---
    print("... (5/5) Saving final GIF")
    timestamp = int(time.time())
    # A whole-second timestamp alone collides when two requests finish in the
    # same second, so a random suffix keeps each request's file separate.
    output_filename = f'parallax_final_{timestamp}_{uuid.uuid4().hex[:8]}.gif'
    # This writes to the server's disk; it does not trigger a browser download.
    imageio.mimsave(output_filename, multi_layer_frames, duration=0.04, loop=0)
    print(f"--- Processing complete! Saved to {output_filename} ---")
    return output_filename
| # ================================================================== | |
| # Gradio Interface (Modified) | |
| # ================================================================== | |
print("Creating Gradio interface...")

# --- Inputs: the uploaded image plus the three tunable animation parameters ---
input_image = gr.Image(label="1. Upload Your Image", type="numpy")
param_parallax = gr.Slider(
    label="2. Parallax Strength (px)",
    info="Foreground motion in pixels. Higher = stronger 3D effect.",
    minimum=0, maximum=30, step=1, value=12,
)
param_blur = gr.Slider(
    label="3. Aperture / Blur Strength",
    info="Controls background blur (bokeh). 0 = no blur, 1 = default, 2 = max blur.",
    minimum=0.0, maximum=2.0, step=0.1, value=1.0,
)
param_direction = gr.Dropdown(
    label="4. Animation Direction",
    choices=["right", "left", "up", "down"],
    value="right",
)

# --- Output: a single image component that displays the generated GIF ---
# The description below points users to this component's download button.
output_gif = gr.Image(label="Generated Parallax GIF")

# --- Interface: wire the four inputs to the pipeline function ---
iface = gr.Interface(
    fn=generate_parallax_effect,
    inputs=[input_image, param_parallax, param_blur, param_direction],
    outputs=output_gif,
    title="📸 3D Parallax Photo Animator (CS5330-HW4)",
    description="""
Upload a photo (ideally with a clear foreground and background) to generate a 3D parallax and depth-of-field animation.
1. Upload an image.
2. Adjust the 3 parameters below.
3. Click "Submit".
Processing may take 30-60 seconds. You can find the download button in the top-right corner of the generated GIF.
""",
    delete_cache=(86400, 86400),
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch(share=False)