import gc
import os

import numpy as np
import torch
from diffusers.training_utils import set_seed

from extern.depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
from extern.depthcrafter.unet import (
    DiffusersUNetSpatioTemporalConditionModelDepthCrafter,
)

class DepthCrafterDemo:
    def __init__(
        self,
        unet_path: str,
        pre_train_path: str,
        cpu_offload: str = "model",
        device: str = "cuda:0",
    ):
        unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
            unet_path,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
        )
        # load the weights of the other components from the provided checkpoint
        self.pipe = DepthCrafterPipeline.from_pretrained(
            pre_train_path,
            unet=unet,
            torch_dtype=torch.float16,
            variant="fp16",
        )
        # to save memory, offload the model to CPU, or run it sequentially
        # to save even more memory
        if cpu_offload is not None:
            if cpu_offload == "sequential":
                # slower, but saves more memory
                self.pipe.enable_sequential_cpu_offload()
            elif cpu_offload == "model":
                self.pipe.enable_model_cpu_offload()
            else:
                raise ValueError(f"Unknown cpu offload option: {cpu_offload}")
        else:
            self.pipe.to(device)
        # try to enable xformers memory-efficient attention; attention
        # slicing is enabled regardless
        try:
            self.pipe.enable_xformers_memory_efficient_attention()
        except Exception as e:
            print(e)
            print("xformers is not enabled")
        self.pipe.enable_attention_slicing()

    def infer(
        self,
        frames,
        near,
        far,
        num_denoising_steps: int,
        guidance_scale: float,
        window_size: int = 110,
        overlap: int = 25,
        seed: int = 42,
        track_time: bool = True,
    ):
        set_seed(seed)
        # run the DepthCrafter pipeline to estimate the depth maps
        with torch.inference_mode():
            res = self.pipe(
                frames,
                height=frames.shape[1],
                width=frames.shape[2],
                output_type="np",
                guidance_scale=guidance_scale,
                num_inference_steps=num_denoising_steps,
                window_size=window_size,
                overlap=overlap,
                track_time=track_time,
            ).frames[0]
        # average the three-channel output into a single-channel depth map
        res = res.sum(-1) / res.shape[-1]
        # normalize the depth map to [0, 1] across the whole video
        depths = (res - res.min()) / (res.max() - res.min())
        # (optional) visualize the depth sequence
        # vis = vis_sequence_depth(res)
        # (T, H, W) -> (T, 1, H, W), e.g. (49, 576, 1024) -> (49, 1, 576, 1024)
        depths = torch.from_numpy(depths).unsqueeze(1)
        depths *= 3900  # scale to be compatible with the da output range
        depths[depths < 1e-5] = 1e-5  # avoid division by zero
        depths = 10000.0 / depths  # invert the scaled values into depth
        depths = depths.clip(near, far)
        return depths
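

# --- Usage sketch (illustrative, not part of the original file) ---
# A minimal example of driving DepthCrafterDemo end to end. The checkpoint
# IDs, the random frame array, and the near/far values below are assumptions
# for illustration; substitute the paths and data from your own setup.
if __name__ == "__main__":
    demo = DepthCrafterDemo(
        unet_path="tencent/DepthCrafter",  # assumed DepthCrafter UNet checkpoint
        pre_train_path="stabilityai/stable-video-diffusion-img2vid-xt",  # assumed SVD base
        cpu_offload="model",
    )
    # dummy clip: 49 frames of 576x1024 RGB in [0, 1], shape (T, H, W, C)
    frames = np.random.rand(49, 576, 1024, 3).astype(np.float32)
    depths = demo.infer(
        frames,
        near=0.0001,
        far=10000.0,
        num_denoising_steps=5,
        guidance_scale=1.0,
    )
    print(depths.shape)  # (49, 1, 576, 1024)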