""" This script runs inference of LatentSync using Modal. To run you must first install modal. Then you should run the download of the """ import modal #Shared volume with models volume = modal.Volume.from_name("openlipsync-volume", create_if_missing=True) model_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True) MODEL_PATH = "/models" # where the Volume will appear on our Functions' filesystems #Lipsync image lipsync_image = ( modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.11") .uv_pip_install( [ "fastapi[standard]", "torch", "torchvision", "xformers", "triton", "diffusers", "transformers", "huggingface-hub", "imageio==2.27.0", "decord==0.6.0", "accelerate", "einops==0.7.0", "omegaconf==2.3.0", "safetensors>=0.4.3", "opencv-python==4.9.0.80", "mediapipe==0.10.11", "av==11.0.0", "torch-fidelity==0.3.0", "torchmetrics==1.3.1", "python_speech_features==0.6", "librosa==0.10.1", "scenedetect==0.6.1", "ffmpeg-python==0.2.0", "lpips==0.1.4", "face-alignment==1.4.1", "ninja==1.11.1.1", "pandas==2.0.3", "numpy<2", "pydub==0.25.1", "moviepy==1.0.3", "hf-xet==1.1.8" ] ) .apt_install([ "libgl1", "curl", "git", "wget", "ffmpeg", ]) .env( { "HF_HUB_ENABLE_HF_TRANSFER": "1", # faster downloads "HF_HUB_CACHE": MODEL_PATH } ) .run_commands( "mkdir -p ~/.cache/torch/hub/checkpoints" ) .run_commands( "ln -s /data/data/checkpoints/auxiliary/2DFAN4-cd938726ad.zip ~/.cache/torch/hub/checkpoints/2DFAN4-cd938726ad.zip" ) .run_commands( "ln -s /data/data/checkpoints/auxiliary/vgg16-397923af.pth ~/.cache/torch/hub/checkpoints/vgg16-397923af.pth" ) .run_commands( "ln -s /data/data/checkpoints/auxiliary/s3fd-619a316812.pth ~/.cache/torch/hub/checkpoints/s3fd-619a316812.pth" ) .entrypoint([] ) .add_local_python_source("latentsync")# remove NVIDIA base container entrypoint ) with lipsync_image.imports(): import time # Create the Modal app app = modal.App("lipsync-dummy",image=lipsync_image) @app.function( image=lipsync_image, #gpu="A100", volumes={"/data": volume,MODEL_PATH:model_volume}, timeout=300 ) def volume_search(some_path="/data"): """Generates a lipsynced video""" import os print("Files in volume:") def list_directory(path): try: for item in os.listdir(path): item_path = os.path.join(path, item) abs_path = os.path.abspath(item_path) if os.path.isdir(item_path): print(f" {abs_path}/") list_directory(item_path) else: print(f" {abs_path}") except Exception as e: print(f"Error accessing {path}: {e}") # List files in the volume list_directory(some_path) @app.function( image=lipsync_image, gpu="H100", volumes={"/data": volume,MODEL_PATH:model_volume}, timeout=300 ) def inference(video_uri, audio_uri, unet_ckpt_path="./checkpoints/latentsync/latentsync_unet.pt", vae_path="./checkpoints/sd-vae-ft-mse", unet_config_path="configs/unet/second_stage.yaml", scheduler_path="configs/scheduler_config.json",whisper_model_path="./checkpoints/whisper",guidance_scale=1.0, seed=1247): """Generates a lipsynced video""" from omegaconf import OmegaConf import torch import time from diffusers import AutoencoderKL, DDIMScheduler from latentsync.models.unet import UNet3DConditionModel from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline from accelerate.utils import set_seed from latentsync.whisper.audio2feature import Audio2Feature import torch import requests from PIL import Image import io # Download video and audio files video_response = requests.get(video_uri) audio_response = requests.get(audio_uri) # Save video and audio files video_path = "./temp_video.mp4" 
audio_path = "./temp_audio.wav" with open(video_path, "wb") as video_file: video_file.write(video_response.content) with open(audio_path, "wb") as audio_file: audio_file.write(audio_response.content) video_out_path = "./outvideo.mp4" config = OmegaConf.load(unet_config_path) scheduler = DDIMScheduler.from_pretrained(scheduler_path) if config.model.cross_attention_dim == 768: whisper_model_path = whisper_model_path+"/small.pt" elif config.model.cross_attention_dim == 384: whisper_model_path = whisper_model_path+"/tiny.pt" else: raise NotImplementedError("cross_attention_dim must be 768 or 384") audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames) vae = AutoencoderKL.from_pretrained(vae_path, torch_dtype=torch.float16) vae.config.scaling_factor = 0.18215 vae.config.shift_factor = 0 unet, _ = UNet3DConditionModel.from_pretrained( OmegaConf.to_container(config.model), unet_ckpt_path, # load checkpoint device="cpu", ) unet = unet.to(dtype=torch.float16) pipeline = LipsyncPipeline( vae=vae, audio_encoder=audio_encoder, unet=unet, scheduler=scheduler, ).to("cuda") if seed != -1: set_seed(seed) else: torch.seed() print(f"Initial seed: {torch.initial_seed()}") # Start timing start_time = time.time() pipeline( video_path=video_path, audio_path=audio_path, video_out_path=video_out_path, video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"), num_frames=config.data.num_frames, num_inference_steps=config.run.inference_steps, guidance_scale=guidance_scale, weight_dtype=torch.float16, width=config.data.resolution, height=config.data.resolution, ) # Calculate execution time end_time = time.time() execution_time = end_time - start_time # Read the processed video as bytes and return it with open(video_out_path, "rb") as video_file: video_bytes = video_file.read() return video_bytes, execution_time @app.local_entrypoint() def main(): #run the function locally # Example video and audio URIs (replace with actual URLs) video_uri = "https://huggingface.co/miguelamendez/openlipsync/resolve/main/assets/demo3_video.mp4" audio_uri = "https://huggingface.co/miguelamendez/openlipsync/resolve/main/assets/demo1_audio.wav" # Call the inference function #print(volume_search.remote()) print("Local inference") try: video_bytes,exec_time = inference.local( video_uri=video_uri, audio_uri=audio_uri, unet_ckpt_path="./checkpoints/latentsync/latentsync_unet.pt", vae_path="./checkpoints/sd-vae-ft-mse", unet_config_path="./configs/unet/second_stage.yaml", whisper_model_path="./checkpoints/whisper", scheduler_path="./configs/scheduler_config.json", guidance_scale=1.0, seed=1247 ) print(f"Inference time:{exec_time}") # Save the video bytes to a file in the current path output_filename = "local_video.mp4" with open(output_filename, "wb") as output_file: output_file.write(video_bytes) print(f"Video saved successfully as {output_filename}") except Exception as e: print(f"Error during inference: {e}") """ print("remote inference") try: video_bytes,exec_time = inference.remote( video_uri=video_uri, audio_uri=audio_uri, unet_ckpt_path="/data/data/checkpoints/latentsync/latentsync_unet.pt", vae_path="/data/data/checkpoints/sd-vae-ft-mse", unet_config_path="/data/data/configs/unet/second_stage.yaml", whisper_model_path="/data/data/checkpoints/whisper", scheduler_path="/data/data/configs/scheduler_config.json", guidance_scale=1.0, seed=1247 ) print(f"Inference time:{exec_time}") # Save the video bytes to a file in the current path output_filename = "remote_video.mp4" with 
@app.local_entrypoint()
def main():
    # Run the function locally.
    # Example video and audio URIs hosted on Hugging Face (replace with your own)
    video_uri = "https://huggingface.co/miguelamendez/openlipsync/resolve/main/assets/demo3_video.mp4"
    audio_uri = "https://huggingface.co/miguelamendez/openlipsync/resolve/main/assets/demo1_audio.wav"

    # Call the inference function
    # print(volume_search.remote())
    print("Local inference")
    try:
        video_bytes, exec_time = inference.local(
            video_uri=video_uri,
            audio_uri=audio_uri,
            unet_ckpt_path="./checkpoints/latentsync/latentsync_unet.pt",
            vae_path="./checkpoints/sd-vae-ft-mse",
            unet_config_path="./configs/unet/second_stage.yaml",
            whisper_model_path="./checkpoints/whisper",
            scheduler_path="./configs/scheduler_config.json",
            guidance_scale=1.0,
            seed=1247,
        )
        print(f"Inference time: {exec_time}")

        # Save the video bytes to a file in the current path
        output_filename = "local_video.mp4"
        with open(output_filename, "wb") as output_file:
            output_file.write(video_bytes)
        print(f"Video saved successfully as {output_filename}")
    except Exception as e:
        print(f"Error during inference: {e}")

    # The remote variant below is kept commented out; it expects the checkpoints
    # and configs to live on the Modal volume under /data/data.
    """
    print("remote inference")
    try:
        video_bytes, exec_time = inference.remote(
            video_uri=video_uri,
            audio_uri=audio_uri,
            unet_ckpt_path="/data/data/checkpoints/latentsync/latentsync_unet.pt",
            vae_path="/data/data/checkpoints/sd-vae-ft-mse",
            unet_config_path="/data/data/configs/unet/second_stage.yaml",
            whisper_model_path="/data/data/checkpoints/whisper",
            scheduler_path="/data/data/configs/scheduler_config.json",
            guidance_scale=1.0,
            seed=1247,
        )
        print(f"Inference time: {exec_time}")

        # Save the video bytes to a file in the current path
        output_filename = "remote_video.mp4"
        with open(output_filename, "wb") as output_file:
            output_file.write(video_bytes)
        print(f"Video saved successfully as {output_filename}")
    except Exception as e:
        print(f"Error during inference: {e}")
    """
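

# --- Hedged sketch (not in the original script): serving over HTTP ---
# The image installs fastapi[standard] but the script never uses it. One way to
# expose inference as a web endpoint is Modal's fastapi_endpoint decorator. The
# endpoint name and the choice of inference.local() (to reuse this endpoint's
# own GPU container rather than spawn a second function call) are illustrative
# assumptions; the checkpoint paths mirror the commented-out remote block above.
@app.function(
    image=lipsync_image,
    gpu="H100",
    volumes={"/data": volume, MODEL_PATH: model_volume},
    timeout=300,
)
@modal.fastapi_endpoint(method="POST")
def lipsync_endpoint(video_uri: str, audio_uri: str):
    from fastapi import Response

    # Run inference in-process (the volume paths resolve inside this container)
    video_bytes, _ = inference.local(
        video_uri=video_uri,
        audio_uri=audio_uri,
        unet_ckpt_path="/data/data/checkpoints/latentsync/latentsync_unet.pt",
        vae_path="/data/data/checkpoints/sd-vae-ft-mse",
        unet_config_path="/data/data/configs/unet/second_stage.yaml",
        whisper_model_path="/data/data/checkpoints/whisper",
        scheduler_path="/data/data/configs/scheduler_config.json",
    )
    # Return the rendered video directly as the HTTP response body
    return Response(content=video_bytes, media_type="video/mp4")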