"""
This script runs inference of LatentSync using Modal.
To run you must first install modal.
Then you should run the download of the 
"""

import modal

# Shared volumes: one for project data/checkpoints, one for the Hugging Face hub cache
volume = modal.Volume.from_name("openlipsync-volume", create_if_missing=True)
model_volume = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)
MODEL_PATH = "/models"  # where the model volume will appear on our Functions' filesystems

# LatentSync inference image
lipsync_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.11")
    .uv_pip_install(
        [
            "fastapi[standard]",
            "torch",
            "torchvision",
            "xformers",
            "triton",
            "diffusers",
            "transformers",
            "huggingface-hub",
            "imageio==2.27.0",
            "decord==0.6.0",
            "accelerate",
            "einops==0.7.0",
            "omegaconf==2.3.0",
            "safetensors>=0.4.3",
            "opencv-python==4.9.0.80",
            "mediapipe==0.10.11",
            "av==11.0.0",
            "torch-fidelity==0.3.0",
            "torchmetrics==1.3.1",
            "python_speech_features==0.6",
            "librosa==0.10.1",
            "scenedetect==0.6.1",
            "ffmpeg-python==0.2.0",
            "lpips==0.1.4",
            "face-alignment==1.4.1",
            "ninja==1.11.1.1",
            "pandas==2.0.3",
            "numpy<2",
            "pydub==0.25.1",
            "moviepy==1.0.3",
            "hf-xet==1.1.8"
        ]
    )
    .apt_install([
        "libgl1",
        "curl",
        "git",
        "wget",
        "ffmpeg",
    ])
    .env(
        {
            "HF_HUB_ENABLE_HF_TRANSFER": "1",  # faster downloads
            "HF_HUB_CACHE": MODEL_PATH,
        }
    )
    .run_commands(
        # Symlink the auxiliary checkpoints from the mounted volume into the
        # torch hub cache. The targets only exist once the volume is mounted at
        # runtime, so these are intentionally dangling symlinks at build time.
        "mkdir -p ~/.cache/torch/hub/checkpoints",
        "ln -s /data/data/checkpoints/auxiliary/2DFAN4-cd938726ad.zip ~/.cache/torch/hub/checkpoints/2DFAN4-cd938726ad.zip",
        "ln -s /data/data/checkpoints/auxiliary/vgg16-397923af.pth ~/.cache/torch/hub/checkpoints/vgg16-397923af.pth",
        "ln -s /data/data/checkpoints/auxiliary/s3fd-619a316812.pth ~/.cache/torch/hub/checkpoints/s3fd-619a316812.pth",
    )
    .entrypoint([])  # remove the NVIDIA base container's entrypoint
    .add_local_python_source("latentsync")
)
with lipsync_image.imports():
    import time  # resolved inside the container; harmless locally since time is stdlib


# Create the Modal app
app = modal.App("lipsync-dummy", image=lipsync_image)


@app.function(
    image=lipsync_image,
    # gpu="A100",
    volumes={"/data": volume, MODEL_PATH: model_volume},
    timeout=300,
)
def volume_search(some_path="/data"):
    """Recursively lists the files under the mounted volume path."""
    import os
    print("Files in volume:")
    def list_directory(path):
        try:
            for item in os.listdir(path):
                item_path = os.path.join(path, item)
                abs_path = os.path.abspath(item_path)
                if os.path.isdir(item_path):
                    print(f"  {abs_path}/")
                    list_directory(item_path)
                else:
                    print(f"  {abs_path}")
        except Exception as e:
            print(f"Error accessing {path}: {e}")
    # List files in the volume
    list_directory(some_path)
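
# A usage note: volume_search can also be invoked on its own from the CLI
# (again assuming this file is named latentsync_modal.py):
#   modal run latentsync_modal.py::volume_search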

@app.function(
    image=lipsync_image,
    gpu="H100",
    volumes={"/data": volume, MODEL_PATH: model_volume},
    timeout=300,
)
def inference(
    video_uri,
    audio_uri,
    unet_ckpt_path="./checkpoints/latentsync/latentsync_unet.pt",
    vae_path="./checkpoints/sd-vae-ft-mse",
    unet_config_path="configs/unet/second_stage.yaml",
    scheduler_path="configs/scheduler_config.json",
    whisper_model_path="./checkpoints/whisper",
    guidance_scale=1.0,
    seed=1247,
):
    """Generates a lipsynced video"""
    from omegaconf import OmegaConf
    import torch
    import time
    from diffusers import AutoencoderKL, DDIMScheduler
    from latentsync.models.unet import UNet3DConditionModel
    from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
    from accelerate.utils import set_seed
    from latentsync.whisper.audio2feature import Audio2Feature
    import torch
    import requests
    from PIL import Image
    import io
    # Download the source video and audio files
    video_response = requests.get(video_uri)
    video_response.raise_for_status()
    audio_response = requests.get(audio_uri)
    audio_response.raise_for_status()
    # Save them to temporary local files for the pipeline
    video_path = "./temp_video.mp4"
    audio_path = "./temp_audio.wav"
    with open(video_path, "wb") as video_file:
        video_file.write(video_response.content)
    with open(audio_path, "wb") as audio_file:
        audio_file.write(audio_response.content)
    video_out_path = "./outvideo.mp4"
    config = OmegaConf.load(unet_config_path)
    scheduler = DDIMScheduler.from_pretrained(scheduler_path)
    # Choose the Whisper checkpoint that matches the UNet's cross-attention width
    if config.model.cross_attention_dim == 768:
        whisper_model_path = whisper_model_path + "/small.pt"
    elif config.model.cross_attention_dim == 384:
        whisper_model_path = whisper_model_path + "/tiny.pt"
    else:
        raise NotImplementedError("cross_attention_dim must be 768 or 384")
    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)
    vae = AutoencoderKL.from_pretrained(vae_path, torch_dtype=torch.float16)
    vae.config.scaling_factor = 0.18215
    vae.config.shift_factor = 0
    unet, _ = UNet3DConditionModel.from_pretrained(
        OmegaConf.to_container(config.model),
        unet_ckpt_path,  # load checkpoint
        device="cpu",
    )

    unet = unet.to(dtype=torch.float16)

    pipeline = LipsyncPipeline(
        vae=vae,
        audio_encoder=audio_encoder,
        unet=unet,
        scheduler=scheduler,
    ).to("cuda")

    if seed != -1:
        set_seed(seed)
    else:
        torch.seed()
    print(f"Initial seed: {torch.initial_seed()}")
    # Start timing
    start_time = time.time()
    pipeline(
        video_path=video_path,
        audio_path=audio_path,
        video_out_path=video_out_path,
        video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
        num_frames=config.data.num_frames,
        num_inference_steps=config.run.inference_steps,
        guidance_scale=guidance_scale,
        weight_dtype=torch.float16,
        width=config.data.resolution,
        height=config.data.resolution,
    )
    # Calculate execution time
    end_time = time.time()
    execution_time = end_time - start_time
    # Read the processed video as bytes and return it
    with open(video_out_path, "rb") as video_file:
        video_bytes = video_file.read()
    return video_bytes, execution_time

@app.local_entrypoint()
def main():
    # Example video and audio URIs (replace with actual URLs)
    video_uri = "https://huggingface.co/miguelamendez/openlipsync/resolve/main/assets/demo3_video.mp4"
    audio_uri = "https://huggingface.co/miguelamendez/openlipsync/resolve/main/assets/demo1_audio.wav"
    # Uncomment to inspect the volume contents first:
    # print(volume_search.remote())
    print("Local inference")
    try:
        video_bytes, exec_time = inference.local(
            video_uri=video_uri,
            audio_uri=audio_uri,
            unet_ckpt_path="./checkpoints/latentsync/latentsync_unet.pt",
            vae_path="./checkpoints/sd-vae-ft-mse",
            unet_config_path="./configs/unet/second_stage.yaml",
            whisper_model_path="./checkpoints/whisper",
            scheduler_path="./configs/scheduler_config.json",
            guidance_scale=1.0,
            seed=1247
        )
        print(f"Inference time:{exec_time}")
        # Save the video bytes to a file in the current path
        output_filename = "local_video.mp4"
        with open(output_filename, "wb") as output_file:
            output_file.write(video_bytes)
        print(f"Video saved successfully as {output_filename}")
    except Exception as e:
        print(f"Error during inference: {e}")
    """
    print("remote inference")
    try:
        video_bytes,exec_time = inference.remote(
            video_uri=video_uri,
            audio_uri=audio_uri,
            unet_ckpt_path="/data/data/checkpoints/latentsync/latentsync_unet.pt",
            vae_path="/data/data/checkpoints/sd-vae-ft-mse",
            unet_config_path="/data/data/configs/unet/second_stage.yaml",
            whisper_model_path="/data/data/checkpoints/whisper",
            scheduler_path="/data/data/configs/scheduler_config.json",
            guidance_scale=1.0,
            seed=1247
        )
        print(f"Inference time:{exec_time}")
        # Save the video bytes to a file in the current path
        output_filename = "remote_video.mp4"
        with open(output_filename, "wb") as output_file:
            output_file.write(video_bytes)
        print(f"Video saved successfully as {output_filename}")
    except Exception as e:
        print(f"Error during inference: {e}")
    """