import modal

lipsync_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.11")
    .uv_pip_install(
        [
            "torch",
            "torchvision",
            "xformers",
            "triton",
            "diffusers",
            "transformers",
            "huggingface-hub",
            "imageio==2.27.0",
            "decord==0.6.0",
            "accelerate",
            "einops==0.7.0",
            "omegaconf==2.3.0",
            "safetensors>=0.4.3",
            "opencv-python==4.9.0.80",
            "mediapipe==0.10.11",
            "av==11.0.0",
            "torch-fidelity==0.3.0",
            "torchmetrics==1.3.1",
            "python_speech_features==0.6",
            "librosa==0.10.1",
            "scenedetect==0.6.1",
            "ffmpeg-python==0.2.0",
            "lpips==0.1.4",
            "face-alignment==1.4.1",
            "ninja==1.11.1.1",
            "pandas==2.0.3",
            "numpy<2",
            "pydub==0.25.1",
            "moviepy==1.0.3",
            "hf-xet==1.1.8",
            # hf-transfer is required because HF_HUB_ENABLE_HF_TRANSFER=1 is set
            # below; huggingface_hub raises an error if the env var is set but
            # the package is missing.
            "hf-transfer",
        ]
    )
    .apt_install(
        [
            "libgl1",
            "curl",
            "git",
            "wget",
            "ffmpeg",
        ]
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # fast Hugging Face Hub downloads
    .entrypoint([])  # remove NVIDIA base container entrypoint
    # Note: for `import latentsync` to resolve inside the container, the mount's
    # parent directory must be on sys.path (Modal puts /root on the path, so
    # remote_path="/root/latentsync" is the usual choice).
    .add_local_dir(
        "/home/misha/OpenLipSync/latentsync",
        remote_path="/latentsync",
    )
)

# with lipsync_image.imports():
#     import torch
#     import time

# Create the Modal app
app = modal.App("lipsync-dummy")


@app.function(image=lipsync_image, timeout=300)
def inference(
    video_url="https://huggingface.co/miguelamendez/openlipsync/resolve/main/assets/demo1_video.mp4",
    audio_url="https://huggingface.co/miguelamendez/openlipsync/resolve/main/assets/demo2_audio.wav",
):
    """Dummy lipsync entrypoint: imports the heavy dependencies to verify the
    image builds and loads correctly, but does not run the pipeline yet."""
    # These imports exercise the CUDA / diffusers / LatentSync stack inside the
    # container; video_url and audio_url are placeholders for the real pipeline.
    import torch
    from omegaconf import OmegaConf
    from diffusers import AutoencoderKL, DDIMScheduler
    from accelerate.utils import set_seed

    from latentsync.models.unet import UNet3DConditionModel
    from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
    from latentsync.whisper.audio2feature import Audio2Feature

    return "a test"


@app.local_entrypoint()
def main():
    # Run the function locally (requires the dependencies installed on the host),
    # then remotely inside the Modal container.
    print(inference.local())
    print(inference.remote())
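
# Usage (a sketch, assuming this file is saved as lipsync_app.py; the filename
# is an illustrative assumption, not part of the original script):
#
#   modal run lipsync_app.py
#
# `modal run` builds lipsync_image on first use, invokes main() as the local
# entrypoint, and streams the remote function's output back to the terminal.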