File size: 2,357 Bytes
75da08b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88


import modal

# Container image for LatentSync lip-sync inference: CUDA 12.8 devel base with
# Python 3.11, the ML/audio/video dependency stack, and the local `latentsync`
# package mounted at /latentsync inside the container.
lipsync_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.11")
    .uv_pip_install(
        [
            "torch",
            "torchvision",
            "xformers",
            "triton",
            "diffusers",
            "transformers",
            "huggingface-hub",
            "imageio==2.27.0",
            "decord==0.6.0",
            "accelerate",
            "einops==0.7.0",
            "omegaconf==2.3.0",
            "safetensors>=0.4.3",
            "opencv-python==4.9.0.80",
            "mediapipe==0.10.11",
            "av==11.0.0",
            "torch-fidelity==0.3.0",
            "torchmetrics==1.3.1",
            "python_speech_features==0.6",
            "librosa==0.10.1",
            "scenedetect==0.6.1",
            "ffmpeg-python==0.2.0",
            "lpips==0.1.4",
            "face-alignment==1.4.1",
            "ninja==1.11.1.1",
            "pandas==2.0.3",
            "numpy<2",
            "pydub==0.25.1",
            "moviepy==1.0.3",
            "hf-xet==1.1.8",
        ]
    )
    .apt_install([
        "libgl1",   # OpenGL runtime needed by opencv-python
        "curl",
        "git",
        "wget",
        "ffmpeg",
    ])
    .env(
        # Faster HuggingFace Hub downloads.
        # NOTE(review): this flag normally requires the `hf_transfer` package,
        # which is not in the pip list above — confirm it is pulled in
        # transitively (e.g. via hf-xet) or add it explicitly.
        {"HF_HUB_ENABLE_HF_TRANSFER": "1"}
    )
    # Remove the NVIDIA base container entrypoint so Modal controls startup.
    .entrypoint([])
    # BUG FIX: was `.add.add_local_dir(...)` — `modal.Image` has no `.add`
    # attribute, so the original raised AttributeError at import time.
    .add_local_dir(
        "/home/misha/OpenLipSync/latentsync",
        remote_path="/latentsync",
    )
)

# Create the Modal app that hosts the lip-sync functions.
# (Removed dead commented-out `with lipsync_image.imports():` block.)
app = modal.App("lipsync-dummy")

@app.function(
    image=lipsync_image,
    timeout=300,
)
def inference(
    video_url="https://huggingface.co/miguelamendez/openlipsync/resolve/main/assets/demo1_video.mp4",
    audio_url="https://huggingface.co/miguelamendez/openlipsync/resolve/main/assets/demo2_audio.wav",
):
    """Generates a lipsynced video.

    Currently a stub: it only imports the heavy dependencies to smoke-test
    the container image, then returns a placeholder string.

    Args:
        video_url: URL of the source video to lip-sync.
        audio_url: URL of the driving audio track.

    Returns:
        Placeholder string ("a test"); real pipeline output is not wired yet.
    """
    # Imports live inside the function so they resolve inside the container
    # image — these packages are not expected to be installed locally.
    # BUG FIX: removed the duplicate second `import torch`.
    from omegaconf import OmegaConf  # noqa: F401 — environment smoke test
    import torch  # noqa: F401
    from diffusers import AutoencoderKL, DDIMScheduler  # noqa: F401
    from latentsync.models.unet import UNet3DConditionModel  # noqa: F401
    from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline  # noqa: F401
    from accelerate.utils import set_seed  # noqa: F401
    from latentsync.whisper.audio2feature import Audio2Feature  # noqa: F401
    return "a test"


@app.local_entrypoint()
def main():
    """Run the lip-sync stub locally, then remotely, printing each result.

    NOTE(review): `.local()` executes the function body on this machine, so it
    needs the latentsync dependencies installed locally — confirm intended.
    """
    local_result = inference.local()
    print(local_result)
    remote_result = inference.remote()
    print(remote_result)