banao-tech committed on
Commit
2bcec7c
·
verified ·
1 Parent(s): 7f287c5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +184 -0
app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import subprocess
4
+ from pathlib import Path
5
+ from datetime import datetime
6
+
7
+ import gradio as gr
8
+ from huggingface_hub import snapshot_download
9
+
10
+
11
# -----------------------------
# Paths
# -----------------------------
ROOT = Path(__file__).parent.resolve()
REPO_DIR = ROOT / "LatentSync"          # upstream repo, cloned by ensure_repo()
CHECKPOINTS_DIR = REPO_DIR / "checkpoints"  # model weights live here
TEMP_DIR = REPO_DIR / "temp"            # intermediate + output videos

# LatentSync 1.5 checkpoint repo (fits T4 16GB)
HF_CKPT_REPO = "ByteDance/LatentSync-1.5"

# For LatentSync 1.5, config is typically stage2.yaml (256 resolution)
# (LatentSync has multiple configs; stage2_512.yaml is for 1.6 / 512 training)
CONFIG_REL_PATH = Path("configs/unet/stage2.yaml")
CKPT_REL_PATH = Path("checkpoints/latentsync_unet.pt")
28
def run(cmd, cwd=None):
    """Echo *cmd* to stdout, then execute it.

    Args:
        cmd: Sequence of command tokens (str or Path-like).
        cwd: Optional working directory for the child process.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    printable = " ".join(str(token) for token in cmd)
    print("Running:", printable)
    subprocess.check_call(cmd, cwd=cwd)
32
+
33
def ensure_repo():
    """Shallow-clone the upstream LatentSync repository if it is absent."""
    if REPO_DIR.exists():
        return
    run(
        [
            "git",
            "clone",
            "--depth",
            "1",
            "https://github.com/bytedance/LatentSync.git",
            str(REPO_DIR),
        ]
    )
37
+
38
def ensure_checkpoints():
    """Download the LatentSync 1.5 UNet weights and Whisper encoder.

    Fetches `latentsync_unet.pt` and the `whisper/` folder from the
    Hugging Face checkpoint repo into LatentSync/checkpoints, then
    verifies that both expected files actually landed on disk.

    Raises:
        FileNotFoundError: if either expected file is missing after download.
    """
    CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)

    # Download checkpoint + whisper tiny into LatentSync/checkpoints.
    # The HF repo tree includes `latentsync_unet.pt` and `whisper/...`.
    # NOTE: the deprecated `local_dir_use_symlinks` argument was dropped —
    # recent huggingface_hub writes real files to local_dir by default and
    # only warns when the argument is passed.
    snapshot_download(
        repo_id=HF_CKPT_REPO,
        local_dir=str(CHECKPOINTS_DIR),
        allow_patterns=[
            "latentsync_unet.pt",
            "whisper/*",
        ],
    )

    # Fail fast with a clear message if the snapshot was incomplete.
    ckpt = CHECKPOINTS_DIR / "latentsync_unet.pt"
    whisper_tiny = CHECKPOINTS_DIR / "whisper" / "tiny.pt"
    if not ckpt.exists():
        raise FileNotFoundError(f"Missing checkpoint: {ckpt}")
    if not whisper_tiny.exists():
        raise FileNotFoundError(f"Missing whisper tiny: {whisper_tiny}")
60
+
61
def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
    """Turn a single still image plus an audio track into a 256x256 mp4.

    The image is looped for the duration of the audio (`-shortest`),
    center-cropped to a square, and encoded H.264/AAC.

    Args:
        image_path: Path to the avatar image.
        audio_path: Path to the driving audio file.
        fps: Frame rate of the generated video.

    Returns:
        Path (as str) of the written video file inside TEMP_DIR.
    """
    TEMP_DIR.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = TEMP_DIR / f"still_{stamp}.mp4"

    # Scale/crop to a square 256 to match stage2.yaml's typical setting.
    # If switching to LatentSync 1.6 later, crop to 512 and use stage2_512.yaml.
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",
        "-loop",
        "1",
        "-i",
        image_path,
        "-i",
        audio_path,
        "-shortest",
        "-r",
        str(fps),
        "-vf",
        "scale=256:256:force_original_aspect_ratio=increase,crop=256:256",
        "-pix_fmt",
        "yuv420p",
        "-c:v",
        "libx264",
        "-c:a",
        "aac",
        str(out_path),
    ]
    run(ffmpeg_cmd)
    return str(out_path)
84
+
85
def latentsync_infer(video_path: str, audio_path: str, inference_steps: int, guidance_scale: float, seed: int) -> str:
    """Run LatentSync inference on a (video, audio) pair.

    Delegates to LatentSync's own `scripts/inference.py::main`, handing it
    the stage2 OmegaConf config plus a synthetic argparse namespace shaped
    exactly like the upstream CLI.

    Args:
        video_path: Input video whose face will be re-lip-synced.
        audio_path: Driving audio track.
        inference_steps: Diffusion denoising steps.
        guidance_scale: Classifier-free guidance strength.
        seed: RNG seed forwarded to the pipeline.

    Returns:
        Path (as str) of the generated mp4 inside TEMP_DIR.

    Raises:
        FileNotFoundError: if the config or checkpoint file is missing.
    """
    # Import LatentSync inference code.
    # NOTE(review): this prepends to sys.path and chdirs on EVERY call, so
    # repeated invocations keep growing sys.path and the process cwd is
    # permanently changed — acceptable for a single-purpose Space process.
    sys.path.insert(0, str(REPO_DIR))
    os.chdir(str(REPO_DIR))

    from omegaconf import OmegaConf
    from scripts.inference import main
    import argparse

    config_path = (REPO_DIR / CONFIG_REL_PATH).resolve()
    ckpt_path = (REPO_DIR / CKPT_REL_PATH).resolve()

    if not config_path.exists():
        raise FileNotFoundError(f"Config not found: {config_path}")
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

    TEMP_DIR.mkdir(parents=True, exist_ok=True)
    # Timestamped name avoids collisions across requests.
    out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"

    # Override the run-time knobs in the stage2 config with the UI values.
    config = OmegaConf.load(str(config_path))
    config["run"].update(
        {
            "guidance_scale": float(guidance_scale),
            "inference_steps": int(inference_steps),
        }
    )

    # Rebuild the CLI parser that scripts/inference.py declares so `main`
    # receives a namespace with the exact attribute set it expects.
    parser = argparse.ArgumentParser()
    parser.add_argument("--inference_ckpt_path", type=str, required=True)
    parser.add_argument("--video_path", type=str, required=True)
    parser.add_argument("--audio_path", type=str, required=True)
    parser.add_argument("--video_out_path", type=str, required=True)
    parser.add_argument("--inference_steps", type=int, default=20)
    parser.add_argument("--guidance_scale", type=float, default=1.5)
    parser.add_argument("--temp_dir", type=str, default="temp")
    parser.add_argument("--seed", type=int, default=1247)
    parser.add_argument("--enable_deepcache", action="store_true")

    # Parse a synthetic argv; all paths are resolved to absolutes because
    # we chdir'd into REPO_DIR above.
    args = parser.parse_args(
        [
            "--inference_ckpt_path",
            str(ckpt_path),
            "--video_path",
            str(Path(video_path).resolve()),
            "--audio_path",
            str(Path(audio_path).resolve()),
            "--video_out_path",
            str(out_path.resolve()),
            "--inference_steps",
            str(inference_steps),
            "--guidance_scale",
            str(guidance_scale),
            "--seed",
            str(seed),
            "--temp_dir",
            "temp",
            "--enable_deepcache",
        ]
    )

    main(config=config, args=args)
    return str(out_path)
149
+
150
def generate(avatar_img, audio_wav, inference_steps, guidance_scale, seed):
    """Gradio callback: avatar image + audio in, lip-synced mp4 path out."""
    # Lazily bootstrap the repo clone and checkpoint download on first use.
    ensure_repo()
    ensure_checkpoints()

    # Both inputs arrive as filesystem paths (components use type="filepath").
    still_video = make_still_video(avatar_img, audio_wav, fps=25)
    return latentsync_infer(
        still_video, audio_wav, inference_steps, guidance_scale, seed
    )
160
+
161
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks(title="LatentSync (avatar.jpg + audio.wav → lip-sync mp4)") as demo:
    gr.Markdown(
        """
# LatentSync (HF Space)
Upload **avatar.jpg** + **audio.wav** → get lip-sync **mp4**.
(Uses **LatentSync 1.5** to fit **T4 16GB VRAM**.)
"""
    )

    # Inputs: both components hand the callback a filesystem path.
    with gr.Row():
        avatar = gr.Image(label="Avatar Image (jpg/png)", type="filepath")
        audio = gr.Audio(label="Audio (wav)", type="filepath")

    # Sampling knobs forwarded to latentsync_infer via generate().
    with gr.Row():
        guidance = gr.Slider(1.0, 3.0, value=1.5, step=0.1, label="Guidance Scale")
        steps = gr.Slider(10, 50, value=20, step=1, label="Inference Steps")
        seed = gr.Number(value=1247, precision=0, label="Seed")

    btn = gr.Button("Generate Lip-Sync Video")
    out = gr.Video(label="Output MP4")

    # Input order must match generate()'s parameter order:
    # (avatar_img, audio_wav, inference_steps, guidance_scale, seed).
    btn.click(fn=generate, inputs=[avatar, audio, steps, guidance, seed], outputs=out)

demo.launch()