banao-tech committed on
Commit
eb63d21
·
verified ·
1 Parent(s): b606086

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -45
app.py CHANGED
@@ -5,11 +5,14 @@ from pathlib import Path
5
  from datetime import datetime
6
  import gradio as gr
7
  from huggingface_hub import snapshot_download
 
 
8
 
9
  ROOT = Path(__file__).parent.resolve()
10
  REPO_DIR = ROOT / "LatentSync"
11
  CKPT_DIR = REPO_DIR / "checkpoints"
12
  TEMP_DIR = REPO_DIR / "temp"
 
13
 
14
  # Use 1.5 on T4 16GB
15
  HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
@@ -18,15 +21,52 @@ def run(cmd, cwd=None):
18
  print(" ".join(map(str, cmd)))
19
  subprocess.check_call(cmd, cwd=cwd)
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def setup():
22
- # Clone LatentSync repo at runtime (won't appear in HF Files tab)
23
  if not REPO_DIR.exists():
24
  run(["git", "clone", "--depth", "1", "https://github.com/bytedance/LatentSync.git", str(REPO_DIR)])
25
 
26
  CKPT_DIR.mkdir(parents=True, exist_ok=True)
27
  TEMP_DIR.mkdir(parents=True, exist_ok=True)
28
 
29
- # Download all checkpoint files (includes latentsync_unet + whisper tiny/small etc)
 
 
 
30
  snapshot_download(
31
  repo_id=HF_CKPT_REPO,
32
  local_dir=str(CKPT_DIR),
@@ -55,53 +95,127 @@ def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
55
  return str(out_path)
56
 
57
  def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
58
- setup()
59
-
60
- img_path = str(Path(avatar_img).resolve())
61
- wav_path = str(Path(audio_wav).resolve())
62
-
63
- # Make a temp mp4 from the single image + audio
64
- video_path = make_still_video(img_path, wav_path, fps=25)
65
-
66
- out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
67
-
68
- # FIXED: Use correct config path - configs/unet/stage2.yaml instead of configs/unet.yaml
69
- cmd = [
70
- "python", "-m", "scripts.inference",
71
- "--unet_config_path", "configs/unet/stage2.yaml", # ← FIXED PATH
72
- "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
73
- "--video_path", video_path,
74
- "--audio_path", wav_path,
75
- "--video_out_path", str(out_path),
76
- "--inference_steps", str(int(steps)),
77
- "--guidance_scale", str(float(guidance)),
78
- "--seed", str(int(seed)),
79
- "--temp_dir", "temp",
80
- ]
81
-
82
- if use_deepcache:
83
- cmd.append("--enable_deepcache")
84
-
85
- run(cmd, cwd=str(REPO_DIR))
86
-
87
- return str(out_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- with gr.Blocks(title="LatentSync (avatar.jpg + audio.wav → lip-sync mp4)") as demo:
90
- gr.Markdown("## LatentSync 1.5 on Hugging Face (T4) Upload avatar + audio → mp4")
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  with gr.Row():
93
- avatar = gr.Image(type="filepath", label="Avatar image (jpg/png)")
94
- audio = gr.Audio(type="filepath", label="Audio (wav)", format="wav")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- with gr.Row():
97
- steps = gr.Slider(10, 40, value=20, step=1, label="Inference Steps")
98
- guidance = gr.Slider(0.8, 2.0, value=1.0, step=0.1, label="Guidance Scale")
99
- seed = gr.Number(value=1247, precision=0, label="Seed")
100
- deepcache = gr.Checkbox(value=True, label="Enable DeepCache (faster)")
101
 
102
- btn = gr.Button("Generate")
103
- out = gr.Video(label="Output video")
104
 
105
- btn.click(generate, inputs=[avatar, audio, steps, guidance, seed, deepcache], outputs=out)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
- demo.launch()
 
 
 
5
  from datetime import datetime
6
  import gradio as gr
7
  from huggingface_hub import snapshot_download
8
+ import numpy as np
9
+ from PIL import Image
10
 
11
  ROOT = Path(__file__).parent.resolve()
12
  REPO_DIR = ROOT / "LatentSync"
13
  CKPT_DIR = REPO_DIR / "checkpoints"
14
  TEMP_DIR = REPO_DIR / "temp"
15
+ MASK_DIR = REPO_DIR / "latentsync" / "utils"
16
 
17
  # Use 1.5 on T4 16GB
18
  HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
 
21
  print(" ".join(map(str, cmd)))
22
  subprocess.check_call(cmd, cwd=cwd)
23
 
24
def create_mask_image():
    """Create ``mask.png`` inside ``MASK_DIR`` unless it already exists.

    The generated file is a 256x256 single-channel (grayscale) image that is
    255 inside an ellipse and 0 everywhere else. The ellipse is centred
    horizontally, sits at 70% of the image height, and has semi-axes of 35%
    of the width and 25% of the height.

    An existing ``mask.png`` is never overwritten.

    NOTE(review): the intent is to supply a mask file for the cloned
    LatentSync code; whether this shape/size matches what LatentSync
    actually expects cannot be confirmed from this file -- verify upstream.
    """
    mask_path = MASK_DIR / "mask.png"
    if mask_path.exists():
        return  # Keep whatever mask is already present.

    MASK_DIR.mkdir(parents=True, exist_ok=True)

    height, width = 256, 256
    center_x, center_y = width // 2, int(height * 0.7)
    radius_x, radius_y = int(width * 0.35), int(height * 0.25)

    # Evaluate the ellipse inequality ((x-cx)/rx)^2 + ((y-cy)/ry)^2 <= 1 for
    # every pixel at once instead of looping over all 65,536 pixels in Python.
    rows, cols = np.ogrid[:height, :width]
    inside = (
        ((cols - center_x) / radius_x) ** 2
        + ((rows - center_y) / radius_y) ** 2
    ) <= 1
    mask = np.where(inside, 255, 0).astype(np.uint8)

    # A 2-D uint8 array is interpreted as 8-bit grayscale, so no explicit
    # mode argument is required.
    Image.fromarray(mask).save(str(mask_path))
    print(f"Created mask image at {mask_path}")
57
+
58
  def setup():
59
+ # Clone LatentSync repo at runtime
60
  if not REPO_DIR.exists():
61
  run(["git", "clone", "--depth", "1", "https://github.com/bytedance/LatentSync.git", str(REPO_DIR)])
62
 
63
  CKPT_DIR.mkdir(parents=True, exist_ok=True)
64
  TEMP_DIR.mkdir(parents=True, exist_ok=True)
65
 
66
+ # Create the missing mask.png file
67
+ create_mask_image()
68
+
69
+ # Download all checkpoint files
70
  snapshot_download(
71
  repo_id=HF_CKPT_REPO,
72
  local_dir=str(CKPT_DIR),
 
95
  return str(out_path)
96
 
97
def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
    """Run LatentSync inference for one avatar image and one audio file.

    Args:
        avatar_img: Filesystem path of the uploaded image, or ``None`` when
            nothing was uploaded.
        audio_wav: Filesystem path of the uploaded audio, or ``None`` when
            nothing was uploaded.
        steps: Number of inference steps; converted with ``int``.
        guidance: Guidance scale; converted with ``float``.
        seed: Random seed; converted with ``int``.
        use_deepcache: When truthy, ``--enable_deepcache`` is appended to the
            inference command.

    Returns:
        A ``(video_path, status_message)`` tuple. ``video_path`` is the output
        file path as a string on success and ``None`` on any failure; the
        message describes the outcome. No exception is propagated to the
        caller.
    """
    # Validate inputs first so a missing upload is reported immediately,
    # without paying for the repository clone / checkpoint download in setup().
    if avatar_img is None:
        return None, "Please upload an avatar image!"
    if audio_wav is None:
        return None, "Please upload an audio file!"

    try:
        setup()

        img_path = str(Path(avatar_img).resolve())
        wav_path = str(Path(audio_wav).resolve())

        # Make a temp mp4 from the single image + audio
        video_path = make_still_video(img_path, wav_path, fps=25)

        out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"

        # Config and checkpoint paths are relative to REPO_DIR, which is the
        # working directory of the subprocess below.
        cmd = [
            "python", "-m", "scripts.inference",
            "--unet_config_path", "configs/unet/stage2.yaml",
            "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
            "--video_path", video_path,
            "--audio_path", wav_path,
            "--video_out_path", str(out_path),
            "--inference_steps", str(int(steps)),
            "--guidance_scale", str(float(guidance)),
            "--seed", str(int(seed)),
            "--temp_dir", "temp",
        ]

        if use_deepcache:
            cmd.append("--enable_deepcache")

        run(cmd, cwd=str(REPO_DIR))

        # A zero exit code is not trusted on its own; confirm the file exists.
        if out_path.exists():
            return str(out_path), "Video generated successfully!"
        return None, "Video generation failed - output file not created"

    except subprocess.CalledProcessError as e:
        return None, f"Command failed with return code {e.returncode}"
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of raising.
        return None, f"Error: {e}"
143
 
144
# Gradio interface: inputs on the left, generation settings on the right,
# then the trigger button, a status box and the resulting video.
with gr.Blocks(title="LatentSync - Lip Sync Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎬 LatentSync 1.5 - AI Lip Sync Generator

        Upload an avatar image and audio file to generate a lip-synced video!

        **Tips:**
        - Use clear frontal face images for best results
        - Keep audio under 30 seconds for faster processing
        - Higher inference steps = better quality but slower
        """
    )

    with gr.Row():
        with gr.Column():
            # Media components do not take an `info=` argument the way form
            # components (Slider, Number, Checkbox, ...) do, so the hints are
            # rendered as Markdown captions instead.
            avatar = gr.Image(
                type="filepath",
                label="📷 Avatar Image",
            )
            gr.Markdown("Upload a clear frontal face photo (JPG/PNG)")
            audio = gr.Audio(
                type="filepath",
                label="🎵 Audio File",
                format="wav",
            )
            gr.Markdown("Upload your audio (WAV format recommended)")

        with gr.Column():
            with gr.Group():
                gr.Markdown("### ⚙️ Generation Settings")
                steps = gr.Slider(
                    10, 40, value=20, step=1,
                    label="Inference Steps",
                    info="Higher = better quality, slower",
                )
                guidance = gr.Slider(
                    0.8, 2.0, value=1.0, step=0.1,
                    label="Guidance Scale",
                    info="Higher = better lip sync, may distort",
                )
                seed = gr.Number(
                    value=1247, precision=0,
                    label="Seed",
                    info="For reproducible results",
                )
                deepcache = gr.Checkbox(
                    value=True,
                    label="Enable DeepCache (Faster)",
                    info="Recommended for T4 GPU",
                )

    btn = gr.Button("🚀 Generate Lip-Synced Video", variant="primary", size="lg")

    status = gr.Textbox(label="Status", interactive=False)
    out = gr.Video(label="Generated Video")

    # generate() returns (video_path, status_message), in that order.
    btn.click(
        generate,
        inputs=[avatar, audio, steps, guidance, seed, deepcache],
        outputs=[out, status],
    )

    gr.Markdown(
        """
        ---
        ### 📝 Notes:
        - First run will download models (~7GB) - this may take a few minutes
        - Generation takes 30-90 seconds depending on settings
        - Works best with T4 GPU (16GB)
        - Based on [LatentSync by ByteDance](https://github.com/bytedance/LatentSync)
        """
    )

if __name__ == "__main__":
    # Cap the number of queued requests at 3, then listen on all interfaces
    # on port 7860.
    demo.queue(max_size=3)
    demo.launch(server_name="0.0.0.0", server_port=7860)