banao-tech committed on
Commit
4c48c35
·
verified ·
1 Parent(s): 1872e4c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -80
app.py CHANGED
@@ -5,14 +5,11 @@ from pathlib import Path
5
  from datetime import datetime
6
  import gradio as gr
7
  from huggingface_hub import snapshot_download
8
- import numpy as np
9
- from PIL import Image
10
 
11
  ROOT = Path(__file__).parent.resolve()
12
  REPO_DIR = ROOT / "LatentSync"
13
  CKPT_DIR = REPO_DIR / "checkpoints"
14
  TEMP_DIR = REPO_DIR / "temp"
15
- MASK_DIR = REPO_DIR / "latentsync" / "utils"
16
 
17
  # Use 1.5 on T4 16GB
18
  HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
@@ -21,63 +18,58 @@ def run(cmd, cwd=None):
21
  print(" ".join(map(str, cmd)))
22
  subprocess.check_call(cmd, cwd=cwd)
23
 
24
- def create_mask_image():
25
- """
26
- Create the missing mask.png file that LatentSync expects.
27
- This creates a circular mask for the mouth region (lower half of face).
28
- """
29
- mask_path = MASK_DIR / "mask.png"
30
- if mask_path.exists():
31
- return # Mask already exists
32
-
33
- # Create the utils directory if it doesn't exist
34
- MASK_DIR.mkdir(parents=True, exist_ok=True)
35
-
36
- # Create a 256x256 mask image
37
- # White (255) = area to be inpainted (mouth region)
38
- # Black (0) = area to keep unchanged
39
- height, width = 256, 256
40
- mask = np.zeros((height, width), dtype=np.uint8)
41
 
42
- # Create an elliptical mask for the lower face/mouth region
43
- # This covers approximately the bottom third of the face
44
- center_x, center_y = width // 2, int(height * 0.7)
45
- radius_x, radius_y = int(width * 0.35), int(height * 0.25)
46
 
47
- for y in range(height):
48
- for x in range(width):
49
- # Ellipse equation: ((x-cx)/rx)^2 + ((y-cy)/ry)^2 <= 1
50
- if ((x - center_x) / radius_x) ** 2 + ((y - center_y) / radius_y) ** 2 <= 1:
51
- mask[y, x] = 255
52
 
53
- # Save the mask
54
- mask_img = Image.fromarray(mask, mode='L')
55
- mask_img.save(str(mask_path))
56
- print(f"Created mask image at {mask_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  def setup():
59
- # Clone LatentSync repo at runtime
60
  if not REPO_DIR.exists():
 
61
  run(["git", "clone", "--depth", "1", "https://github.com/bytedance/LatentSync.git", str(REPO_DIR)])
62
 
63
  CKPT_DIR.mkdir(parents=True, exist_ok=True)
64
  TEMP_DIR.mkdir(parents=True, exist_ok=True)
65
 
66
- # Create the missing mask.png file
67
- create_mask_image()
68
 
69
- # Download all checkpoint files
 
70
  snapshot_download(
71
  repo_id=HF_CKPT_REPO,
72
  local_dir=str(CKPT_DIR),
73
  local_dir_use_symlinks=False,
74
  )
 
75
 
76
  def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
77
- """
78
- Create a video by looping the avatar image for the length of the audio.
79
- LatentSync expects a VIDEO input.
80
- """
81
  out_path = TEMP_DIR / f"still_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
82
  cmd = [
83
  "ffmpeg", "-y",
@@ -99,19 +91,20 @@ def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
99
  setup()
100
 
101
  if avatar_img is None:
102
- return None, "Please upload an avatar image!"
103
  if audio_wav is None:
104
- return None, "Please upload an audio file!"
105
 
106
  img_path = str(Path(avatar_img).resolve())
107
  wav_path = str(Path(audio_wav).resolve())
108
 
109
- # Make a temp mp4 from the single image + audio
 
110
  video_path = make_still_video(img_path, wav_path, fps=25)
111
 
112
  out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
113
 
114
- # Use correct config path for LatentSync 1.5
115
  cmd = [
116
  "python", "-m", "scripts.inference",
117
  "--unet_config_path", "configs/unet/stage2.yaml",
@@ -128,21 +121,22 @@ def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
128
  if use_deepcache:
129
  cmd.append("--enable_deepcache")
130
 
 
131
  run(cmd, cwd=str(REPO_DIR))
132
 
133
  if out_path.exists():
134
- return str(out_path), "Video generated successfully!"
135
  else:
136
- return None, "Video generation failed - output file not created"
137
 
138
  except subprocess.CalledProcessError as e:
139
- error_msg = f"Command failed with return code {e.returncode}"
140
  return None, error_msg
141
  except Exception as e:
142
- return None, f"Error: {str(e)}"
143
 
144
- # Gradio Interface
145
- with gr.Blocks(title="LatentSync - Lip Sync Generator", theme=gr.themes.Soft()) as demo:
146
  gr.Markdown(
147
  """
148
  # 🎬 LatentSync 1.5 - AI Lip Sync Generator
@@ -160,41 +154,33 @@ with gr.Blocks(title="LatentSync - Lip Sync Generator", theme=gr.themes.Soft())
160
  with gr.Column():
161
  avatar = gr.Image(
162
  type="filepath",
163
- label="πŸ“· Avatar Image",
164
- info="Upload a clear frontal face photo (JPG/PNG)"
165
  )
166
  audio = gr.Audio(
167
  type="filepath",
168
- label="🎡 Audio File",
169
- format="wav",
170
- info="Upload your audio (WAV format recommended)"
171
  )
172
 
173
  with gr.Column():
174
- with gr.Group():
175
- gr.Markdown("### βš™οΈ Generation Settings")
176
- steps = gr.Slider(
177
- 10, 40, value=20, step=1,
178
- label="Inference Steps",
179
- info="Higher = better quality, slower"
180
- )
181
- guidance = gr.Slider(
182
- 0.8, 2.0, value=1.0, step=0.1,
183
- label="Guidance Scale",
184
- info="Higher = better lip sync, may distort"
185
- )
186
- seed = gr.Number(
187
- value=1247, precision=0,
188
- label="Seed",
189
- info="For reproducible results"
190
- )
191
- deepcache = gr.Checkbox(
192
- value=True,
193
- label="Enable DeepCache (Faster)",
194
- info="Recommended for T4 GPU"
195
- )
196
 
197
- btn = gr.Button("πŸš€ Generate Lip-Synced Video", variant="primary", size="lg")
198
 
199
  status = gr.Textbox(label="Status", interactive=False)
200
  out = gr.Video(label="Generated Video")
@@ -218,4 +204,4 @@ with gr.Blocks(title="LatentSync - Lip Sync Generator", theme=gr.themes.Soft())
218
 
219
  if __name__ == "__main__":
220
  demo.queue(max_size=3)
221
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
5
  from datetime import datetime
6
  import gradio as gr
7
  from huggingface_hub import snapshot_download
 
 
8
 
9
  ROOT = Path(__file__).parent.resolve()
10
  REPO_DIR = ROOT / "LatentSync"
11
  CKPT_DIR = REPO_DIR / "checkpoints"
12
  TEMP_DIR = REPO_DIR / "temp"
 
13
 
14
  # Use 1.5 on T4 16GB
15
  HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
 
18
  print(" ".join(map(str, cmd)))
19
  subprocess.check_call(cmd, cwd=cwd)
20
 
21
def create_mask_file():
    """Create the mask.png file that LatentSync's inference pipeline expects.

    The mask is a 256x256 grayscale image: white (255) marks the elliptical
    lower-face/mouth region to be inpainted, black (0) marks pixels to keep
    unchanged. Written to <REPO_DIR>/latentsync/utils/mask.png; no-op if the
    file already exists. Best-effort: failures are printed as warnings rather
    than raised, so setup can continue if the repo already ships a mask.
    """
    mask_dir = REPO_DIR / "latentsync" / "utils"
    mask_path = mask_dir / "mask.png"

    if mask_path.exists():
        return

    mask_dir.mkdir(parents=True, exist_ok=True)

    # Create mask using numpy and PIL
    try:
        import numpy as np
        from PIL import Image

        # Vectorized ellipse rasterization — replaces the original pure-Python
        # double loop over all 256*256 pixels with one numpy expression that
        # yields a byte-identical mask (white = inpaint mouth area, black = keep).
        center_x, center_y = 128, 180  # ellipse center in the lower half of the face
        yy, xx = np.ogrid[:256, :256]
        # Ellipse: ((x-cx)/rx)^2 + ((y-cy)/ry)^2 <= 1, with rx=90, ry=64
        inside = ((xx - center_x) / 90) ** 2 + ((yy - center_y) / 64) ** 2 <= 1
        mask = np.where(inside, 255, 0).astype(np.uint8)

        Image.fromarray(mask, mode='L').save(str(mask_path))
        print(f"βœ“ Created mask at {mask_path}")
    except Exception as e:
        print(f"Warning: Could not create mask: {e}")
50
 
51
def setup():
    """One-time environment preparation.

    Clones the LatentSync repository if it is not present, ensures the
    checkpoint and temp directories exist, writes the mask file the
    pipeline needs, and downloads the model weights from the Hub.
    """
    if not REPO_DIR.exists():
        print("Cloning LatentSync repository...")
        run([
            "git", "clone", "--depth", "1",
            "https://github.com/bytedance/LatentSync.git",
            str(REPO_DIR),
        ])

    # Work directories must exist before anything writes into them.
    for directory in (CKPT_DIR, TEMP_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # Create mask file before running inference
    create_mask_file()

    # Download checkpoints
    print("Downloading model checkpoints...")
    snapshot_download(
        repo_id=HF_CKPT_REPO,
        local_dir=str(CKPT_DIR),
        local_dir_use_symlinks=False,
    )
    print("βœ“ Setup complete")
70
 
71
  def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
72
+ """Convert static image + audio to video"""
 
 
 
73
  out_path = TEMP_DIR / f"still_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
74
  cmd = [
75
  "ffmpeg", "-y",
 
91
  setup()
92
 
93
  if avatar_img is None:
94
+ return None, "❌ Please upload an avatar image!"
95
  if audio_wav is None:
96
+ return None, "❌ Please upload an audio file!"
97
 
98
  img_path = str(Path(avatar_img).resolve())
99
  wav_path = str(Path(audio_wav).resolve())
100
 
101
+ # Create video from image + audio
102
+ print("Creating input video...")
103
  video_path = make_still_video(img_path, wav_path, fps=25)
104
 
105
  out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
106
 
107
+ # Fixed config path for LatentSync 1.5
108
  cmd = [
109
  "python", "-m", "scripts.inference",
110
  "--unet_config_path", "configs/unet/stage2.yaml",
 
121
  if use_deepcache:
122
  cmd.append("--enable_deepcache")
123
 
124
+ print("Generating lip-synced video...")
125
  run(cmd, cwd=str(REPO_DIR))
126
 
127
  if out_path.exists():
128
+ return str(out_path), "βœ… Video generated successfully!"
129
  else:
130
+ return None, "❌ Video generation failed - output file not created"
131
 
132
  except subprocess.CalledProcessError as e:
133
+ error_msg = f"❌ Command failed with return code {e.returncode}"
134
  return None, error_msg
135
  except Exception as e:
136
+ return None, f"❌ Error: {str(e)}"
137
 
138
+ # Gradio Interface - Compatible with Gradio 4.44.1
139
+ with gr.Blocks(title="LatentSync Lip Sync") as demo:
140
  gr.Markdown(
141
  """
142
  # 🎬 LatentSync 1.5 - AI Lip Sync Generator
 
154
  with gr.Column():
155
  avatar = gr.Image(
156
  type="filepath",
157
+ label="πŸ“· Avatar Image (JPG/PNG)"
 
158
  )
159
  audio = gr.Audio(
160
  type="filepath",
161
+ label="🎡 Audio File (WAV)"
 
 
162
  )
163
 
164
  with gr.Column():
165
+ gr.Markdown("### βš™οΈ Generation Settings")
166
+ steps = gr.Slider(
167
+ 10, 40, value=20, step=1,
168
+ label="Inference Steps (Higher = Better Quality)"
169
+ )
170
+ guidance = gr.Slider(
171
+ 0.8, 2.0, value=1.0, step=0.1,
172
+ label="Guidance Scale (Higher = Stronger Lip Sync)"
173
+ )
174
+ seed = gr.Number(
175
+ value=1247, precision=0,
176
+ label="Seed (For Reproducibility)"
177
+ )
178
+ deepcache = gr.Checkbox(
179
+ value=True,
180
+ label="Enable DeepCache (Faster - Recommended for T4)"
181
+ )
 
 
 
 
 
182
 
183
+ btn = gr.Button("πŸš€ Generate Lip-Synced Video", variant="primary")
184
 
185
  status = gr.Textbox(label="Status", interactive=False)
186
  out = gr.Video(label="Generated Video")
 
204
 
205
  if __name__ == "__main__":
206
  demo.queue(max_size=3)
207
+ demo.launch()