fffiloni committed on
Commit
b4242c7
·
verified ·
1 Parent(s): 8c1f2d8

Move docstring MCP explanation to the right function

Browse files
Files changed (1) hide show
  1. webgui.py +34 -33
webgui.py CHANGED
@@ -165,39 +165,7 @@ def select_face(det_bboxes, probs):
165
  sorted_bboxes = sorted(filtered_bboxes, key=lambda x:(x[3]-x[1]) * (x[2] - x[0]), reverse=True)
166
  return sorted_bboxes[0]
167
 
168
- def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
169
- """
170
- Generate a realistic lip-synced talking head video from a static reference image and a voice audio file.
171
-
172
- This function takes an image of a face and an audio clip, then generates a video where the face in the image is animated to match the speech in the audio. It uses EchoMimic's pipeline with configurable parameters for generation quality, length, and face conditioning.
173
-
174
- Args:
175
- uploaded_img (str): Path to the input reference image. This should be a front-facing, clear image of a person's face.
176
- uploaded_audio (str): Path to the WAV audio file to drive the animation. Speech audio works best.
177
- width (int): Target width of the generated video frame.
178
- height (int): Target height of the generated video frame.
179
- length (int): Number of frames in the final output video.
180
- seed (int): Random seed for reproducibility. If -1, a random seed is chosen.
181
- facemask_dilation_ratio (float): Dilation ratio for expanding the face mask region.
182
- facecrop_dilation_ratio (float): Dilation ratio for cropping the face region from the image.
183
- context_frames (int): Number of context frames used in temporal modeling.
184
- context_overlap (int): Number of overlapping frames between chunks.
185
- cfg (float): Classifier-Free Guidance scale. Higher values make outputs more faithful to input conditions.
186
- steps (int): Number of denoising steps in the diffusion process.
187
- sample_rate (int): Audio sample rate in Hz (e.g., 16000).
188
- fps (int): Frames per second in the output video.
189
- device (str): Device to run the computation on ("cuda" or "cpu").
190
- progress (gr.Progress): Gradio progress tracker for UI display.
191
-
192
- Returns:
193
- str: File path to the final output video with synchronized audio.
194
-
195
- Notes:
196
- - Input image should clearly show a single face, ideally centered and facing forward.
197
- - Audio should be speech or vocals; music or noise may produce unpredictable results.
198
- - The function trims audio to 5 seconds in shared UI mode to reduce compute time.
199
- - This function is designed to work on a GPU-enabled environment for optimal performance.
200
- """
201
 
202
  if seed is not None and seed > -1:
203
  generator = torch.manual_seed(seed)
@@ -303,6 +271,39 @@ def trim_audio(file_path, output_path, max_duration=5):
303
 
304
  @spaces.GPU(duration=200)
305
  def generate_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device, progress=gr.Progress(track_tqdm=True)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  if is_shared_ui:
307
  gr.Info("Trimming audio to max 5 seconds. Duplicate the space for unlimited audio length.")
308
  uploaded_audio = trim_audio(uploaded_audio, "trimmed_audio.wav")
 
165
  sorted_bboxes = sorted(filtered_bboxes, key=lambda x:(x[3]-x[1]) * (x[2] - x[0]), reverse=True)
166
  return sorted_bboxes[0]
167
 
168
+ def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  if seed is not None and seed > -1:
171
  generator = torch.manual_seed(seed)
 
271
 
272
  @spaces.GPU(duration=200)
273
  def generate_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device, progress=gr.Progress(track_tqdm=True)):
274
+ device):
275
+ """
276
+ Generate a realistic lip-synced talking head video from a static reference image and a voice audio file.
277
+
278
+ This function takes an image of a face and an audio clip, then generates a video where the face in the image is animated to match the speech in the audio. It uses EchoMimic's pipeline with configurable parameters for generation quality, length, and face conditioning.
279
+
280
+ Args:
281
+ uploaded_img (str): Path to the input reference image. This should be a front-facing, clear image of a person's face.
282
+ uploaded_audio (str): Path to the WAV audio file to drive the animation. Speech audio works best.
283
+ width (int): Target width of the generated video frame.
284
+ height (int): Target height of the generated video frame.
285
+ length (int): Number of frames in the final output video.
286
+ seed (int): Random seed for reproducibility. If -1, a random seed is chosen.
287
+ facemask_dilation_ratio (float): Dilation ratio for expanding the face mask region.
288
+ facecrop_dilation_ratio (float): Dilation ratio for cropping the face region from the image.
289
+ context_frames (int): Number of context frames used in temporal modeling.
290
+ context_overlap (int): Number of overlapping frames between chunks.
291
+ cfg (float): Classifier-Free Guidance scale. Higher values make outputs more faithful to input conditions.
292
+ steps (int): Number of denoising steps in the diffusion process.
293
+ sample_rate (int): Audio sample rate in Hz (e.g., 16000).
294
+ fps (int): Frames per second in the output video.
295
+ device (str): Device to run the computation on ("cuda" or "cpu").
296
+ progress (gr.Progress): Gradio progress tracker for UI display.
297
+
298
+ Returns:
299
+ str: File path to the final output video with synchronized audio.
300
+
301
+ Notes:
302
+ - Input image should clearly show a single face, ideally centered and facing forward.
303
+ - Audio should be speech or vocals; music or noise may produce unpredictable results.
304
+ - The function trims audio to 5 seconds in shared UI mode to reduce compute time.
305
+ - This function is designed to work on a GPU-enabled environment for optimal performance.
306
+ """
307
  if is_shared_ui:
308
  gr.Info("Trimming audio to max 5 seconds. Duplicate the space for unlimited audio length.")
309
  uploaded_audio = trim_audio(uploaded_audio, "trimmed_audio.wav")