echo

Build error

App Files Files Community

fffiloni committed on Jun 4, 2025

Commit

b4242c7

verified ·

1 Parent(s): 8c1f2d8

Move docstrings mcp explntn to the right func

Browse files

Files changed (1) hide show

webgui.py +34 -33

webgui.py CHANGED Viewed

@@ -165,39 +165,7 @@ def select_face(det_bboxes, probs):
     sorted_bboxes = sorted(filtered_bboxes, key=lambda x:(x[3]-x[1]) * (x[2] - x[0]), reverse=True)
     return sorted_bboxes[0]
-def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
-    """
-    Generate a realistic lip-synced talking head video from a static reference image and a voice audio file.
-    This function takes an image of a face and an audio clip, then generates a video where the face in the image is animated to match the speech in the audio. It uses EchoMimic's pipeline with configurable parameters for generation quality, length, and face conditioning.
-    Args:
-        uploaded_img (str): Path to the input reference image. This should be a front-facing, clear image of a person's face.
-        uploaded_audio (str): Path to the WAV audio file to drive the animation. Speech audio works best.
-        width (int): Target width of the generated video frame.
-        height (int): Target height of the generated video frame.
-        length (int): Number of frames in the final output video.
-        seed (int): Random seed for reproducibility. If -1, a random seed is chosen.
-        facemask_dilation_ratio (float): Dilation ratio for expanding the face mask region.
-        facecrop_dilation_ratio (float): Dilation ratio for cropping the face region from the image.
-        context_frames (int): Number of context frames used in temporal modeling.
-        context_overlap (int): Number of overlapping frames between chunks.
-        cfg (float): Classifier-Free Guidance scale. Higher values make outputs more faithful to input conditions.
-        steps (int): Number of denoising steps in the diffusion process.
-        sample_rate (int): Audio sample rate in Hz (e.g., 16000).
-        fps (int): Frames per second in the output video.
-        device (str): Device to run the computation on ("cuda" or "cpu").
-        progress (gr.Progress): Gradio progress tracker for UI display.
-    Returns:
-        str: File path to the final output video with synchronized audio.
-    Notes:
-        - Input image should clearly show a single face, ideally centered and facing forward.
-        - Audio should be speech or vocals; music or noise may produce unpredictable results.
-        - The function trims audio to 5 seconds in shared UI mode to reduce compute time.
-        - This function is designed to work on a GPU-enabled environment for optimal performance.
-    """
     if seed is not None and seed > -1:
         generator = torch.manual_seed(seed)
@@ -303,6 +271,39 @@ def trim_audio(file_path, output_path, max_duration=5):
 @spaces.GPU(duration=200)
 def generate_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device, progress=gr.Progress(track_tqdm=True)):
     if is_shared_ui:
         gr.Info("Trimming audio to max 5 seconds. Duplicate the space for unlimited audio length.")
         uploaded_audio = trim_audio(uploaded_audio, "trimmed_audio.wav")

     sorted_bboxes = sorted(filtered_bboxes, key=lambda x:(x[3]-x[1]) * (x[2] - x[0]), reverse=True)
     return sorted_bboxes[0]
+def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps,
     if seed is not None and seed > -1:
         generator = torch.manual_seed(seed)
 @spaces.GPU(duration=200)
 def generate_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device, progress=gr.Progress(track_tqdm=True)):
+device):
+    """
+    Generate a realistic lip-synced talking head video from a static reference image and a voice audio file.
+    This function takes an image of a face and an audio clip, then generates a video where the face in the image is animated to match the speech in the audio. It uses EchoMimic's pipeline with configurable parameters for generation quality, length, and face conditioning.
+    Args:
+        uploaded_img (str): Path to the input reference image. This should be a front-facing, clear image of a person's face.
+        uploaded_audio (str): Path to the WAV audio file to drive the animation. Speech audio works best.
+        width (int): Target width of the generated video frame.
+        height (int): Target height of the generated video frame.
+        length (int): Number of frames in the final output video.
+        seed (int): Random seed for reproducibility. If -1, a random seed is chosen.
+        facemask_dilation_ratio (float): Dilation ratio for expanding the face mask region.
+        facecrop_dilation_ratio (float): Dilation ratio for cropping the face region from the image.
+        context_frames (int): Number of context frames used in temporal modeling.
+        context_overlap (int): Number of overlapping frames between chunks.
+        cfg (float): Classifier-Free Guidance scale. Higher values make outputs more faithful to input conditions.
+        steps (int): Number of denoising steps in the diffusion process.
+        sample_rate (int): Audio sample rate in Hz (e.g., 16000).
+        fps (int): Frames per second in the output video.
+        device (str): Device to run the computation on ("cuda" or "cpu").
+        progress (gr.Progress): Gradio progress tracker for UI display.
+    Returns:
+        str: File path to the final output video with synchronized audio.
+    Notes:
+        - Input image should clearly show a single face, ideally centered and facing forward.
+        - Audio should be speech or vocals; music or noise may produce unpredictable results.
+        - The function trims audio to 5 seconds in shared UI mode to reduce compute time.
+        - This function is designed to work on a GPU-enabled environment for optimal performance.
+    """
     if is_shared_ui:
         gr.Info("Trimming audio to max 5 seconds. Duplicate the space for unlimited audio length.")
         uploaded_audio = trim_audio(uploaded_audio, "trimmed_audio.wav")