Move the docstring's MCP explanation to the right function
Browse files
webgui.py
CHANGED
|
@@ -165,39 +165,7 @@ def select_face(det_bboxes, probs):
|
|
| 165 |
sorted_bboxes = sorted(filtered_bboxes, key=lambda x:(x[3]-x[1]) * (x[2] - x[0]), reverse=True)
|
| 166 |
return sorted_bboxes[0]
|
| 167 |
|
| 168 |
-
def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps,
|
| 169 |
-
"""
|
| 170 |
-
Generate a realistic lip-synced talking head video from a static reference image and a voice audio file.
|
| 171 |
-
|
| 172 |
-
This function takes an image of a face and an audio clip, then generates a video where the face in the image is animated to match the speech in the audio. It uses EchoMimic's pipeline with configurable parameters for generation quality, length, and face conditioning.
|
| 173 |
-
|
| 174 |
-
Args:
|
| 175 |
-
uploaded_img (str): Path to the input reference image. This should be a front-facing, clear image of a person's face.
|
| 176 |
-
uploaded_audio (str): Path to the WAV audio file to drive the animation. Speech audio works best.
|
| 177 |
-
width (int): Target width of the generated video frame.
|
| 178 |
-
height (int): Target height of the generated video frame.
|
| 179 |
-
length (int): Number of frames in the final output video.
|
| 180 |
-
seed (int): Random seed for reproducibility. If -1, a random seed is chosen.
|
| 181 |
-
facemask_dilation_ratio (float): Dilation ratio for expanding the face mask region.
|
| 182 |
-
facecrop_dilation_ratio (float): Dilation ratio for cropping the face region from the image.
|
| 183 |
-
context_frames (int): Number of context frames used in temporal modeling.
|
| 184 |
-
context_overlap (int): Number of overlapping frames between chunks.
|
| 185 |
-
cfg (float): Classifier-Free Guidance scale. Higher values make outputs more faithful to input conditions.
|
| 186 |
-
steps (int): Number of denoising steps in the diffusion process.
|
| 187 |
-
sample_rate (int): Audio sample rate in Hz (e.g., 16000).
|
| 188 |
-
fps (int): Frames per second in the output video.
|
| 189 |
-
device (str): Device to run the computation on ("cuda" or "cpu").
|
| 190 |
-
progress (gr.Progress): Gradio progress tracker for UI display.
|
| 191 |
-
|
| 192 |
-
Returns:
|
| 193 |
-
str: File path to the final output video with synchronized audio.
|
| 194 |
-
|
| 195 |
-
Notes:
|
| 196 |
-
- Input image should clearly show a single face, ideally centered and facing forward.
|
| 197 |
-
- Audio should be speech or vocals; music or noise may produce unpredictable results.
|
| 198 |
-
- The function trims audio to 5 seconds in shared UI mode to reduce compute time.
|
| 199 |
-
- This function is designed to work on a GPU-enabled environment for optimal performance.
|
| 200 |
-
"""
|
| 201 |
|
| 202 |
if seed is not None and seed > -1:
|
| 203 |
generator = torch.manual_seed(seed)
|
|
@@ -303,6 +271,39 @@ def trim_audio(file_path, output_path, max_duration=5):
|
|
| 303 |
|
| 304 |
@spaces.GPU(duration=200)
|
| 305 |
def generate_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device, progress=gr.Progress(track_tqdm=True)):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
if is_shared_ui:
|
| 307 |
gr.Info("Trimming audio to max 5 seconds. Duplicate the space for unlimited audio length.")
|
| 308 |
uploaded_audio = trim_audio(uploaded_audio, "trimmed_audio.wav")
|
|
|
|
| 165 |
sorted_bboxes = sorted(filtered_bboxes, key=lambda x:(x[3]-x[1]) * (x[2] - x[0]), reverse=True)
|
| 166 |
return sorted_bboxes[0]
|
| 167 |
|
| 168 |
+
def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
if seed is not None and seed > -1:
|
| 171 |
generator = torch.manual_seed(seed)
|
|
|
|
| 271 |
|
| 272 |
@spaces.GPU(duration=200)
|
| 273 |
def generate_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device, progress=gr.Progress(track_tqdm=True)):
|
| 274 |
+
device):
|
| 275 |
+
"""
|
| 276 |
+
Generate a realistic lip-synced talking head video from a static reference image and a voice audio file.
|
| 277 |
+
|
| 278 |
+
This function takes an image of a face and an audio clip, then generates a video where the face in the image is animated to match the speech in the audio. It uses EchoMimic's pipeline with configurable parameters for generation quality, length, and face conditioning.
|
| 279 |
+
|
| 280 |
+
Args:
|
| 281 |
+
uploaded_img (str): Path to the input reference image. This should be a front-facing, clear image of a person's face.
|
| 282 |
+
uploaded_audio (str): Path to the WAV audio file to drive the animation. Speech audio works best.
|
| 283 |
+
width (int): Target width of the generated video frame.
|
| 284 |
+
height (int): Target height of the generated video frame.
|
| 285 |
+
length (int): Number of frames in the final output video.
|
| 286 |
+
seed (int): Random seed for reproducibility. If -1, a random seed is chosen.
|
| 287 |
+
facemask_dilation_ratio (float): Dilation ratio for expanding the face mask region.
|
| 288 |
+
facecrop_dilation_ratio (float): Dilation ratio for cropping the face region from the image.
|
| 289 |
+
context_frames (int): Number of context frames used in temporal modeling.
|
| 290 |
+
context_overlap (int): Number of overlapping frames between chunks.
|
| 291 |
+
cfg (float): Classifier-Free Guidance scale. Higher values make outputs more faithful to input conditions.
|
| 292 |
+
steps (int): Number of denoising steps in the diffusion process.
|
| 293 |
+
sample_rate (int): Audio sample rate in Hz (e.g., 16000).
|
| 294 |
+
fps (int): Frames per second in the output video.
|
| 295 |
+
device (str): Device to run the computation on ("cuda" or "cpu").
|
| 296 |
+
progress (gr.Progress): Gradio progress tracker for UI display.
|
| 297 |
+
|
| 298 |
+
Returns:
|
| 299 |
+
str: File path to the final output video with synchronized audio.
|
| 300 |
+
|
| 301 |
+
Notes:
|
| 302 |
+
- Input image should clearly show a single face, ideally centered and facing forward.
|
| 303 |
+
- Audio should be speech or vocals; music or noise may produce unpredictable results.
|
| 304 |
+
- The function trims audio to 5 seconds in shared UI mode to reduce compute time.
|
| 305 |
+
- This function is designed to work on a GPU-enabled environment for optimal performance.
|
| 306 |
+
"""
|
| 307 |
if is_shared_ui:
|
| 308 |
gr.Info("Trimming audio to max 5 seconds. Duplicate the space for unlimited audio length.")
|
| 309 |
uploaded_audio = trim_audio(uploaded_audio, "trimmed_audio.wav")
|