MCP server ready
Browse files
webgui.py
CHANGED
|
@@ -166,6 +166,38 @@ def select_face(det_bboxes, probs):
|
|
| 166 |
return sorted_bboxes[0]
|
| 167 |
|
| 168 |
def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
if seed is not None and seed > -1:
|
| 171 |
generator = torch.manual_seed(seed)
|
|
@@ -427,5 +459,5 @@ args = parser.parse_args()
|
|
| 427 |
# demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
|
| 428 |
|
| 429 |
if __name__ == '__main__':
|
| 430 |
-
demo.queue(max_size=3).launch(show_api=
|
| 431 |
#demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
|
|
|
|
| 166 |
return sorted_bboxes[0]
|
| 167 |
|
| 168 |
def process_video(uploaded_img, uploaded_audio, width, height, length, seed, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
|
| 169 |
+
"""
|
| 170 |
+
Generate a realistic lip-synced talking head video from a static reference image and a voice audio file.
|
| 171 |
+
|
| 172 |
+
This function takes an image of a face and an audio clip, then generates a video where the face in the image is animated to match the speech in the audio. It uses EchoMimic's pipeline with configurable parameters for generation quality, length, and face conditioning.
|
| 173 |
+
|
| 174 |
+
Args:
|
| 175 |
+
uploaded_img (str): Path to the input reference image. This should be a front-facing, clear image of a person's face.
|
| 176 |
+
uploaded_audio (str): Path to the WAV audio file to drive the animation. Speech audio works best.
|
| 177 |
+
width (int): Target width of the generated video frame.
|
| 178 |
+
height (int): Target height of the generated video frame.
|
| 179 |
+
length (int): Number of frames in the final output video.
|
| 180 |
+
seed (int): Random seed for reproducibility. If -1, a random seed is chosen.
|
| 181 |
+
facemask_dilation_ratio (float): Dilation ratio for expanding the face mask region.
|
| 182 |
+
facecrop_dilation_ratio (float): Dilation ratio for cropping the face region from the image.
|
| 183 |
+
context_frames (int): Number of context frames used in temporal modeling.
|
| 184 |
+
context_overlap (int): Number of overlapping frames between chunks.
|
| 185 |
+
cfg (float): Classifier-Free Guidance scale. Higher values make outputs more faithful to input conditions.
|
| 186 |
+
steps (int): Number of denoising steps in the diffusion process.
|
| 187 |
+
sample_rate (int): Audio sample rate in Hz (e.g., 16000).
|
| 188 |
+
fps (int): Frames per second in the output video.
|
| 189 |
+
device (str): Device to run the computation on ("cuda" or "cpu").
|
| 190 |
+
progress (gr.Progress): Gradio progress tracker for UI display. NOTE(review): no `progress` parameter appears in the function signature above — either add the parameter (e.g. `progress=gr.Progress()`) or remove this docstring entry so the documentation matches the actual interface.
|
| 191 |
+
|
| 192 |
+
Returns:
|
| 193 |
+
str: File path to the final output video with synchronized audio.
|
| 194 |
+
|
| 195 |
+
Notes:
|
| 196 |
+
- Input image should clearly show a single face, ideally centered and facing forward.
|
| 197 |
+
- Audio should be speech or vocals; music or noise may produce unpredictable results.
|
| 198 |
+
- The function trims audio to 5 seconds in shared UI mode to reduce compute time.
|
| 199 |
+
- This function is designed to work on a GPU-enabled environment for optimal performance.
|
| 200 |
+
"""
|
| 201 |
|
| 202 |
if seed is not None and seed > -1:
|
| 203 |
generator = torch.manual_seed(seed)
|
|
|
|
| 459 |
# demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
|
| 460 |
|
| 461 |
if __name__ == '__main__':
|
| 462 |
+
demo.queue(max_size=3).launch(show_api=True, show_error=True, ssr_mode=False, mcp_server=True)
|
| 463 |
#demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True)
|