"""Main processing workflows for OutofLipSync"""
import gc
import logging
import os
import traceback
import spaces
import gradio as gr
import psutil
import torch
from audio_processing import (
get_audio_duration,
prepare_audio_for_lipsync,
prepare_audio_for_youtube_aac,
prepare_audio_for_youtube,
)
from config import PROCESSED_RESULTS_DIR
from lipsync_processing import apply_lipsync_to_video, get_video_info
from time_util import timer
from utils import setup_output_dir
from video_processing import (
normalize_video_for_youtube,
merge_audio_video,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Import-time side effect: publish the results directory (current working
# directory joined with PROCESSED_RESULTS_DIR) through the process environment.
# NOTE(review): presumably read by other modules or subprocesses — confirm.
os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/{PROCESSED_RESULTS_DIR}"
def get_memory_usage():
    """Summarize current RAM usage and, when CUDA is available, GPU usage.

    Returns:
        A one-line string of the form
        ``RAM: <used>GB / <total>GB (<pct>%)``; when CUDA is available it is
        followed by `` | GPU: <allocated>GB / <total>GB (<pct>%)`` for
        device 0, where <allocated> is ``torch.cuda.memory_allocated(0)``.
    """
    bytes_per_gib = 1024**3

    vm = psutil.virtual_memory()
    report = (
        f"RAM: {vm.used / bytes_per_gib:.2f}GB / "
        f"{vm.total / bytes_per_gib:.2f}GB ({vm.percent:.1f}%)"
    )

    # Without CUDA there is nothing to append to the RAM summary.
    if not torch.cuda.is_available():
        return report

    allocated_gb = torch.cuda.memory_allocated(0) / bytes_per_gib
    capacity_gb = torch.cuda.get_device_properties(0).total_memory / bytes_per_gib
    share = (allocated_gb / capacity_gb) * 100
    return f"{report} | GPU: {allocated_gb:.2f}GB / {capacity_gb:.2f}GB ({share:.1f}%)"
def _resolve_upload_path(upload):
    """Return the filesystem path carried by an upload value.

    A dict payload is read through its "name" key, falling back to "path";
    any other value is returned unchanged.
    """
    if isinstance(upload, dict):
        return upload.get("name") or upload.get("path")
    return upload


def validate_input(video_file, audio_file):
    """Validate the uploaded video and audio and return their paths.

    Args:
        video_file: Video input, either a path or a dict carrying the path
            under "name" or "path".
        audio_file: Audio input, in the same forms as ``video_file``.

    Returns:
        Tuple of (video_path, audio_path).

    Raises:
        gr.Error: If either input is missing, or its path cannot be resolved
            to an existing regular file.
    """
    if video_file is None:
        raise gr.Error("Please upload a video source.")
    if audio_file is None:
        raise gr.Error("Please upload a target audio.")

    video_path = _resolve_upload_path(video_file)
    audio_path = _resolve_upload_path(audio_file)

    # isfile (rather than exists) so that a directory is not accepted as media.
    if video_path is None or not os.path.isfile(video_path):
        raise gr.Error("Could not read uploaded video file.")
    if audio_path is None or not os.path.isfile(audio_path):
        raise gr.Error("Could not read uploaded audio file.")

    return video_path, audio_path
# def process_lipsync_with_audio_target(
# video_file,
# audio_file,
# session_id=None,
# crop_size=256,
# progress=gr.Progress(track_tqdm=True),
# ):
# """Lipsync video source với audio target (DEPRECATED - use process_lipsync_with_audio_target_new)
#
# Args:
# video_file: Path to video source
# audio_file: Path to audio target (English only)
# session_id: Session identifier
# crop_size: Size of crop region (256 or 512)
# progress: Progress tracking object
#
# Returns:
# Tuple of (final_video, video_looped, face_cropped, lipsynced_face, lipsynced_full)
# """
# video_looped = None
# face_cropped = None
# lipsynced_face = None
# lipsynced_full = None
# final_video = None
# error_msg = None
#
# try:
# video_path, audio_path = validate_input(video_file, audio_file)
#
# output_dir = setup_output_dir(session_id)
#
# logger.info(f"Memory at start: {get_memory_usage()}")
#
# audio_duration = get_audio_duration(audio_path)
#
# progress(0.1, desc="🎬 Đang chuẩn bị video...")
# logger.info(f"Memory before video loop: {get_memory_usage()}")
#
# with timer("Looping/cropping video to match audio"):
# try:
# video_looped = loop_video_to_match_audio(
# video_path, audio_duration, output_dir
# )
# except Exception as e:
# error_msg = f"Video loop failed: {str(e)}"
# logger.error(error_msg)
# traceback.print_exc()
# return (
# final_video,
# video_looped,
# face_cropped,
# lipsynced_face,
# lipsynced_full,
# )
#
# gc.collect()
# logger.info(f"Memory after video loop: {get_memory_usage()}")
#
# progress(0.2, desc="👤 Đang phát hiện khuôn mặt...")
# with timer("Detecting face"):
# try:
# face_bbox = detect_face_region(video_looped, output_dir, crop_size)
# except FaceDetectionError as e:
# error_msg = str(e)
# logger.error(f"Face detection failed: {e}")
# return (
# final_video,
# video_looped,
# face_cropped,
# lipsynced_face,
# lipsynced_full,
# )
#
# gc.collect()
# logger.info(f"Memory after face detection: {get_memory_usage()}")
#
# actual_crop_size = crop_size * 2
# progress(
# 0.25, desc=f"👤 Đang tính lại crop {actual_crop_size}x{actual_crop_size}..."
# )
# with timer(
# f"Recalculating crop bbox for {actual_crop_size}x{actual_crop_size}"
# ):
# from video_processing import get_video_info
#
# try:
# video_info = get_video_info(video_looped)
# crop_bbox = calculate_safe_crop_size(
# face_bbox["face_bbox"],
# video_info["width"],
# video_info["height"],
# actual_crop_size,
# )
# except Exception as e:
# error_msg = f"Calculate crop bbox failed: {str(e)}"
# logger.error(error_msg)
# traceback.print_exc()
# return (
# final_video,
# video_looped,
# face_cropped,
# lipsynced_face,
# lipsynced_full,
# )
#
# progress(
# 0.3, desc=f"✂️ Đang crop video {actual_crop_size}x{actual_crop_size}..."
# )
# with timer(f"Cropping video to {actual_crop_size}x{actual_crop_size}"):
# try:
# face_cropped = crop_video_to_size(
# video_looped, crop_bbox, output_dir, actual_crop_size
# )
# except Exception as e:
# error_msg = f"Crop video failed: {str(e)}"
# logger.error(error_msg)
# traceback.print_exc()
# return (
# final_video,
# video_looped,
# face_cropped,
# lipsynced_face,
# lipsynced_full,
# )
#
# gc.collect()
# logger.info(f"Memory after crop: {get_memory_usage()}")
#
# progress(0.4, desc="🎵 Đang xử lý audio...")
# logger.info(f"Memory before audio prep: {get_memory_usage()}")
#
# with timer("Preparing target audio"):
# try:
# audio_16k, audio_upsampled = prepare_target_audio(
# audio_path, output_dir
# )
# except Exception as e:
# error_msg = f"Prepare audio failed: {str(e)}"
# logger.error(error_msg)
# traceback.print_exc()
# return (
# final_video,
# video_looped,
# face_cropped,
# lipsynced_face,
# lipsynced_full,
# )
#
# gc.collect()
# logger.info(f"Memory after audio prep: {get_memory_usage()}")
#
# progress(0.6, desc="👄 Đang lipsync...")
#
# video_info = get_video_info(face_cropped)
# logger.info(
# f"Starting lipsync: video={face_cropped}, audio_16k={audio_16k}, output={output_dir}"
# )
# logger.info(
# f"Video info: {video_info['width']}x{video_info['height']}, {video_info['fps']:.1f}fps, {video_info['duration']:.1f}s"
# )
# logger.info(f"Memory before lipsync: {get_memory_usage()}")
#
# with timer("Applying lipsync"):
# try:
# lipsynced_face, lipsynced_info = apply_lipsync_to_video(
# face_cropped, audio_16k, output_dir, crop_size
# )
# logger.info(
# f"Lipsynced video size: {lipsynced_info['width']}x{lipsynced_info['height']}"
# )
# except Exception as e:
# error_msg = f"Lipsync failed: {str(e)}"
# logger.error(f"Lipsync failed with error: {type(e).__name__}: {e}")
# logger.error(f"Memory after crash: {get_memory_usage()}")
# traceback.print_exc()
# return (
# final_video,
# video_looped,
# face_cropped,
# lipsynced_face,
# lipsynced_full,
# )
#
# gc.collect()
# logger.info(f"Memory after lipsync: {get_memory_usage()}")
#
# progress(0.8, desc="🔀 Đang ghép video...")
# with timer("Blending face into original"):
# try:
# lipsynced_full = blend_face_into_original(
# video_looped, lipsynced_face, crop_bbox, output_dir, lipsynced_info
# )
# except Exception as e:
# error_msg = f"Blend video failed: {str(e)}"
# logger.error(error_msg)
# traceback.print_exc()
# return (
# final_video,
# video_looped,
# face_cropped,
# lipsynced_face,
# lipsynced_full,
# )
#
# gc.collect()
# logger.info(f"Memory after blend: {get_memory_usage()}")
#
# progress(0.9, desc="🔗 Đang ghép audio...")
# try:
# audio_final = prepare_audio_for_youtube(audio_upsampled, output_dir)
# final_video = merge_audio_video(lipsynced_full, audio_final, output_dir)
# except Exception as e:
# error_msg = f"Merge audio failed: {str(e)}"
# logger.error(error_msg)
# traceback.print_exc()
# return (
# final_video,
# video_looped,
# face_cropped,
# lipsynced_face,
# lipsynced_full,
# )
#
# progress(1.0, desc="✅ Hoàn tất!")
# logger.info(f"Memory at end: {get_memory_usage()}")
#
# return final_video, video_looped, face_cropped, lipsynced_face, lipsynced_full
#
# except Exception as e:
# print(f"ERROR in process_lipsync_with_audio_target: {e}")
# traceback.print_exc()
# return final_video, video_looped, face_cropped, lipsynced_face, lipsynced_full
def process_lipsync_with_audio_target_new(
    video_file,
    audio_file,
    session_id=None,
    model_type="latentsync",
    quality_level="Normal",
    progress=gr.Progress(track_tqdm=True),
):
    """Normalize the inputs for YouTube, lipsync the video, then merge the audio.

    Steps:
        1. Validate the inputs.
        2. Normalize the video for YouTube (``normalize_video_for_youtube``,
           given the target audio duration).
        3. Normalize the audio for YouTube (``prepare_audio_for_youtube_aac``).
        4. Prepare the audio fed to the lipsync model
           (``prepare_audio_for_lipsync``).
        5. Run the lipsync pipeline (``apply_lipsync_to_video``).
        6. Merge the YouTube audio with the lipsynced video.

    Args:
        video_file: Video source (path or upload dict, see ``validate_input``).
        audio_file: Target audio (path or upload dict); documented upstream
            as English only.
        session_id: Session identifier passed to ``setup_output_dir``.
        model_type: "LatentSync v1.6" or "MuseTalk v1.5". The short aliases
            "latentsync" and "musetalk" are accepted and mapped to those
            labels before being forwarded.
        quality_level: Quality label forwarded unchanged to
            ``apply_lipsync_to_video`` (upstream docs list "Fast", "Normal",
            "Medium", "Best", "Super Best" — not checked here).
        progress: Gradio progress tracker.

    Returns:
        The value returned by ``merge_audio_video`` for the final video
        (presumably its file path — confirm against that helper).

    Raises:
        gr.Error: On invalid input or any failure in the pipeline.
    """
    # Short names (including this function's own default) mapped onto the
    # labels that the check below accepts.
    model_aliases = {
        "latentsync": "LatentSync v1.6",
        "musetalk": "MuseTalk v1.5",
    }
    crop_size_by_model = {
        "LatentSync v1.6": 512,
        "MuseTalk v1.5": 256,
    }

    try:
        video_path, audio_path = validate_input(video_file, audio_file)

        # Reject an unknown model before any per-session output is set up.
        model_type = model_aliases.get(model_type, model_type)
        if model_type not in crop_size_by_model:
            raise ValueError(f"Unknown model_type: {model_type}")
        logger.info(
            f"Using {model_type} with crop_size={crop_size_by_model[model_type]}"
        )

        output_dir = setup_output_dir(session_id)

        logger.info(f"Memory at start: {get_memory_usage()}")
        audio_duration = get_audio_duration(audio_path)
        logger.info(f"Audio duration: {audio_duration:.2f}s")

        progress(0.15, desc="🎬 Đang chuẩn hóa video YouTube...")
        logger.info(f"Memory before video normalization: {get_memory_usage()}")
        with timer("Normalizing video for YouTube"):
            video_normalized = normalize_video_for_youtube(
                video_path, audio_duration, output_dir
            )
            video_info = get_video_info(video_normalized)
            logger.info(
                f"Normalized video: {video_info['width']}x{video_info['height']}, "
                f"{video_info['fps']:.1f}fps, {video_info['duration']:.1f}s"
            )
        gc.collect()
        logger.info(f"Memory after video normalization: {get_memory_usage()}")

        progress(0.25, desc="🎵 Đang chuẩn hóa audio YouTube...")
        logger.info(f"Memory before audio normalization: {get_memory_usage()}")
        with timer("Normalizing audio for YouTube"):
            audio_youtube = prepare_audio_for_youtube_aac(audio_path, output_dir)
            logger.info(f"Audio YouTube: {audio_youtube}")
        gc.collect()
        logger.info(f"Memory after audio normalization: {get_memory_usage()}")

        progress(0.35, desc="🔊 Đang chuẩn bị audio cho lipsync...")
        with timer("Preparing audio for lipsync"):
            audio_16k = prepare_audio_for_lipsync(audio_path, output_dir)
            logger.info(f"Audio 16k for lipsync: {audio_16k}")
        gc.collect()
        logger.info(f"Memory after audio preparation: {get_memory_usage()}")

        progress(0.55, desc="👄 Đang lipsync...")
        logger.info(
            f"Starting lipsync: video={video_normalized}, audio_16k={audio_16k}, output={output_dir}"
        )
        logger.info(f"Memory before lipsync: {get_memory_usage()}")
        with timer("Applying lipsync"):
            lipsynced_video, lipsynced_info = apply_lipsync_to_video(
                video_normalized, audio_16k, output_dir, model_type, quality_level
            )
            logger.info(
                f"Lipsynced video: {lipsynced_video}, size: {lipsynced_info['width']}x{lipsynced_info['height']}"
            )
        gc.collect()
        logger.info(f"Memory after lipsync: {get_memory_usage()}")

        progress(0.85, desc="🔗 Đang ghép audio YouTube...")
        logger.info(f"Memory before merge: {get_memory_usage()}")
        with timer("Merging audio and video"):
            final_video = merge_audio_video(lipsynced_video, audio_youtube, output_dir)
            logger.info(f"Final video: {final_video}")

        progress(1.0, desc="✅ Hoàn tất!")
        logger.info(f"Memory at end: {get_memory_usage()}")
        return final_video
    except gr.Error:
        # Already a user-facing message (e.g. raised by validate_input);
        # re-wrapping it would only add a second prefix.
        raise
    except Exception as e:
        logger.error(f"ERROR in process_lipsync_with_audio_target_new: {e}")
        traceback.print_exc()
        # Chain the cause so the original traceback stays attached.
        raise gr.Error(f"Lỗi xử lý: {str(e)}") from e
def lipsync_with_audio_target(
    video_file,
    audio_file,
    session_id=None,
    quality_level="Normal",
    model_type="LatentSync v1.6",
    progress=gr.Progress(track_tqdm=True),
):
    """Gradio entry point: lipsync a video source to a target audio track.

    Checks that both uploads are present, then delegates to
    ``process_lipsync_with_audio_target_new``.

    Returns:
        Whatever ``process_lipsync_with_audio_target_new`` returns.

    Raises:
        gr.Error: If the video or the audio upload is missing.
    """
    required_uploads = (
        (video_file, "Please upload a video source."),
        (audio_file, "Please upload a target audio."),
    )
    for upload, message in required_uploads:
        if upload is None:
            raise gr.Error(message)

    # The worker takes model_type before quality_level, i.e. the reverse of
    # this wrapper's own parameter order.
    worker_args = (
        video_file,
        audio_file,
        session_id,
        model_type,
        quality_level,
        progress,
    )
    return process_lipsync_with_audio_target_new(*worker_args)
|