File size: 16,364 Bytes
b3c65ae
 
5b355a5
19567e9
b3c65ae
cae75e9
4c52cd0
c03f3d5
b3c65ae
19567e9
5b355a5
 
0190f40
 
64a2ea3
 
0190f40
 
19567e9
4f47a1b
19567e9
 
 
64a2ea3
19567e9
 
 
5b355a5
 
19567e9
 
5b355a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b3c65ae
daa6da0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64a2ea3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e0baa4c
 
 
759d98d
d72ce9c
e0baa4c
b7a6480
64a2ea3
 
 
 
 
 
 
 
 
b7a6480
 
 
 
 
759d98d
d72ce9c
b7a6480
 
 
64a2ea3
b7a6480
64a2ea3
 
d9b799e
 
cae75e9
daa6da0
cae75e9
 
 
759d98d
 
 
 
 
 
 
 
5b355a5
 
cae75e9
64a2ea3
cae75e9
64a2ea3
 
4c52cd0
64a2ea3
47a0443
 
 
 
 
 
 
 
cae75e9
5b355a5
64a2ea3
5b355a5
64a2ea3
 
f0930a5
64a2ea3
47a0443
 
7ae1400
 
64a2ea3
7ae1400
64a2ea3
4c52cd0
64a2ea3
47a0443
 
cae75e9
5b355a5
64a2ea3
5b355a5
64a2ea3
4c52cd0
be4847d
64a2ea3
5b355a5
 
 
cae75e9
47a0443
 
 
 
 
 
5b355a5
 
cae75e9
64a2ea3
 
 
 
47a0443
 
cae75e9
4c52cd0
5b355a5
4c52cd0
47a0443
cae75e9
 
47a0443
cae75e9
47a0443
b7a6480
 
 
e0baa4c
 
 
d72ce9c
ab7144c
e0baa4c
b7a6480
daa6da0
 
 
64a2ea3
daa6da0
b7a6480
 
 
 
64a2ea3
d72ce9c
b7a6480
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
"""Main processing workflows for OutofLipSync"""

import gc
import logging
import os
import traceback

import spaces
import gradio as gr
import psutil
import torch

from audio_processing import (
    get_audio_duration,
    prepare_audio_for_lipsync,
    prepare_audio_for_youtube_aac,
    prepare_audio_for_youtube,
)
from config import PROCESSED_RESULTS_DIR
from lipsync_processing import apply_lipsync_to_video, get_video_info
from time_util import timer
from utils import setup_output_dir
from video_processing import (
    normalize_video_for_youtube,
    merge_audio_video,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Import-time side effect: publish the results directory (current working
# directory joined with PROCESSED_RESULTS_DIR by "/") through the
# PROCESSED_RESULTS environment variable.
os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/{PROCESSED_RESULTS_DIR}"


def get_memory_usage():
    """Build a one-line summary of current memory usage.

    Returns:
        A string of the form "RAM: <used>GB / <total>GB (<pct>%)". When
        torch reports CUDA as available, " | GPU: <used>GB / <total>GB
        (<pct>%)" is appended, computed from torch.cuda.memory_allocated(0)
        and the total memory of CUDA device 0.
    """
    bytes_per_gib = 1024**3

    vm = psutil.virtual_memory()
    report = (
        f"RAM: {vm.used / bytes_per_gib:.2f}GB / "
        f"{vm.total / bytes_per_gib:.2f}GB ({vm.percent:.1f}%)"
    )

    # Without CUDA there is nothing to add to the RAM figures.
    if not torch.cuda.is_available():
        return report

    allocated = torch.cuda.memory_allocated(0) / bytes_per_gib
    capacity = torch.cuda.get_device_properties(0).total_memory / bytes_per_gib
    share = (allocated / capacity) * 100
    return f"{report} | GPU: {allocated:.2f}GB / {capacity:.2f}GB ({share:.1f}%)"


def validate_input(video_file, audio_file):
    """Check both uploads and resolve each one to a filesystem path.

    Args:
        video_file: Video input; either a path or a dict holding the path
            under "name" or "path".
        audio_file: Audio input; same accepted forms as video_file.

    Returns:
        Tuple of (video_path, audio_path)

    Raises:
        gr.Error: If either input is None, or if a resolved path is None
            or does not exist on disk.
    """

    def _as_path(upload):
        # Dict uploads carry the path under "name", falling back to "path";
        # anything else is used as the path directly.
        if not isinstance(upload, dict):
            return upload
        return upload.get("name") or upload.get("path")

    # Both presence checks run before any path is resolved.
    if video_file is None:
        raise gr.Error("Please upload a video source.")
    if audio_file is None:
        raise gr.Error("Please upload a target audio.")

    video_path = _as_path(video_file)
    audio_path = _as_path(audio_file)

    if not (video_path is not None and os.path.exists(video_path)):
        raise gr.Error("Could not read uploaded video file.")
    if not (audio_path is not None and os.path.exists(audio_path)):
        raise gr.Error("Could not read uploaded audio file.")

    return video_path, audio_path


# def process_lipsync_with_audio_target(
#     video_file,
#     audio_file,
#     session_id=None,
#     crop_size=256,
#     progress=gr.Progress(track_tqdm=True),
# ):
#     """Lipsync video source với audio target (DEPRECATED - use process_lipsync_with_audio_target_new)
#
#     Args:
#         video_file: Path to video source
#         audio_file: Path to audio target (English only)
#         session_id: Session identifier
#         crop_size: Size of crop region (256 or 512)
#         progress: Progress tracking object
#
#     Returns:
#         Tuple of (final_video, video_looped, face_cropped, lipsynced_face, lipsynced_full)
#     """
#     video_looped = None
#     face_cropped = None
#     lipsynced_face = None
#     lipsynced_full = None
#     final_video = None
#     error_msg = None
#
#     try:
#         video_path, audio_path = validate_input(video_file, audio_file)
#
#         output_dir = setup_output_dir(session_id)
#
#         logger.info(f"Memory at start: {get_memory_usage()}")
#
#         audio_duration = get_audio_duration(audio_path)
#
#         progress(0.1, desc="🎬 Đang chuẩn bị video...")
#         logger.info(f"Memory before video loop: {get_memory_usage()}")
#
#         with timer("Looping/cropping video to match audio"):
#             try:
#                 video_looped = loop_video_to_match_audio(
#                     video_path, audio_duration, output_dir
#                 )
#             except Exception as e:
#                 error_msg = f"Video loop failed: {str(e)}"
#                 logger.error(error_msg)
#                 traceback.print_exc()
#                 return (
#                     final_video,
#                     video_looped,
#                     face_cropped,
#                     lipsynced_face,
#                     lipsynced_full,
#                 )
#
#         gc.collect()
#         logger.info(f"Memory after video loop: {get_memory_usage()}")
#
#         progress(0.2, desc="👤 Đang phát hiện khuôn mặt...")
#         with timer("Detecting face"):
#             try:
#                 face_bbox = detect_face_region(video_looped, output_dir, crop_size)
#             except FaceDetectionError as e:
#                 error_msg = str(e)
#                 logger.error(f"Face detection failed: {e}")
#                 return (
#                     final_video,
#                     video_looped,
#                     face_cropped,
#                     lipsynced_face,
#                     lipsynced_full,
#                 )
#
#         gc.collect()
#         logger.info(f"Memory after face detection: {get_memory_usage()}")
#
#         actual_crop_size = crop_size * 2
#         progress(
#             0.25, desc=f"👤 Đang tính lại crop {actual_crop_size}x{actual_crop_size}..."
#         )
#         with timer(
#             f"Recalculating crop bbox for {actual_crop_size}x{actual_crop_size}"
#         ):
#             from video_processing import get_video_info
#
#             try:
#                 video_info = get_video_info(video_looped)
#                 crop_bbox = calculate_safe_crop_size(
#                     face_bbox["face_bbox"],
#                     video_info["width"],
#                     video_info["height"],
#                     actual_crop_size,
#                 )
#             except Exception as e:
#                 error_msg = f"Calculate crop bbox failed: {str(e)}"
#                 logger.error(error_msg)
#                 traceback.print_exc()
#                 return (
#                     final_video,
#                     video_looped,
#                     face_cropped,
#                     lipsynced_face,
#                     lipsynced_full,
#                 )
#
#         progress(
#             0.3, desc=f"✂️ Đang crop video {actual_crop_size}x{actual_crop_size}..."
#         )
#         with timer(f"Cropping video to {actual_crop_size}x{actual_crop_size}"):
#             try:
#                 face_cropped = crop_video_to_size(
#                     video_looped, crop_bbox, output_dir, actual_crop_size
#                 )
#             except Exception as e:
#                 error_msg = f"Crop video failed: {str(e)}"
#                 logger.error(error_msg)
#                 traceback.print_exc()
#                 return (
#                     final_video,
#                     video_looped,
#                     face_cropped,
#                     lipsynced_face,
#                     lipsynced_full,
#                 )
#
#         gc.collect()
#         logger.info(f"Memory after crop: {get_memory_usage()}")
#
#         progress(0.4, desc="🎵 Đang xử lý audio...")
#         logger.info(f"Memory before audio prep: {get_memory_usage()}")
#
#         with timer("Preparing target audio"):
#             try:
#                 audio_16k, audio_upsampled = prepare_target_audio(
#                     audio_path, output_dir
#                 )
#             except Exception as e:
#                 error_msg = f"Prepare audio failed: {str(e)}"
#                 logger.error(error_msg)
#                 traceback.print_exc()
#                 return (
#                     final_video,
#                     video_looped,
#                     face_cropped,
#                     lipsynced_face,
#                     lipsynced_full,
#                 )
#
#         gc.collect()
#         logger.info(f"Memory after audio prep: {get_memory_usage()}")
#
#         progress(0.6, desc="👄 Đang lipsync...")
#
#         video_info = get_video_info(face_cropped)
#         logger.info(
#             f"Starting lipsync: video={face_cropped}, audio_16k={audio_16k}, output={output_dir}"
#         )
#         logger.info(
#             f"Video info: {video_info['width']}x{video_info['height']}, {video_info['fps']:.1f}fps, {video_info['duration']:.1f}s"
#         )
#         logger.info(f"Memory before lipsync: {get_memory_usage()}")
#
#         with timer("Applying lipsync"):
#             try:
#                 lipsynced_face, lipsynced_info = apply_lipsync_to_video(
#                     face_cropped, audio_16k, output_dir, crop_size
#                 )
#                 logger.info(
#                     f"Lipsynced video size: {lipsynced_info['width']}x{lipsynced_info['height']}"
#                 )
#             except Exception as e:
#                 error_msg = f"Lipsync failed: {str(e)}"
#                 logger.error(f"Lipsync failed with error: {type(e).__name__}: {e}")
#                 logger.error(f"Memory after crash: {get_memory_usage()}")
#                 traceback.print_exc()
#                 return (
#                     final_video,
#                     video_looped,
#                     face_cropped,
#                     lipsynced_face,
#                     lipsynced_full,
#                 )
#
#         gc.collect()
#         logger.info(f"Memory after lipsync: {get_memory_usage()}")
#
#         progress(0.8, desc="🔀 Đang ghép video...")
#         with timer("Blending face into original"):
#             try:
#                 lipsynced_full = blend_face_into_original(
#                     video_looped, lipsynced_face, crop_bbox, output_dir, lipsynced_info
#                 )
#             except Exception as e:
#                 error_msg = f"Blend video failed: {str(e)}"
#                 logger.error(error_msg)
#                 traceback.print_exc()
#                 return (
#                     final_video,
#                     video_looped,
#                     face_cropped,
#                     lipsynced_face,
#                     lipsynced_full,
#                 )
#
#         gc.collect()
#         logger.info(f"Memory after blend: {get_memory_usage()}")
#
#         progress(0.9, desc="🔗 Đang ghép audio...")
#         try:
#             audio_final = prepare_audio_for_youtube(audio_upsampled, output_dir)
#             final_video = merge_audio_video(lipsynced_full, audio_final, output_dir)
#         except Exception as e:
#             error_msg = f"Merge audio failed: {str(e)}"
#             logger.error(error_msg)
#             traceback.print_exc()
#             return (
#                 final_video,
#                 video_looped,
#                 face_cropped,
#                 lipsynced_face,
#                 lipsynced_full,
#             )
#
#         progress(1.0, desc="✅ Hoàn tất!")
#         logger.info(f"Memory at end: {get_memory_usage()}")
#
#         return final_video, video_looped, face_cropped, lipsynced_face, lipsynced_full
#
#     except Exception as e:
#         print(f"ERROR in process_lipsync_with_audio_target: {e}")
#         traceback.print_exc()
#         return final_video, video_looped, face_cropped, lipsynced_face, lipsynced_full


def process_lipsync_with_audio_target_new(
    video_file,
    audio_file,
    session_id=None,
    model_type="latentsync",
    quality_level="Normal",
    progress=gr.Progress(track_tqdm=True),
):
    """New workflow: normalize for YouTube, then lipsync.

    Steps:
    1. Validate inputs
    2. Normalize the video for YouTube (loop/crop + re-encode)
    3. Normalize the audio for YouTube (AAC 320k)
    4. Prepare 16k audio for lipsync
    5. Lipsync pipeline (detects/crops/lipsyncs/restores on its own)
    6. Merge the YouTube audio with the lipsynced video

    Args:
        video_file: Path to video source
        audio_file: Path to audio target (English only)
        session_id: Session identifier
        model_type: Model for lipsync, "LatentSync v1.6" or "MuseTalk v1.5".
            The short names "latentsync" and "musetalk" are accepted as
            aliases and mapped to those names before the pipeline is called.
        quality_level: Quality level ("Fast", "Normal", "Medium", "Best", "Super Best")
        progress: Progress tracking object

    Returns:
        The final video as returned by merge_audio_video.

    Raises:
        gr.Error: If input validation fails (raised unchanged), or wrapping
            any other exception raised during processing, including an
            unknown model_type.
    """
    try:
        video_path, audio_path = validate_input(video_file, audio_file)

        output_dir = setup_output_dir(session_id)

        # Only known models are accepted. The short aliases are normalized
        # here so the default model_type no longer fails the check below.
        if model_type in ("LatentSync v1.6", "latentsync"):
            model_type = "LatentSync v1.6"
            logger.info("Using LatentSync v1.6 with crop_size=512")
        elif model_type in ("MuseTalk v1.5", "musetalk"):
            model_type = "MuseTalk v1.5"
            logger.info("Using MuseTalk v1.5 with crop_size=256")
        else:
            raise ValueError(f"Unknown model_type: {model_type}")

        logger.info(f"Memory at start: {get_memory_usage()}")

        audio_duration = get_audio_duration(audio_path)
        logger.info(f"Audio duration: {audio_duration:.2f}s")

        progress(0.15, desc="🎬 Đang chuẩn hóa video YouTube...")
        logger.info(f"Memory before video normalization: {get_memory_usage()}")

        with timer("Normalizing video for YouTube"):
            video_normalized = normalize_video_for_youtube(
                video_path, audio_duration, output_dir
            )
            video_info = get_video_info(video_normalized)
            logger.info(
                f"Normalized video: {video_info['width']}x{video_info['height']}, "
                f"{video_info['fps']:.1f}fps, {video_info['duration']:.1f}s"
            )

        gc.collect()
        logger.info(f"Memory after video normalization: {get_memory_usage()}")

        progress(0.25, desc="🎵 Đang chuẩn hóa audio YouTube...")
        logger.info(f"Memory before audio normalization: {get_memory_usage()}")

        with timer("Normalizing audio for YouTube"):
            audio_youtube = prepare_audio_for_youtube_aac(audio_path, output_dir)
            logger.info(f"Audio YouTube: {audio_youtube}")

        gc.collect()
        logger.info(f"Memory after audio normalization: {get_memory_usage()}")

        progress(0.35, desc="🔊 Đang chuẩn bị audio cho lipsync...")

        with timer("Preparing audio for lipsync"):
            audio_16k = prepare_audio_for_lipsync(audio_path, output_dir)
            logger.info(f"Audio 16k for lipsync: {audio_16k}")

        gc.collect()
        logger.info(f"Memory after audio preparation: {get_memory_usage()}")

        progress(0.55, desc="👄 Đang lipsync...")

        logger.info(
            f"Starting lipsync: video={video_normalized}, audio_16k={audio_16k}, output={output_dir}"
        )
        logger.info(f"Memory before lipsync: {get_memory_usage()}")

        with timer("Applying lipsync"):
            lipsynced_video, lipsynced_info = apply_lipsync_to_video(
                video_normalized, audio_16k, output_dir, model_type, quality_level
            )
            logger.info(
                f"Lipsynced video: {lipsynced_video}, size: {lipsynced_info['width']}x{lipsynced_info['height']}"
            )
        gc.collect()
        logger.info(f"Memory after lipsync: {get_memory_usage()}")

        progress(0.85, desc="🔗 Đang ghép audio YouTube...")
        logger.info(f"Memory before merge: {get_memory_usage()}")

        with timer("Merging audio and video"):
            final_video = merge_audio_video(lipsynced_video, audio_youtube, output_dir)
            logger.info(f"Final video: {final_video}")

        progress(1.0, desc="✅ Hoàn tất!")
        logger.info(f"Memory at end: {get_memory_usage()}")

        return final_video

    except gr.Error:
        # Already a user-facing message (e.g. from validate_input); re-raise
        # it untouched instead of wrapping it in a second gr.Error.
        raise
    except Exception as e:
        # logger.exception records the message together with the traceback.
        logger.exception("ERROR in process_lipsync_with_audio_target_new: %s", e)
        raise gr.Error(f"Lỗi xử lý: {str(e)}") from e


def lipsync_with_audio_target(
    video_file,
    audio_file,
    session_id=None,
    quality_level="Normal",
    model_type="LatentSync v1.6",
    progress=gr.Progress(track_tqdm=True),
):
    """Wrapper for Gradio: lipsync a video source with an audio target.

    Rejects missing uploads up front, then delegates to
    process_lipsync_with_audio_target_new. This wrapper takes quality_level
    before model_type, while the delegate takes them in the opposite order,
    so they are forwarded by keyword.

    Returns:
        Whatever process_lipsync_with_audio_target_new returns.

    Raises:
        gr.Error: If video_file or audio_file is None.
    """
    required_uploads = (
        (video_file, "Please upload a video source."),
        (audio_file, "Please upload a target audio."),
    )
    for upload, message in required_uploads:
        if upload is None:
            raise gr.Error(message)

    return process_lipsync_with_audio_target_new(
        video_file,
        audio_file,
        session_id=session_id,
        model_type=model_type,
        quality_level=quality_level,
        progress=progress,
    )