naicoi committed on
Commit
d72ce9c
·
1 Parent(s): dac8faa

video quality

Browse files
Files changed (5) hide show
  1. app.py +6 -1
  2. audio_processing.py +14 -3
  3. lipsync.py +19 -72
  4. lipsync_processing.py +6 -13
  5. processing.py +5 -2
app.py CHANGED
@@ -93,6 +93,11 @@ with gr.Blocks(css=css) as demo:
93
  value="LatentSync v1.6",
94
  label="Model",
95
  )
 
 
 
 
 
96
  lipsync_only_btn = gr.Button("👄 Lipsync", variant="primary", size="lg")
97
 
98
  with gr.Row():
@@ -115,7 +120,7 @@ with gr.Blocks(css=css) as demo:
115
 
116
  lipsync_only_btn.click(
117
  fn=lipsync_with_audio_target,
118
- inputs=[video_input, audio_input, session_state, model_type],
119
  outputs=[
120
  final_video,
121
  video_normalized_output,
 
93
  value="LatentSync v1.6",
94
  label="Model",
95
  )
96
+ quality_level = gr.Radio(
97
+ choices=["Fast", "Normal", "Medium", "Best", "Super Best"],
98
+ value="Normal",
99
+ label="Quality",
100
+ )
101
  lipsync_only_btn = gr.Button("👄 Lipsync", variant="primary", size="lg")
102
 
103
  with gr.Row():
 
120
 
121
  lipsync_only_btn.click(
122
  fn=lipsync_with_audio_target,
123
+ inputs=[video_input, audio_input, session_state, model_type, quality_level],
124
  outputs=[
125
  final_video,
126
  video_normalized_output,
audio_processing.py CHANGED
@@ -5,14 +5,18 @@ import subprocess
5
  from ffmpy import FFmpeg, FFRuntimeError
6
 
7
 
8
- def get_audio_duration(audio_path: str) -> float:
9
- """Get audio file duration
10
 
11
  Args:
12
  audio_path: Path to audio file
 
13
 
14
  Returns:
15
  Duration in seconds
 
 
 
16
  """
17
  cmd = [
18
  "ffprobe",
@@ -25,7 +29,14 @@ def get_audio_duration(audio_path: str) -> float:
25
  audio_path,
26
  ]
27
  result = subprocess.run(cmd, capture_output=True, text=True, check=True)
28
- return float(result.stdout.strip())
 
 
 
 
 
 
 
29
 
30
 
31
  # def prepare_target_audio(audio_path: str, output_dir: str) -> tuple:
 
5
  from ffmpy import FFmpeg, FFRuntimeError
6
 
7
 
8
+ def get_audio_duration(audio_path: str, max_duration: float = 30.0) -> float:
9
+ """Get audio file duration, raise error if exceeds max_duration
10
 
11
  Args:
12
  audio_path: Path to audio file
13
+ max_duration: Maximum duration in seconds (default 30)
14
 
15
  Returns:
16
  Duration in seconds
17
+
18
+ Raises:
19
+ ValueError: If audio duration exceeds max_duration
20
  """
21
  cmd = [
22
  "ffprobe",
 
29
  audio_path,
30
  ]
31
  result = subprocess.run(cmd, capture_output=True, text=True, check=True)
32
+ duration = float(result.stdout.strip())
33
+
34
+ if duration > max_duration:
35
+ raise ValueError(
36
+ f"Audio duration {duration:.2f}s exceeds maximum {max_duration}s"
37
+ )
38
+
39
+ return duration
40
 
41
 
42
  # def prepare_target_audio(audio_path: str, output_dir: str) -> tuple:
lipsync.py CHANGED
@@ -18,59 +18,29 @@ torch.backends.cudnn.deterministic = False
18
  os.makedirs("checkpoints", exist_ok=True)
19
 
20
 
21
# NOTE(review): this helper sits on the removed ("-") side of the diff.
def get_gpu_memory_info():
    """Return a one-line, human-readable summary of CUDA memory usage.

    Returns:
        "CUDA not available" when torch reports no CUDA support; otherwise a
        string with Total / Allocated / Reserved / Free figures for the
        current device, each divided by 1024**3 and labelled "GB".

    "Free" is total device memory minus ``torch.cuda.memory_reserved``; it is
    not a driver-level free-memory query, so memory held by other processes
    is not subtracted.
    """
    if not torch.cuda.is_available():
        return "CUDA not available"

    bytes_per_gb = 1024**3
    device = torch.cuda.current_device()

    total = torch.cuda.get_device_properties(device).total_memory / bytes_per_gb
    allocated = torch.cuda.memory_allocated(device) / bytes_per_gb
    reserved = torch.cuda.memory_reserved(device) / bytes_per_gb
    free = total - reserved

    return (
        f"Total: {total:.2f}GB | Allocated: {allocated:.2f}GB | "
        f"Reserved: {reserved:.2f}GB | Free: {free:.2f}GB"
    )
33
-
34
-
35
# NOTE(review): this helper sits on the removed ("-") side of the diff.
def get_available_vram():
    """Return the VRAM not reserved by PyTorch on the current device, in GB.

    Returns:
        0.0 when torch reports no CUDA support; otherwise total device memory
        minus ``torch.cuda.memory_reserved``, each divided by 1024**3.
        Memory used by other processes is not taken into account.
    """
    if not torch.cuda.is_available():
        return 0.0

    bytes_per_gb = 1024**3
    device = torch.cuda.current_device()

    total = torch.cuda.get_device_properties(device).total_memory / bytes_per_gb
    reserved = torch.cuda.memory_reserved(device) / bytes_per_gb
    return total - reserved
46
-
47
-
48
# NOTE(review): this helper sits on the removed ("-") side of the diff.
def get_optimal_params(available_vram_gb: float) -> tuple:
    """Pick lipsync parameters from a VRAM figure expressed in GB.

    Args:
        available_vram_gb: VRAM in GB. Falsy values (None, 0) and
            non-positive values select the smallest tier.

    Returns:
        tuple of (num_frames, num_inference_steps, guidance_scale)
    """
    smallest_tier = (12, 20, 1.0)
    if not available_vram_gb or available_vram_gb <= 0:
        return smallest_tier

    # (exclusive upper bound in GB, parameters), in ascending order.
    bounded_tiers = (
        (20.0, smallest_tier),
        (40.0, (16, 30, 1.5)),
        (60.0, (20, 40, 2.0)),
    )
    for upper_bound, tier_params in bounded_tiers:
        if available_vram_gb < upper_bound:
            return tier_params

    if available_vram_gb >= 60.0:
        return 24, 50, 2.5

    # Reached only when every comparison above is false (e.g. NaN input).
    return 16, 15, 1.5
70
 
71
 
72
  @spaces.GPU
73
- def apply_lipsync(video_input_path, audio_path, video_out_path, crop_size=256):
 
 
74
  print(f"\n{'=' * 60}")
75
  print(f"LIPSYNC START")
76
  print(f"Input video: {video_input_path}")
@@ -79,8 +49,6 @@ def apply_lipsync(video_input_path, audio_path, video_out_path, crop_size=256):
79
  print(f"Crop size: {crop_size}x{crop_size}")
80
  print(f"{'=' * 60}\n")
81
 
82
- print(f"GPU Memory Before: {get_gpu_memory_info()}")
83
-
84
  manager = ModelManager.get_instance()
85
 
86
  config = manager.get_latentsync_config()
@@ -104,34 +72,18 @@ def apply_lipsync(video_input_path, audio_path, video_out_path, crop_size=256):
104
  if not torch.cuda.is_available():
105
  raise RuntimeError("CUDA not available - GPU required for lipsync")
106
 
107
- total_memory = torch.cuda.get_device_properties(0).total_memory
108
- print(f"Total GPU memory: {total_memory / 1024**3:.2f} GB")
109
-
110
- available_vram = get_available_vram()
111
- print(f"Available VRAM before processing: {available_vram:.2f} GB")
112
-
113
- torch.cuda.empty_cache()
114
- available_vram_after_clear = get_available_vram()
115
- print(f"Available VRAM after cache clear: {available_vram_after_clear:.2f} GB")
116
-
117
- print(
118
- f"\nCalling get_optimal_params with input: {total_memory / 1024**3:.2f} GB"
119
- )
120
- num_frames, num_inference_steps, guidance_scale = get_optimal_params(
121
- total_memory / 1024**3
122
- )
123
- print(
124
- f"get_optimal_params output: num_frames={num_frames}, num_inference_steps={num_inference_steps}"
125
  )
126
 
127
- print(f"\nParameters:")
 
128
  print(f" num_frames: {num_frames}")
129
  print(f" num_inference_steps: {num_inference_steps}")
130
  print(f" guidance_scale: {guidance_scale}")
131
  print(f" resolution: {config.data.resolution}")
132
 
133
  print(f"Initial seed: {torch.initial_seed()}")
134
- print(f"GPU Memory After model load: {get_gpu_memory_info()}")
135
 
136
  print("\nStarting pipeline inference...")
137
  print(
@@ -154,16 +106,13 @@ def apply_lipsync(video_input_path, audio_path, video_out_path, crop_size=256):
154
  height=crop_size,
155
  )
156
  print("Pipeline completed successfully")
157
- print(f"GPU Memory After pipeline: {get_gpu_memory_info()}")
158
 
159
  except RuntimeError as e:
160
  error_msg = str(e).lower()
161
  print(f"RuntimeError in pipeline: {e}")
162
  if "out of memory" in error_msg or "cuda out of memory" in error_msg:
163
  print("GPU OOM DETECTED!")
164
- print(f"GPU Memory at crash: {get_gpu_memory_info()}")
165
  torch.cuda.empty_cache()
166
- print(f"GPU Memory after OOM cleanup: {get_gpu_memory_info()}")
167
  raise RuntimeError(
168
  "GPU out of memory during lipsync. Try: 1) Shorter video 2) Lower resolution 3) Close other GPU apps"
169
  )
@@ -172,13 +121,11 @@ def apply_lipsync(video_input_path, audio_path, video_out_path, crop_size=256):
172
  print(f"Unexpected error in pipeline: {e}")
173
  print(f"Error type: {type(e).__name__}")
174
  traceback.print_exc()
175
- print(f"GPU Memory at error: {get_gpu_memory_info()}")
176
  raise
177
  finally:
178
  print("Clearing GPU cache...")
179
  torch.cuda.empty_cache()
180
  gc.collect()
181
- print(f"GPU Memory After cleanup: {get_gpu_memory_info()}")
182
 
183
  print(f"\n{'=' * 60}")
184
  print(f"LIPSYNC SUCCESS - Output: {video_out_path}")
 
18
  os.makedirs("checkpoints", exist_ok=True)
19
 
20
 
21
# Presets keyed by quality label:
# (num_frames, num_inference_steps, guidance_scale)
_QUALITY_PRESETS = {
    "Fast": (12, 15, 1.0),
    "Normal": (12, 20, 1.0),
    "Medium": (16, 30, 1.5),
    "Best": (20, 40, 2.0),
    "Super Best": (24, 50, 2.5),
}

# Label whose preset is used when the requested level is not recognised.
_DEFAULT_QUALITY_LEVEL = "Normal"


def get_quality_params(level: str) -> tuple:
    """Get lipsync parameters based on quality level

    Args:
        level: Quality level (Fast, Normal, Medium, Best, Super Best).
            Any other value falls back to the "Normal" preset.

    Returns:
        tuple of (num_frames, num_inference_steps, guidance_scale)
    """
    # The fallback is read from the table so it cannot drift from "Normal".
    return _QUALITY_PRESETS.get(level, _QUALITY_PRESETS[_DEFAULT_QUALITY_LEVEL])
 
 
 
 
 
38
 
39
 
40
  @spaces.GPU
41
+ def apply_lipsync(
42
+ video_input_path, audio_path, video_out_path, crop_size=256, quality_level="Normal"
43
+ ):
44
  print(f"\n{'=' * 60}")
45
  print(f"LIPSYNC START")
46
  print(f"Input video: {video_input_path}")
 
49
  print(f"Crop size: {crop_size}x{crop_size}")
50
  print(f"{'=' * 60}\n")
51
 
 
 
52
  manager = ModelManager.get_instance()
53
 
54
  config = manager.get_latentsync_config()
 
72
  if not torch.cuda.is_available():
73
  raise RuntimeError("CUDA not available - GPU required for lipsync")
74
 
75
+ num_frames, num_inference_steps, guidance_scale = get_quality_params(
76
+ quality_level
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  )
78
 
79
+ print(f"\nQuality level: {quality_level}")
80
+ print(f"Parameters:")
81
  print(f" num_frames: {num_frames}")
82
  print(f" num_inference_steps: {num_inference_steps}")
83
  print(f" guidance_scale: {guidance_scale}")
84
  print(f" resolution: {config.data.resolution}")
85
 
86
  print(f"Initial seed: {torch.initial_seed()}")
 
87
 
88
  print("\nStarting pipeline inference...")
89
  print(
 
106
  height=crop_size,
107
  )
108
  print("Pipeline completed successfully")
 
109
 
110
  except RuntimeError as e:
111
  error_msg = str(e).lower()
112
  print(f"RuntimeError in pipeline: {e}")
113
  if "out of memory" in error_msg or "cuda out of memory" in error_msg:
114
  print("GPU OOM DETECTED!")
 
115
  torch.cuda.empty_cache()
 
116
  raise RuntimeError(
117
  "GPU out of memory during lipsync. Try: 1) Shorter video 2) Lower resolution 3) Close other GPU apps"
118
  )
 
121
  print(f"Unexpected error in pipeline: {e}")
122
  print(f"Error type: {type(e).__name__}")
123
  traceback.print_exc()
 
124
  raise
125
  finally:
126
  print("Clearing GPU cache...")
127
  torch.cuda.empty_cache()
128
  gc.collect()
 
129
 
130
  print(f"\n{'=' * 60}")
131
  print(f"LIPSYNC SUCCESS - Output: {video_out_path}")
lipsync_processing.py CHANGED
@@ -49,6 +49,7 @@ def apply_lipsync_to_video(
49
  audio_16k_path: str,
50
  output_dir: str,
51
  model_type: str = "LatentSync v1.6",
 
52
  ) -> tuple:
53
  """Apply lipsync to video using clean 16k audio
54
 
@@ -57,6 +58,7 @@ def apply_lipsync_to_video(
57
  audio_16k_path: Path to 16kHz audio
58
  output_dir: Directory to save output
59
  model_type: Model type for lipsync ("LatentSync v1.6" or "MuseTalk v1.5")
 
60
 
61
  Returns:
62
  Tuple of (lipsynced_video_path, video_info)
@@ -67,9 +69,11 @@ def apply_lipsync_to_video(
67
  if model_type == "LatentSync v1.6":
68
  crop_size = 512
69
  print(
70
- f"Using LatentSync: video={video_path}, audio={audio_16k_path}, crop_size={crop_size}"
 
 
 
71
  )
72
- apply_lipsync(video_path, audio_16k_path, lipsynced_video, crop_size)
73
 
74
  elif model_type == "MuseTalk v1.5":
75
  from musetalk import apply_musetalk_lipsync
@@ -101,14 +105,3 @@ def apply_lipsync_to_video(
101
  print(f"Runtime Error in lipsync processing: {e}")
102
  traceback.print_exc()
103
  raise
104
- except Exception:
105
- raise
106
- except Exception as e:
107
- print(f"Error in apply_lipsync_to_video: {e}")
108
- traceback.print_exc()
109
- raise
110
-
111
- except Exception as e:
112
- print(f"Error in apply_lipsync_to_video: {e}")
113
- traceback.print_exc()
114
- raise
 
49
  audio_16k_path: str,
50
  output_dir: str,
51
  model_type: str = "LatentSync v1.6",
52
+ quality_level: str = "Normal",
53
  ) -> tuple:
54
  """Apply lipsync to video using clean 16k audio
55
 
 
58
  audio_16k_path: Path to 16kHz audio
59
  output_dir: Directory to save output
60
  model_type: Model type for lipsync ("LatentSync v1.6" or "MuseTalk v1.5")
61
+ quality_level: Quality level ("Fast", "Normal", "Medium", "Best", "Super Best")
62
 
63
  Returns:
64
  Tuple of (lipsynced_video_path, video_info)
 
69
  if model_type == "LatentSync v1.6":
70
  crop_size = 512
71
  print(
72
+ f"Using LatentSync: video={video_path}, audio={audio_16k_path}, crop_size={crop_size}, quality={quality_level}"
73
+ )
74
+ apply_lipsync(
75
+ video_path, audio_16k_path, lipsynced_video, crop_size, quality_level
76
  )
 
77
 
78
  elif model_type == "MuseTalk v1.5":
79
  from musetalk import apply_musetalk_lipsync
 
105
  print(f"Runtime Error in lipsync processing: {e}")
106
  traceback.print_exc()
107
  raise
 
 
 
 
 
 
 
 
 
 
 
processing.py CHANGED
@@ -321,6 +321,7 @@ def process_lipsync_with_audio_target_new(
321
  audio_file,
322
  session_id=None,
323
  model_type="latentsync",
 
324
  progress=gr.Progress(track_tqdm=True),
325
  ):
326
  """Workflow mới: Chuẩn hóa YouTube rồi lipsync
@@ -338,6 +339,7 @@ def process_lipsync_with_audio_target_new(
338
  audio_file: Path to audio target (English only)
339
  session_id: Session identifier
340
  model_type: Model type for lipsync ("latentsync" or "musetalk")
 
341
  progress: Progress tracking object
342
 
343
  Returns:
@@ -427,7 +429,7 @@ def process_lipsync_with_audio_target_new(
427
  with timer("Applying lipsync"):
428
  try:
429
  lipsynced_video, lipsynced_info = apply_lipsync_to_video(
430
- video_normalized, audio_16k, output_dir, model_type
431
  )
432
  logger.info(
433
  f"Lipsynced video: {lipsynced_video}, size: {lipsynced_info['width']}x{lipsynced_info['height']}"
@@ -471,6 +473,7 @@ def lipsync_with_audio_target(
471
  audio_file,
472
  session_id=None,
473
  model_type="LatentSync v1.6",
 
474
  progress=gr.Progress(track_tqdm=True),
475
  ):
476
  """Wrapper for Gradio: Lipsync video source with audio target (English only)
@@ -483,5 +486,5 @@ def lipsync_with_audio_target(
483
  if audio_file is None:
484
  raise gr.Error("Please upload a target audio.")
485
  return process_lipsync_with_audio_target_new(
486
- video_file, audio_file, session_id, model_type, progress
487
  )
 
321
  audio_file,
322
  session_id=None,
323
  model_type="latentsync",
324
+ quality_level="Normal",
325
  progress=gr.Progress(track_tqdm=True),
326
  ):
327
  """Workflow mới: Chuẩn hóa YouTube rồi lipsync
 
339
  audio_file: Path to audio target (English only)
340
  session_id: Session identifier
341
  model_type: Model type for lipsync ("latentsync" or "musetalk")
342
+ quality_level: Quality level ("Fast", "Normal", "Medium", "Best", "Super Best")
343
  progress: Progress tracking object
344
 
345
  Returns:
 
429
  with timer("Applying lipsync"):
430
  try:
431
  lipsynced_video, lipsynced_info = apply_lipsync_to_video(
432
+ video_normalized, audio_16k, output_dir, model_type, quality_level
433
  )
434
  logger.info(
435
  f"Lipsynced video: {lipsynced_video}, size: {lipsynced_info['width']}x{lipsynced_info['height']}"
 
473
  audio_file,
474
  session_id=None,
475
  model_type="LatentSync v1.6",
476
+ quality_level="Normal",
477
  progress=gr.Progress(track_tqdm=True),
478
  ):
479
  """Wrapper for Gradio: Lipsync video source with audio target (English only)
 
486
  if audio_file is None:
487
  raise gr.Error("Please upload a target audio.")
488
  return process_lipsync_with_audio_target_new(
489
+ video_file, audio_file, session_id, model_type, quality_level, progress
490
  )