Spaces:

naicoi
/

lipsync-docker

Runtime error

App Files Community

naicoi committed on Jan 29

Commit

d72ce9c

1 Parent(s): dac8faa

video quality

Browse files

Files changed (5) hide show

app.py +6 -1
audio_processing.py +14 -3
lipsync.py +19 -72
lipsync_processing.py +6 -13
processing.py +5 -2

app.py CHANGED Viewed

@@ -93,6 +93,11 @@ with gr.Blocks(css=css) as demo:
                     value="LatentSync v1.6",
                     label="Model",
                 )
                 lipsync_only_btn = gr.Button("👄 Lipsync", variant="primary", size="lg")
         with gr.Row():
@@ -115,7 +120,7 @@ with gr.Blocks(css=css) as demo:
     lipsync_only_btn.click(
         fn=lipsync_with_audio_target,
-        inputs=[video_input, audio_input, session_state, model_type],
         outputs=[
             final_video,
             video_normalized_output,

                     value="LatentSync v1.6",
                     label="Model",
                 )
+                quality_level = gr.Radio(
+                    choices=["Fast", "Normal", "Medium", "Best", "Super Best"],
+                    value="Normal",
+                    label="Quality",
+                )
                 lipsync_only_btn = gr.Button("👄 Lipsync", variant="primary", size="lg")
         with gr.Row():
     lipsync_only_btn.click(
         fn=lipsync_with_audio_target,
+        inputs=[video_input, audio_input, session_state, model_type, quality_level],
         outputs=[
             final_video,
             video_normalized_output,

audio_processing.py CHANGED Viewed

@@ -5,14 +5,18 @@ import subprocess
 from ffmpy import FFmpeg, FFRuntimeError
-def get_audio_duration(audio_path: str) -> float:
-    """Get audio file duration
     Args:
         audio_path: Path to audio file
     Returns:
         Duration in seconds
     """
     cmd = [
         "ffprobe",
@@ -25,7 +29,14 @@ def get_audio_duration(audio_path: str) -> float:
         audio_path,
     ]
     result = subprocess.run(cmd, capture_output=True, text=True, check=True)
-    return float(result.stdout.strip())
 # def prepare_target_audio(audio_path: str, output_dir: str) -> tuple:

 from ffmpy import FFmpeg, FFRuntimeError
+def get_audio_duration(audio_path: str, max_duration: float = 30.0) -> float:
+    """Get audio file duration, raise error if exceeds max_duration
     Args:
         audio_path: Path to audio file
+        max_duration: Maximum duration in seconds (default 30)
     Returns:
         Duration in seconds
+    Raises:
+        ValueError: If audio duration exceeds max_duration
     """
     cmd = [
         "ffprobe",
         audio_path,
     ]
     result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+    duration = float(result.stdout.strip())
+    if duration > max_duration:
+        raise ValueError(
+            f"Audio duration {duration:.2f}s exceeds maximum {max_duration}s"
+        )
+    return duration
 # def prepare_target_audio(audio_path: str, output_dir: str) -> tuple:

lipsync.py CHANGED Viewed

@@ -18,59 +18,29 @@ torch.backends.cudnn.deterministic = False
 os.makedirs("checkpoints", exist_ok=True)
-def get_gpu_memory_info():
-    """Get detailed GPU memory info"""
-    if not torch.cuda.is_available():
-        return "CUDA not available"
-    device = torch.cuda.current_device()
-    total = torch.cuda.get_device_properties(device).total_memory / 1024**3
-    allocated = torch.cuda.memory_allocated(device) / 1024**3
-    reserved = torch.cuda.memory_reserved(device) / 1024**3
-    free = total - reserved
-    return f"Total: {total:.2f}GB | Allocated: {allocated:.2f}GB | Reserved: {reserved:.2f}GB | Free: {free:.2f}GB"
-def get_available_vram():
-    """Get available VRAM in GB"""
-    if not torch.cuda.is_available():
-        return 0.0
-    device = torch.cuda.current_device()
-    total = torch.cuda.get_device_properties(device).total_memory / 1024**3
-    reserved = torch.cuda.memory_reserved(device) / 1024**3
-    free = total - reserved
-    return free
-def get_optimal_params(available_vram_gb: float) -> tuple:
-    """Get optimal lipsync parameters based on total VRAM
     Args:
-        available_vram_gb: Total VRAM in GB (actual available, not advertised)
     Returns:
-        tuple of (num_frames, num_inference_steps)
     """
-    if not available_vram_gb or available_vram_gb <= 0:
-        return 12, 20, 1.0
-    if available_vram_gb < 20.0:
-        return 12, 20, 1.0
-    elif available_vram_gb < 40.0:
-        return 16, 30, 1.5
-    elif available_vram_gb < 60.0:
-        return 20, 40, 2.0
-    elif available_vram_gb >= 60.0:
-        return 24, 50, 2.5
-    else:
-        return 16, 15, 1.5
 @spaces.GPU
-def apply_lipsync(video_input_path, audio_path, video_out_path, crop_size=256):
     print(f"\n{'=' * 60}")
     print(f"LIPSYNC START")
     print(f"Input video: {video_input_path}")
@@ -79,8 +49,6 @@ def apply_lipsync(video_input_path, audio_path, video_out_path, crop_size=256):
     print(f"Crop size: {crop_size}x{crop_size}")
     print(f"{'=' * 60}\n")
-    print(f"GPU Memory Before: {get_gpu_memory_info()}")
     manager = ModelManager.get_instance()
     config = manager.get_latentsync_config()
@@ -104,34 +72,18 @@ def apply_lipsync(video_input_path, audio_path, video_out_path, crop_size=256):
         if not torch.cuda.is_available():
             raise RuntimeError("CUDA not available - GPU required for lipsync")
-        total_memory = torch.cuda.get_device_properties(0).total_memory
-        print(f"Total GPU memory: {total_memory / 1024**3:.2f} GB")
-        available_vram = get_available_vram()
-        print(f"Available VRAM before processing: {available_vram:.2f} GB")
-        torch.cuda.empty_cache()
-        available_vram_after_clear = get_available_vram()
-        print(f"Available VRAM after cache clear: {available_vram_after_clear:.2f} GB")
-        print(
-            f"\nCalling get_optimal_params with input: {total_memory / 1024**3:.2f} GB"
-        )
-        num_frames, num_inference_steps, guidance_scale = get_optimal_params(
-            total_memory / 1024**3
-        )
-        print(
-            f"get_optimal_params output: num_frames={num_frames}, num_inference_steps={num_inference_steps}"
         )
-        print(f"\nParameters:")
         print(f"  num_frames: {num_frames}")
         print(f"  num_inference_steps: {num_inference_steps}")
         print(f"  guidance_scale: {guidance_scale}")
         print(f"  resolution: {config.data.resolution}")
         print(f"Initial seed: {torch.initial_seed()}")
-        print(f"GPU Memory After model load: {get_gpu_memory_info()}")
         print("\nStarting pipeline inference...")
         print(
@@ -154,16 +106,13 @@ def apply_lipsync(video_input_path, audio_path, video_out_path, crop_size=256):
                     height=crop_size,
                 )
             print("Pipeline completed successfully")
-            print(f"GPU Memory After pipeline: {get_gpu_memory_info()}")
         except RuntimeError as e:
             error_msg = str(e).lower()
             print(f"RuntimeError in pipeline: {e}")
             if "out of memory" in error_msg or "cuda out of memory" in error_msg:
                 print("GPU OOM DETECTED!")
-                print(f"GPU Memory at crash: {get_gpu_memory_info()}")
                 torch.cuda.empty_cache()
-                print(f"GPU Memory after OOM cleanup: {get_gpu_memory_info()}")
                 raise RuntimeError(
                     "GPU out of memory during lipsync. Try: 1) Shorter video 2) Lower resolution 3) Close other GPU apps"
                 )
@@ -172,13 +121,11 @@ def apply_lipsync(video_input_path, audio_path, video_out_path, crop_size=256):
             print(f"Unexpected error in pipeline: {e}")
             print(f"Error type: {type(e).__name__}")
             traceback.print_exc()
-            print(f"GPU Memory at error: {get_gpu_memory_info()}")
             raise
         finally:
             print("Clearing GPU cache...")
             torch.cuda.empty_cache()
             gc.collect()
-            print(f"GPU Memory After cleanup: {get_gpu_memory_info()}")
         print(f"\n{'=' * 60}")
         print(f"LIPSYNC SUCCESS - Output: {video_out_path}")

 os.makedirs("checkpoints", exist_ok=True)
+def get_quality_params(level: str) -> tuple:
+    """Get lipsync parameters based on quality level
     Args:
+        level: Quality level (Fast, Normal, Medium, Best, Super Best)
     Returns:
+        tuple of (num_frames, num_inference_steps, guidance_scale)
     """
+    params = {
+        "Fast": (12, 15, 1.0),
+        "Normal": (12, 20, 1.0),
+        "Medium": (16, 30, 1.5),
+        "Best": (20, 40, 2.0),
+        "Super Best": (24, 50, 2.5),
+    }
+    return params.get(level, (12, 20, 1.0))
 @spaces.GPU
+def apply_lipsync(
+    video_input_path, audio_path, video_out_path, crop_size=256, quality_level="Normal"
+):
     print(f"\n{'=' * 60}")
     print(f"LIPSYNC START")
     print(f"Input video: {video_input_path}")
     print(f"Crop size: {crop_size}x{crop_size}")
     print(f"{'=' * 60}\n")
     manager = ModelManager.get_instance()
     config = manager.get_latentsync_config()
         if not torch.cuda.is_available():
             raise RuntimeError("CUDA not available - GPU required for lipsync")
+        num_frames, num_inference_steps, guidance_scale = get_quality_params(
+            quality_level
         )
+        print(f"\nQuality level: {quality_level}")
+        print(f"Parameters:")
         print(f"  num_frames: {num_frames}")
         print(f"  num_inference_steps: {num_inference_steps}")
         print(f"  guidance_scale: {guidance_scale}")
         print(f"  resolution: {config.data.resolution}")
         print(f"Initial seed: {torch.initial_seed()}")
         print("\nStarting pipeline inference...")
         print(
                     height=crop_size,
                 )
             print("Pipeline completed successfully")
         except RuntimeError as e:
             error_msg = str(e).lower()
             print(f"RuntimeError in pipeline: {e}")
             if "out of memory" in error_msg or "cuda out of memory" in error_msg:
                 print("GPU OOM DETECTED!")
                 torch.cuda.empty_cache()
                 raise RuntimeError(
                     "GPU out of memory during lipsync. Try: 1) Shorter video 2) Lower resolution 3) Close other GPU apps"
                 )
             print(f"Unexpected error in pipeline: {e}")
             print(f"Error type: {type(e).__name__}")
             traceback.print_exc()
             raise
         finally:
             print("Clearing GPU cache...")
             torch.cuda.empty_cache()
             gc.collect()
         print(f"\n{'=' * 60}")
         print(f"LIPSYNC SUCCESS - Output: {video_out_path}")

lipsync_processing.py CHANGED Viewed

@@ -49,6 +49,7 @@ def apply_lipsync_to_video(
     audio_16k_path: str,
     output_dir: str,
     model_type: str = "LatentSync v1.6",
 ) -> tuple:
     """Apply lipsync to video using clean 16k audio
@@ -57,6 +58,7 @@ def apply_lipsync_to_video(
         audio_16k_path: Path to 16kHz audio
         output_dir: Directory to save output
         model_type: Model type for lipsync ("LatentSync v1.6" or "MuseTalk v1.5")
     Returns:
         Tuple of (lipsynced_video_path, video_info)
@@ -67,9 +69,11 @@ def apply_lipsync_to_video(
         if model_type == "LatentSync v1.6":
             crop_size = 512
             print(
-                f"Using LatentSync: video={video_path}, audio={audio_16k_path}, crop_size={crop_size}"
             )
-            apply_lipsync(video_path, audio_16k_path, lipsynced_video, crop_size)
         elif model_type == "MuseTalk v1.5":
             from musetalk import apply_musetalk_lipsync
@@ -101,14 +105,3 @@ def apply_lipsync_to_video(
         print(f"Runtime Error in lipsync processing: {e}")
         traceback.print_exc()
         raise
-    except Exception:
-        raise
-    except Exception as e:
-        print(f"Error in apply_lipsync_to_video: {e}")
-        traceback.print_exc()
-        raise
-    except Exception as e:
-        print(f"Error in apply_lipsync_to_video: {e}")
-        traceback.print_exc()
-        raise

     audio_16k_path: str,
     output_dir: str,
     model_type: str = "LatentSync v1.6",
+    quality_level: str = "Normal",
 ) -> tuple:
     """Apply lipsync to video using clean 16k audio
         audio_16k_path: Path to 16kHz audio
         output_dir: Directory to save output
         model_type: Model type for lipsync ("LatentSync v1.6" or "MuseTalk v1.5")
+        quality_level: Quality level ("Fast", "Normal", "Medium", "Best", "Super Best")
     Returns:
         Tuple of (lipsynced_video_path, video_info)
         if model_type == "LatentSync v1.6":
             crop_size = 512
             print(
+                f"Using LatentSync: video={video_path}, audio={audio_16k_path}, crop_size={crop_size}, quality={quality_level}"
+            )
+            apply_lipsync(
+                video_path, audio_16k_path, lipsynced_video, crop_size, quality_level
             )
         elif model_type == "MuseTalk v1.5":
             from musetalk import apply_musetalk_lipsync
         print(f"Runtime Error in lipsync processing: {e}")
         traceback.print_exc()
         raise

processing.py CHANGED Viewed

@@ -321,6 +321,7 @@ def process_lipsync_with_audio_target_new(
     audio_file,
     session_id=None,
     model_type="latentsync",
     progress=gr.Progress(track_tqdm=True),
 ):
     """Workflow mới: Chuẩn hóa YouTube rồi lipsync
@@ -338,6 +339,7 @@ def process_lipsync_with_audio_target_new(
         audio_file: Path to audio target (English only)
         session_id: Session identifier
         model_type: Model type for lipsync ("latentsync" or "musetalk")
         progress: Progress tracking object
     Returns:
@@ -427,7 +429,7 @@ def process_lipsync_with_audio_target_new(
         with timer("Applying lipsync"):
             try:
                 lipsynced_video, lipsynced_info = apply_lipsync_to_video(
-                    video_normalized, audio_16k, output_dir, model_type
                 )
                 logger.info(
                     f"Lipsynced video: {lipsynced_video}, size: {lipsynced_info['width']}x{lipsynced_info['height']}"
@@ -471,6 +473,7 @@ def lipsync_with_audio_target(
     audio_file,
     session_id=None,
     model_type="LatentSync v1.6",
     progress=gr.Progress(track_tqdm=True),
 ):
     """Wrapper for Gradio: Lipsync video source with audio target (English only)
@@ -483,5 +486,5 @@ def lipsync_with_audio_target(
     if audio_file is None:
         raise gr.Error("Please upload a target audio.")
     return process_lipsync_with_audio_target_new(
-        video_file, audio_file, session_id, model_type, progress
     )

     audio_file,
     session_id=None,
     model_type="latentsync",
+    quality_level="Normal",
     progress=gr.Progress(track_tqdm=True),
 ):
     """Workflow mới: Chuẩn hóa YouTube rồi lipsync
         audio_file: Path to audio target (English only)
         session_id: Session identifier
         model_type: Model type for lipsync ("latentsync" or "musetalk")
+        quality_level: Quality level ("Fast", "Normal", "Medium", "Best", "Super Best")
         progress: Progress tracking object
     Returns:
         with timer("Applying lipsync"):
             try:
                 lipsynced_video, lipsynced_info = apply_lipsync_to_video(
+                    video_normalized, audio_16k, output_dir, model_type, quality_level
                 )
                 logger.info(
                     f"Lipsynced video: {lipsynced_video}, size: {lipsynced_info['width']}x{lipsynced_info['height']}"
     audio_file,
     session_id=None,
     model_type="LatentSync v1.6",
+    quality_level="Normal",
     progress=gr.Progress(track_tqdm=True),
 ):
     """Wrapper for Gradio: Lipsync video source with audio target (English only)
     if audio_file is None:
         raise gr.Error("Please upload a target audio.")
     return process_lipsync_with_audio_target_new(
+        video_file, audio_file, session_id, model_type, quality_level, progress
     )