lipsync-docker / lipsync.py
naicoi's picture
model-dirs (#2)
f5651ba
import gc
import os
import time
import traceback
import torch
from DeepCache import DeepCacheSDHelper
from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
from shared.model_manager import ModelManager
from config import MODELS_DIR
# Let cuDNN benchmark candidate convolution algorithms and keep the fastest
# one; this trades bit-exact reproducibility for throughput, hence
# deterministic mode is switched off explicitly.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
# Create the model directory at import time so later code can rely on it
# existing (no error if it is already there).
os.makedirs(MODELS_DIR, exist_ok=True)
def get_quality_params(level: str) -> tuple:
    """Look up the lipsync inference settings for a named quality preset.

    Args:
        level: Preset name. Recognised values are "Fast", "Normal",
            "Medium", "Best" and "Super Best"; the match is exact and
            case-sensitive.

    Returns:
        A ``(num_frames, num_inference_steps, guidance_scale)`` tuple.
        Any name that is not a known preset yields ``(12, 20, 1.0)``.
    """
    # Settings used when the caller passes an unknown preset name.
    fallback = (12, 20, 1.0)

    # preset name -> (num_frames, num_inference_steps, guidance_scale)
    presets = {
        "Fast": (12, 15, 1.0),
        "Normal": (12, 20, 1.5),
        "Medium": (16, 30, 1.5),
        "Best": (20, 40, 1.5),
        "Super Best": (24, 50, 1.5),
    }

    try:
        return presets[level]
    except KeyError:
        return fallback
def apply_lipsync(
    video_input_path, audio_path, video_out_path, crop_size=256, quality_level="Normal"
):
    """Lip-sync a video to an audio track with the LatentSync pipeline on the GPU.

    Loads the VAE, Whisper audio encoder, UNet and scheduler through the
    shared ``ModelManager``, builds a ``LipsyncPipeline`` on CUDA with
    DeepCache enabled, and runs inference in float16.

    Args:
        video_input_path: Path of the source video handed to the pipeline.
        audio_path: Path of the driving audio handed to the pipeline.
        video_out_path: Path the pipeline should write the result to. The
            companion mask video path is derived from it by inserting
            ``_mask`` before the file extension.
        crop_size: Passed to the pipeline as both ``width`` and ``height``.
        quality_level: Preset name resolved by ``get_quality_params``.

    Returns:
        ``video_out_path``, once the output file has been confirmed to exist.

    Raises:
        RuntimeError: If CUDA is not available, if the GPU runs out of memory
            during inference (re-raised with a user-facing hint, chained to
            the original error), or if the pipeline finished without
            producing the output file.
        Exception: Any other error from model loading or inference is logged
            and re-raised unchanged.
    """
    print(f"\n{'=' * 60}")
    print("LIPSYNC START")
    print(f"Input video: {video_input_path}")
    print(f"Input audio: {audio_path}")
    print(f"Output video: {video_out_path}")
    print(f"Crop size: {crop_size}x{crop_size}")
    print(f"{'=' * 60}\n")

    # Only set once DeepCache has successfully wrapped the pipeline, so the
    # cleanup below never calls disable() on a helper that was not enabled.
    deepcache_helper = None

    try:
        # Fail fast with a clear message before anything is moved to "cuda".
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA not available - GPU required for lipsync")

        manager = ModelManager.get_instance()
        config = manager.get_latentsync_config()
        vae = manager.load_vae()
        audio_encoder = manager.load_whisper_encoder(
            manager.get_whisper_model_path(config.model.cross_attention_dim),
            "cuda",
            config.data.num_frames,
        )
        unet = manager.load_latentsync_unet()
        scheduler = manager.get_scheduler()

        pipeline = LipsyncPipeline(
            vae=vae,
            audio_encoder=audio_encoder,
            unet=unet,
            scheduler=scheduler,
        ).to("cuda")

        print("Enabling DeepCache (cache_interval=3, cache_branch_id=0)...")
        helper = DeepCacheSDHelper(pipe=pipeline)
        helper.set_params(cache_interval=3, cache_branch_id=0)
        helper.enable()
        deepcache_helper = helper

        num_frames, num_inference_steps, guidance_scale = get_quality_params(
            quality_level
        )
        print(f"\nQuality level: {quality_level}")
        print("Parameters:")
        print(f" num_frames: {num_frames}")
        print(f" num_inference_steps: {num_inference_steps}")
        print(f" guidance_scale: {guidance_scale}")
        print(f" resolution: {config.data.resolution}")
        print(f"Initial seed: {torch.initial_seed()}")
        print("\nStarting pipeline inference...")
        print(
            f"Parameters: num_frames={num_frames}, num_inference_steps={num_inference_steps}, "
            f"guidance_scale={guidance_scale}, size={crop_size}x{crop_size}"
        )

        # Derive the mask path from the real extension. A plain
        # str.replace(".mp4", ...) would return the output path itself for
        # e.g. ".MP4"/".mov" and would also rewrite ".mp4" inside directory
        # names.
        out_root, out_ext = os.path.splitext(video_out_path)
        video_mask_path = f"{out_root}_mask{out_ext or '.mp4'}"

        try:
            with torch.no_grad():
                pipeline(
                    video_path=video_input_path,
                    audio_path=audio_path,
                    video_out_path=video_out_path,
                    video_mask_path=video_mask_path,
                    num_frames=num_frames,
                    num_inference_steps=num_inference_steps,
                    guidance_scale=guidance_scale,
                    weight_dtype=torch.float16,
                    width=crop_size,
                    height=crop_size,
                )
            print("Pipeline completed successfully")
        except RuntimeError as e:
            print(f"RuntimeError in pipeline: {e}")
            if "out of memory" in str(e).lower():
                print("GPU OOM DETECTED!")
                # The cache itself is released in the ``finally`` below.
                raise RuntimeError(
                    "GPU out of memory during lipsync. Try: 1) Shorter video 2) Lower resolution 3) Close other GPU apps"
                ) from e
            raise
        except Exception as e:
            # The traceback is printed once, by the outer handler.
            print(f"Unexpected error in pipeline: {e}")
            print(f"Error type: {type(e).__name__}")
            raise
        finally:
            print("Clearing GPU cache...")
            torch.cuda.empty_cache()
            gc.collect()

        # NOTE(review): presumably gives the video writer time to finish
        # flushing the file before the existence check - confirm.
        time.sleep(1)
        if not os.path.exists(video_out_path):
            raise RuntimeError(
                f"Pipeline succeeded but output file not created: {video_out_path}"
            )

        # Report success only after the output file has been verified.
        print(f"\n{'=' * 60}")
        print(f"LIPSYNC SUCCESS - Output: {video_out_path}")
        print(f"{'=' * 60}\n")
        return video_out_path
    except Exception as e:
        print(f"\n{'=' * 60}")
        print(f"LIPSYNC FAILED: {type(e).__name__}")
        print(f"Error: {e}")
        print(f"{'=' * 60}\n")
        traceback.print_exc()
        raise
    finally:
        # Undo the DeepCache wrapping. The models come from the ModelManager
        # singleton and are presumably shared across calls, so leaving the
        # wrapper in place would stack another one on the next invocation.
        if deepcache_helper is not None:
            deepcache_helper.disable()