# Eji-Sensei14's picture
# Upload folder using huggingface_hub
# c6535db verified
# At the top of inference.py, make sure you have these imports:
import argparse
import os
from omegaconf import OmegaConf
import torch
from diffusers import AutoencoderKL, DDIMScheduler
from latentsync.models.unet import UNet3DConditionModel
from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
from accelerate.utils import set_seed
from latentsync.whisper.audio2feature import Audio2Feature
from DeepCache import DeepCacheSDHelper
def _resolve_base_dir(args):
    """Return the directory under which the ``checkpoints`` folder is searched.

    Uses ``args.extension_dir`` when the attribute is present; otherwise falls
    back to the parent of the directory that contains this script.
    """
    if hasattr(args, "extension_dir"):
        return args.extension_dir
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Go up one level from scripts/ to the extension root.
    return os.path.dirname(script_dir)


def _load_local_vae(vae_locations, dtype):
    """Return the first VAE that loads from ``vae_locations``, or ``None``.

    Candidates are tried in order. A ``.safetensors`` path is loaded as a
    single file, a directory is loaded with ``local_files_only=True``. A
    candidate that does not exist, or that fails to load, is skipped.
    """
    for vae_path in vae_locations:
        if not os.path.exists(vae_path):
            continue
        vae = None
        try:
            if vae_path.endswith(".safetensors"):
                print(f"Attempting to load VAE from safetensors file: {vae_path}")
                vae = AutoencoderKL.from_single_file(vae_path, torch_dtype=dtype)
            elif os.path.isdir(vae_path):
                print(f"Attempting to load VAE from directory: {vae_path}")
                vae = AutoencoderKL.from_pretrained(vae_path, torch_dtype=dtype, local_files_only=True)
        except Exception as e:
            # Deliberate best-effort: report the failure and try the next
            # candidate location instead of aborting the run.
            print(f"Failed to load VAE from {vae_path}: {str(e)}")
            continue
        if vae is not None:
            print(f"✓ Successfully loaded VAE from: {vae_path}")
            return vae
    return None


def _build_default_vae(dtype):
    """Construct an ``AutoencoderKL`` from a hard-coded configuration.

    NOTE(review): no checkpoint is loaded in this branch, so the model only
    carries whatever weights the constructor initializes. Confirm whether
    this fallback should fail fast instead of continuing.
    """
    return AutoencoderKL(
        in_channels=3,
        out_channels=3,
        down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"],
        up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
        block_out_channels=[128, 256, 512, 512],
        layers_per_block=2,
        act_fn="silu",
        latent_channels=4,
        norm_num_groups=32,
        sample_size=512,
    ).to(dtype=dtype)


def _mask_output_path(video_out_path):
    """Return the mask video path: ``<root>_mask<ext>`` next to the output.

    Only the final extension is touched, so the result always differs from
    ``video_out_path`` (a plain ``str.replace(".mp4", ...)`` returns the
    input unchanged when it has no ``.mp4`` and also rewrites directory
    names containing ``.mp4``). ``.mp4`` is used when there is no extension.
    """
    root, ext = os.path.splitext(video_out_path)
    return f"{root}_mask{ext or '.mp4'}"


def main(config, args):
    """Run lip-sync inference for one video/audio pair and write the result.

    Args:
        config: Configuration object. Reads ``config.model`` (including
            ``cross_attention_dim``) and ``config.data.num_frames``,
            ``audio_feat_length``, ``resolution`` and ``mask_image_path``.
        args: Namespace providing ``video_path``, ``audio_path``,
            ``inference_ckpt_path``, ``video_out_path``, ``inference_steps``,
            ``guidance_scale`` and ``seed``; ``extension_dir`` is optional.

    Raises:
        RuntimeError: If the input video or audio path does not exist.
        NotImplementedError: If ``config.model.cross_attention_dim`` is
            neither 768 nor 384.
    """
    if not os.path.exists(args.video_path):
        raise RuntimeError(f"Video path '{args.video_path}' not found")
    if not os.path.exists(args.audio_path):
        raise RuntimeError(f"Audio path '{args.audio_path}' not found")

    # float16 is used only when CUDA is available and the device's major
    # compute capability is greater than 7; otherwise float32.
    is_fp16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] > 7
    dtype = torch.float16 if is_fp16_supported else torch.float32

    print(f"Input video path: {args.video_path}")
    print(f"Input audio path: {args.audio_path}")
    print(f"Loaded checkpoint path: {args.inference_ckpt_path}")

    # The scheduler is built from explicit values rather than fetched from a
    # remote configuration, so this step needs no network access.
    scheduler = DDIMScheduler(
        beta_end=0.012,
        beta_schedule="scaled_linear",
        beta_start=0.00085,
        clip_sample=False,
        num_train_timesteps=1000,
        prediction_type="epsilon",
        set_alpha_to_one=False,
        steps_offset=1,
    )
    print("✓ Created DDIMScheduler directly (fully offline)")

    # The whisper model size is selected from the UNet cross-attention width.
    if config.model.cross_attention_dim == 768:
        whisper_model_name = "small"
    elif config.model.cross_attention_dim == 384:
        whisper_model_name = "tiny"
    else:
        raise NotImplementedError("cross_attention_dim must be 768 or 384")

    audio_encoder = Audio2Feature(
        model_path=whisper_model_name,
        device="cuda",
        num_frames=config.data.num_frames,
        audio_feat_length=config.data.audio_feat_length,
    )

    # Candidate VAE locations, in order of preference.
    base_dir = _resolve_base_dir(args)
    vae_locations = [
        # New vae folder structure
        os.path.join(base_dir, "checkpoints", "vae", "sd-vae-ft-mse.safetensors"),
        os.path.join(base_dir, "checkpoints", "vae"),  # Directory with config.json
        # Original locations
        os.path.join(base_dir, "checkpoints", "sd-vae-ft-mse.safetensors"),
        os.path.join(base_dir, "checkpoints", "sd-vae-ft-mse"),
    ]
    vae = _load_local_vae(vae_locations, dtype)
    if vae is None:
        print("Local VAE not found in any location, creating VAE with standard configuration")
        print(f"Searched locations: {vae_locations}")
        vae = _build_default_vae(dtype)
        print("⚠️ Using default VAE configuration - consider downloading VAE model locally for better results")

    # Override the VAE config values regardless of how the VAE was obtained.
    # NOTE(review): presumably read by LipsyncPipeline for latent scaling - confirm.
    vae.config.scaling_factor = 0.18215
    vae.config.shift_factor = 0

    # The UNet is loaded on the CPU first, then cast; the pipeline moves
    # everything to CUDA afterwards.
    unet, _ = UNet3DConditionModel.from_pretrained(
        OmegaConf.to_container(config.model),
        args.inference_ckpt_path,
        device="cpu",
    )
    unet = unet.to(dtype=dtype)

    pipeline = LipsyncPipeline(
        vae=vae,
        audio_encoder=audio_encoder,
        unet=unet,
        scheduler=scheduler,
    ).to("cuda")

    # use DeepCache
    helper = DeepCacheSDHelper(pipe=pipeline)
    helper.set_params(cache_interval=3, cache_branch_id=0)
    helper.enable()

    # A seed of -1 requests a fresh non-deterministic seed.
    if args.seed != -1:
        set_seed(args.seed)
    else:
        torch.seed()
    print(f"Initial seed: {torch.initial_seed()}")

    pipeline(
        video_path=args.video_path,
        audio_path=args.audio_path,
        video_out_path=args.video_out_path,
        video_mask_path=_mask_output_path(args.video_out_path),
        num_frames=config.data.num_frames,
        num_inference_steps=args.inference_steps,
        guidance_scale=args.guidance_scale,
        weight_dtype=dtype,
        width=config.data.resolution,
        height=config.data.resolution,
        mask_image_path=config.data.mask_image_path,
    )
if __name__ == "__main__":
    # Command-line entry point: declare the options, load the UNet
    # configuration named by --unet_config_path, and run inference.
    option_specs = (
        ("--unet_config_path", {"type": str, "default": "configs/unet.yaml"}),
        ("--inference_ckpt_path", {"type": str, "required": True}),
        ("--video_path", {"type": str, "required": True}),
        ("--audio_path", {"type": str, "required": True}),
        ("--video_out_path", {"type": str, "required": True}),
        ("--inference_steps", {"type": int, "default": 20}),
        ("--guidance_scale", {"type": float, "default": 1.0}),
        # A seed of -1 makes main() pick a fresh random seed.
        ("--seed", {"type": int, "default": 1247}),
    )
    parser = argparse.ArgumentParser()
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    args = parser.parse_args()
    config = OmegaConf.load(args.unet_config_path)
    main(config, args)