Spaces:

naicoi
/

lipsync-docker

Runtime error

App Files Files Community

lipsync-docker / scripts /train_unet.py

naicoi

model-dirs (#2)

f5651ba 4 months ago

raw

history blame contribute delete

23.9 kB

	# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import os
	import sys
	import math
	import argparse
	import shutil
	import datetime
	import logging

	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	from config import MODELS_DIR
	from omegaconf import OmegaConf

	from tqdm.auto import tqdm
	from einops import rearrange

	import torch
	import torch.nn.functional as F
	import torch.nn as nn
	import torch.distributed as dist
	from torch.utils.data.distributed import DistributedSampler
	from torch.nn.parallel import DistributedDataParallel as DDP

	import diffusers
	from diffusers import AutoencoderKL, DDIMScheduler
	from diffusers.utils.logging import get_logger
	from diffusers.optimization import get_scheduler
	from accelerate.utils import set_seed

	from latentsync.data.unet_dataset import UNetDataset
	from latentsync.models.unet import UNet3DConditionModel
	from latentsync.models.stable_syncnet import StableSyncNet
	from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
	from latentsync.utils.util import (
	init_dist,
	cosine_loss,
	one_step_sampling,
	)
	from latentsync.utils.util import plot_loss_chart
	from latentsync.whisper.audio2feature import Audio2Feature
	from latentsync.trepa.loss import TREPALoss
	from eval.syncnet import SyncNetEval
	from eval.syncnet_detect import SyncNetDetector
	from eval.eval_sync_conf import syncnet_eval
	import lpips


	logger = get_logger(__name__)


	def main(config):
	# Initialize distributed training
	local_rank = init_dist()
	global_rank = dist.get_rank()
	num_processes = dist.get_world_size()
	is_main_process = global_rank == 0

	seed = config.run.seed + global_rank
	set_seed(seed)

	# Logging folder
	folder_name = "train" + datetime.datetime.now().strftime(f"-%Y_%m_%d-%H:%M:%S")
	output_dir = os.path.join(config.data.train_output_dir, folder_name)

	# Make one log on every process with the configuration for debugging.
	logging.basicConfig(
	format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
	datefmt="%m/%d/%Y %H:%M:%S",
	level=logging.INFO,
	)

	# Handle the output folder creation
	if is_main_process:
	diffusers.utils.logging.set_verbosity_info()
	os.makedirs(output_dir, exist_ok=True)
	os.makedirs(f"{output_dir}/checkpoints", exist_ok=True)
	os.makedirs(f"{output_dir}/val_videos", exist_ok=True)
	os.makedirs(f"{output_dir}/sync_conf_results", exist_ok=True)
	shutil.copy(config.unet_config_path, output_dir)
	shutil.copy(config.data.syncnet_config_path, output_dir)

	device = torch.device(local_rank)

	noise_scheduler = DDIMScheduler.from_pretrained("configs")

	vae = AutoencoderKL.from_pretrained(
	"stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16, cache_dir=MODELS_DIR
	)
	vae.config.scaling_factor = 0.18215
	vae.config.shift_factor = 0

	vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
	vae.requires_grad_(False)
	vae.to(device)

	if config.run.pixel_space_supervise:
	vae.enable_gradient_checkpointing()

	syncnet_eval_model = SyncNetEval(device=device)
	syncnet_eval_model.loadParameters("checkpoints/auxiliary/syncnet_v2.model")

	syncnet_detector = SyncNetDetector(
	device=device, detect_results_dir="detect_results"
	)

	if config.model.cross_attention_dim == 768:
	whisper_model_path = "small"
	elif config.model.cross_attention_dim == 384:
	whisper_model_path = "tiny"
	else:
	raise NotImplementedError("cross_attention_dim must be 768 or 384")

	audio_encoder = Audio2Feature(
	model_path=whisper_model_path,
	device=device,
	audio_embeds_cache_dir=config.data.audio_embeds_cache_dir,
	num_frames=config.data.num_frames,
	audio_feat_length=config.data.audio_feat_length,
	)

	unet, resume_global_step = UNet3DConditionModel.from_pretrained(
	OmegaConf.to_container(config.model),
	config.ckpt.resume_ckpt_path,
	device=device,
	)

	if config.model.add_audio_layer and config.run.use_syncnet:
	syncnet_config = OmegaConf.load(config.data.syncnet_config_path)
	if syncnet_config.ckpt.inference_ckpt_path == "":
	raise ValueError("SyncNet path is not provided")
	syncnet = StableSyncNet(
	OmegaConf.to_container(syncnet_config.model), gradient_checkpointing=True
	).to(device=device, dtype=torch.float16)
	syncnet_checkpoint = torch.load(
	syncnet_config.ckpt.inference_ckpt_path,
	map_location=device,
	weights_only=True,
	)
	syncnet.load_state_dict(syncnet_checkpoint["state_dict"])
	syncnet.requires_grad_(False)

	del syncnet_checkpoint
	torch.cuda.empty_cache()

	if config.model.use_motion_module:
	unet.requires_grad_(False)
	for name, param in unet.named_parameters():
	for trainable_module_name in config.run.trainable_modules:
	if trainable_module_name in name:
	param.requires_grad = True
	break
	trainable_params = list(filter(lambda p: p.requires_grad, unet.parameters()))
	else:
	unet.requires_grad_(True)
	trainable_params = list(unet.parameters())

	if config.optimizer.scale_lr:
	config.optimizer.lr = config.optimizer.lr * num_processes

	optimizer = torch.optim.AdamW(trainable_params, lr=config.optimizer.lr)

	if is_main_process:
	logger.info(f"trainable params number: {len(trainable_params)}")
	logger.info(
	f"trainable params scale: {sum(p.numel() for p in trainable_params) / 1e6:.3f} M"
	)

	# Enable gradient checkpointing
	if config.run.enable_gradient_checkpointing:
	unet.enable_gradient_checkpointing()

	# Get the training dataset
	train_dataset = UNetDataset(config.data.train_data_dir, config)
	distributed_sampler = DistributedSampler(
	train_dataset,
	num_replicas=num_processes,
	rank=global_rank,
	shuffle=True,
	seed=config.run.seed,
	)

	# DataLoaders creation:
	train_dataloader = torch.utils.data.DataLoader(
	train_dataset,
	batch_size=config.data.batch_size,
	shuffle=False,
	sampler=distributed_sampler,
	num_workers=config.data.num_workers,
	pin_memory=False,
	drop_last=True,
	worker_init_fn=train_dataset.worker_init_fn,
	)

	# Get the training iteration
	if config.run.max_train_steps == -1:
	assert config.run.max_train_epochs != -1
	config.run.max_train_steps = config.run.max_train_epochs * len(train_dataloader)

	# Scheduler
	lr_scheduler = get_scheduler(
	config.optimizer.lr_scheduler,
	optimizer=optimizer,
	num_warmup_steps=config.optimizer.lr_warmup_steps,
	num_training_steps=config.run.max_train_steps,
	)

	if config.run.perceptual_loss_weight != 0 and config.run.pixel_space_supervise:
	lpips_loss_func = lpips.LPIPS(net="vgg").to(device)

	if config.run.trepa_loss_weight != 0 and config.run.pixel_space_supervise:
	trepa_loss_func = TREPALoss(device=device, with_cp=True)

	# Validation pipeline
	pipeline = LipsyncPipeline(
	vae=vae,
	audio_encoder=audio_encoder,
	unet=unet,
	scheduler=noise_scheduler,
	).to(device)
	pipeline.set_progress_bar_config(disable=True)

	# DDP warpper
	unet = DDP(unet, device_ids=[local_rank], output_device=local_rank)

	# We need to recalculate our total training steps as the size of the training dataloader may have changed.
	num_update_steps_per_epoch = math.ceil(len(train_dataloader))
	# Afterwards we recalculate our number of training epochs
	num_train_epochs = math.ceil(
	config.run.max_train_steps / num_update_steps_per_epoch
	)

	# Train!
	total_batch_size = config.data.batch_size * num_processes

	if is_main_process:
	logger.info("*** Running training ***")
	logger.info(f" Num examples = {len(train_dataset)}")
	logger.info(f" Num Epochs = {num_train_epochs}")
	logger.info(f" Instantaneous batch size per device = {config.data.batch_size}")
	logger.info(
	f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
	)
	logger.info(f" Total optimization steps = {config.run.max_train_steps}")
	global_step = resume_global_step
	first_epoch = resume_global_step // num_update_steps_per_epoch

	# Only show the progress bar once on each machine.
	progress_bar = tqdm(
	range(0, config.run.max_train_steps),
	initial=resume_global_step,
	desc="Steps",
	disable=not is_main_process,
	)

	train_step_list = []
	val_step_list = []
	sync_conf_list = []

	# Support mixed-precision training
	scaler = (
	torch.amp.GradScaler("cuda") if config.run.mixed_precision_training else None
	)

	for epoch in range(first_epoch, num_train_epochs):
	train_dataloader.sampler.set_epoch(epoch)
	unet.train()

	for step, batch in enumerate(train_dataloader):
	### >>>> Training >>>> ###

	if config.model.add_audio_layer:
	if batch["mel"] != []:
	mel = batch["mel"].to(device, dtype=torch.float16)

	audio_embeds_list = []
	try:
	for idx in range(len(batch["video_path"])):
	video_path = batch["video_path"][idx]
	start_idx = batch["start_idx"][idx]

	with torch.no_grad():
	audio_feat = audio_encoder.audio2feat(video_path)
	audio_embeds = audio_encoder.crop_overlap_audio_window(
	audio_feat, start_idx
	)
	audio_embeds_list.append(audio_embeds)
	except Exception as e:
	logger.info(f"{type(e).__name__} - {e} - {video_path}")
	continue
	audio_embeds = torch.stack(audio_embeds_list) # (B, 16, 50, 384)
	audio_embeds = audio_embeds.to(device, dtype=torch.float16)
	else:
	audio_embeds = None

	# Convert videos to latent space
	gt_pixel_values = batch["gt_pixel_values"].to(device, dtype=torch.float16)
	masked_pixel_values = batch["masked_pixel_values"].to(
	device, dtype=torch.float16
	)
	masks = batch["masks"].to(device, dtype=torch.float16)
	ref_pixel_values = batch["ref_pixel_values"].to(device, dtype=torch.float16)

	gt_pixel_values = rearrange(gt_pixel_values, "b f c h w -> (b f) c h w")
	masked_pixel_values = rearrange(
	masked_pixel_values, "b f c h w -> (b f) c h w"
	)
	masks = rearrange(masks, "b f c h w -> (b f) c h w")
	ref_pixel_values = rearrange(ref_pixel_values, "b f c h w -> (b f) c h w")

	with torch.no_grad():
	gt_latents = vae.encode(gt_pixel_values).latent_dist.sample()
	masked_latents = vae.encode(masked_pixel_values).latent_dist.sample()
	ref_latents = vae.encode(ref_pixel_values).latent_dist.sample()

	masks = torch.nn.functional.interpolate(
	masks, size=config.data.resolution // vae_scale_factor
	)

	gt_latents = (
	rearrange(
	gt_latents, "(b f) c h w -> b c f h w", f=config.data.num_frames
	)
	- vae.config.shift_factor
	) * vae.config.scaling_factor
	masked_latents = (
	rearrange(
	masked_latents, "(b f) c h w -> b c f h w", f=config.data.num_frames
	)
	- vae.config.shift_factor
	) * vae.config.scaling_factor
	ref_latents = (
	rearrange(
	ref_latents, "(b f) c h w -> b c f h w", f=config.data.num_frames
	)
	- vae.config.shift_factor
	) * vae.config.scaling_factor
	masks = rearrange(
	masks, "(b f) c h w -> b c f h w", f=config.data.num_frames
	)

	# Sample noise that we'll add to the latents
	if config.run.use_mixed_noise:
	# Refer to the paper: https://arxiv.org/abs/2305.10474
	noise_shared_std_dev = (
	config.run.mixed_noise_alpha**2
	/ (1 + config.run.mixed_noise_alpha**2)
	) ** 0.5
	noise_shared = torch.randn_like(gt_latents) * noise_shared_std_dev
	noise_shared = noise_shared[:, :, 0:1].repeat(
	1, 1, config.data.num_frames, 1, 1
	)

	noise_ind_std_dev = (1 / (1 + config.run.mixed_noise_alpha2)) 0.5
	noise_ind = torch.randn_like(gt_latents) * noise_ind_std_dev
	noise = noise_ind + noise_shared
	else:
	noise = torch.randn_like(gt_latents)
	noise = noise[
	:, :, 0:1
	].repeat(
	1, 1, config.data.num_frames, 1, 1
	) # Using the same noise for all frames, refer to the paper: https://arxiv.org/abs/2308.09716

	bsz = gt_latents.shape[0]

	# Sample a random timestep for each video
	timesteps = torch.randint(
	0,
	noise_scheduler.config.num_train_timesteps,
	(bsz,),
	device=gt_latents.device,
	)
	timesteps = timesteps.long()

	# Add noise to the latents according to the noise magnitude at each timestep
	# (this is the forward diffusion process)
	noisy_gt_latents = noise_scheduler.add_noise(gt_latents, noise, timesteps)

	# Get the target for loss depending on the prediction type
	if noise_scheduler.config.prediction_type == "epsilon":
	target = noise
	elif noise_scheduler.config.prediction_type == "v_prediction":
	raise NotImplementedError
	else:
	raise ValueError(
	f"Unknown prediction type {noise_scheduler.config.prediction_type}"
	)

	unet_input = torch.cat(
	[noisy_gt_latents, masks, masked_latents, ref_latents], dim=1
	)

	# Predict the noise and compute loss
	# Mixed-precision training
	with torch.autocast(
	device_type="cuda",
	dtype=torch.float16,
	enabled=config.run.mixed_precision_training,
	):
	pred_noise = unet(
	unet_input, timesteps, encoder_hidden_states=audio_embeds
	).sample

	if config.run.recon_loss_weight != 0:
	recon_loss = F.mse_loss(
	pred_noise.float(), target.float(), reduction="mean"
	)
	else:
	recon_loss = 0

	pred_latents = one_step_sampling(
	noise_scheduler, pred_noise, timesteps, noisy_gt_latents
	)

	if config.run.pixel_space_supervise:
	pred_pixel_values = vae.decode(
	rearrange(pred_latents, "b c f h w -> (b f) c h w")
	/ vae.config.scaling_factor
	+ vae.config.shift_factor
	).sample

	if (
	config.run.perceptual_loss_weight != 0
	and config.run.pixel_space_supervise
	):
	pred_pixel_values_perceptual = pred_pixel_values[
	:, :, pred_pixel_values.shape[2] // 2 :, :
	]
	gt_pixel_values_perceptual = gt_pixel_values[
	:, :, gt_pixel_values.shape[2] // 2 :, :
	]
	lpips_loss = lpips_loss_func(
	pred_pixel_values_perceptual.float(),
	gt_pixel_values_perceptual.float(),
	).mean()
	else:
	lpips_loss = 0

	if config.run.trepa_loss_weight != 0 and config.run.pixel_space_supervise:
	trepa_pred_pixel_values = rearrange(
	pred_pixel_values,
	"(b f) c h w -> b c f h w",
	f=config.data.num_frames,
	)
	trepa_gt_pixel_values = rearrange(
	gt_pixel_values,
	"(b f) c h w -> b c f h w",
	f=config.data.num_frames,
	)
	trepa_loss = trepa_loss_func(
	trepa_pred_pixel_values, trepa_gt_pixel_values
	)
	else:
	trepa_loss = 0

	if config.model.add_audio_layer and config.run.use_syncnet:
	if config.run.pixel_space_supervise:
	if config.data.resolution != syncnet_config.data.resolution:
	pred_pixel_values = F.interpolate(
	pred_pixel_values,
	size=(
	syncnet_config.data.resolution,
	syncnet_config.data.resolution,
	),
	mode="bicubic",
	)
	syncnet_input = rearrange(
	pred_pixel_values,
	"(b f) c h w -> b (f c) h w",
	f=config.data.num_frames,
	)
	else:
	syncnet_input = rearrange(pred_latents, "b c f h w -> b (f c) h w")

	if syncnet_config.data.lower_half:
	height = syncnet_input.shape[2]
	syncnet_input = syncnet_input[:, :, height // 2 :, :]
	ones_tensor = (
	torch.ones((config.data.batch_size, 1)).float().to(device=device)
	)
	vision_embeds, audio_embeds = syncnet(syncnet_input, mel)
	sync_loss = cosine_loss(
	vision_embeds.float(), audio_embeds.float(), ones_tensor
	).mean()
	else:
	sync_loss = 0

	loss = (
	recon_loss * config.run.recon_loss_weight
	+ sync_loss * config.run.sync_loss_weight
	+ lpips_loss * config.run.perceptual_loss_weight
	+ trepa_loss * config.run.trepa_loss_weight
	)

	train_step_list.append(global_step)

	optimizer.zero_grad()

	# Backpropagate
	if config.run.mixed_precision_training:
	scaler.scale(loss).backward()
	""" >>> gradient clipping >>> """
	scaler.unscale_(optimizer)
	torch.nn.utils.clip_grad_norm_(
	trainable_params, config.optimizer.max_grad_norm
	)
	""" <<< gradient clipping <<< """
	scaler.step(optimizer)
	scaler.update()
	else:
	loss.backward()
	""" >>> gradient clipping >>> """
	torch.nn.utils.clip_grad_norm_(
	trainable_params, config.optimizer.max_grad_norm
	)
	""" <<< gradient clipping <<< """
	optimizer.step()

	# Check the grad of attn blocks for debugging
	# print(unet.module.up_blocks[3].attentions[2].transformer_blocks[0].attn2.to_q.weight.grad)

	lr_scheduler.step()
	progress_bar.update(1)
	global_step += 1

	### <<<< Training <<<< ###

	# Save checkpoint and conduct validation
	if is_main_process and (global_step % config.ckpt.save_ckpt_steps == 0):
	model_save_path = os.path.join(
	output_dir, f"checkpoints/checkpoint-{global_step}.pt"
	)
	state_dict = {
	"global_step": global_step,
	"state_dict": unet.module.state_dict(),
	}
	try:
	torch.save(state_dict, model_save_path)
	logger.info(f"Saved checkpoint to {model_save_path}")
	except Exception as e:
	logger.error(f"Error saving model: {e}")

	# Validation
	logger.info("Running validation... ")

	validation_video_out_path = os.path.join(
	output_dir, f"val_videos/val_video_{global_step}.mp4"
	)

	with torch.autocast(device_type="cuda", dtype=torch.float16):
	pipeline(
	config.data.val_video_path,
	config.data.val_audio_path,
	validation_video_out_path,
	num_frames=config.data.num_frames,
	num_inference_steps=config.run.inference_steps,
	guidance_scale=config.run.guidance_scale,
	weight_dtype=torch.float16,
	width=config.data.resolution,
	height=config.data.resolution,
	mask_image_path=config.data.mask_image_path,
	)

	logger.info(
	f"Saved validation video output to {validation_video_out_path}"
	)

	val_step_list.append(global_step)

	if config.model.add_audio_layer and os.path.exists(
	validation_video_out_path
	):
	try:
	_, conf = syncnet_eval(
	syncnet_eval_model,
	syncnet_detector,
	validation_video_out_path,
	"temp",
	)
	except Exception as e:
	logger.info(e)
	conf = 0
	sync_conf_list.append(conf)
	plot_loss_chart(
	os.path.join(
	output_dir,
	f"sync_conf_results/sync_conf_chart-{global_step}.png",
	),
	("Sync confidence", val_step_list, sync_conf_list),
	)

	logs = {"step_loss": loss.item(), "epoch": epoch}
	progress_bar.set_postfix(**logs)

	if global_step >= config.run.max_train_steps:
	break

	progress_bar.close()
	dist.destroy_process_group()


	if __name__ == "__main__":
	parser = argparse.ArgumentParser()

	# Config file path
	parser.add_argument("--unet_config_path", type=str, default="configs/unet.yaml")

	args = parser.parse_args()
	config = OmegaConf.load(args.unet_config_path)
	config.unet_config_path = args.unet_config_path

	main(config)