# Repository: marcosremar2/MuseTalk
# File: configs/training/syncnet.yaml (834 bytes)
# Last commit: f0d6854 (unverified, about 1 year ago) by Zhizhou Zhong
#   "feat: data preprocessing and training (#294)"

	# This file is modified from LatentSync (https://github.com/bytedance/LatentSync/blob/main/latentsync/configs/training/syncnet_16_pixel.yaml).
	model:
	audio_encoder: # input (1, 80, 52)
	in_channels: 1
	block_out_channels: [32, 64, 128, 256, 512, 1024, 2048]
	downsample_factors: [[2, 1], 2, 2, 1, 2, 2, [2, 3]]
	attn_blocks: [0, 0, 0, 0, 0, 0, 0]
	dropout: 0.0
	visual_encoder: # input (48, 128, 256)
	in_channels: 48
	block_out_channels: [64, 128, 256, 256, 512, 1024, 2048, 2048]
	downsample_factors: [[1, 2], 2, 2, 2, 2, 2, 2, 2]
	attn_blocks: [0, 0, 0, 0, 0, 0, 0, 0]
	dropout: 0.0

	ckpt:
	resume_ckpt_path: ""
	inference_ckpt_path: ./models/syncnet/latentsync_syncnet.pt # this pretrained model is from LatentSync (https://huggingface.co/ByteDance/LatentSync/tree/main)
	save_ckpt_steps: 2500