import os
from typing import Optional

import torch
from diffusers.models import AutoencoderKL
from omegaconf import OmegaConf
from safetensors.torch import load_model
from torch.utils.data import DataLoader

from model.model_A2M import A2MModel_CrossAtten_Audio, A2MModel_CrossAtten_Audio_PosePre
from model.model_AMD import AMDModel
from pipeline.a2m_pipeline import A2M_Pipeline
from pipeline.dwpose import DWposeDetector
from pipeline.utils import A2MEvalDataset
from pipeline.whisper import WhisperAudioProcessor


class a2m_inferencer:
    """Audio-to-motion inference wrapper that assembles the A2M pipeline from a config."""

    def __init__(self, config, device: torch.device, dtype: torch.dtype):
        self.config = config
        self.device = device
        self.dtype = dtype
        self.setup()

    def setup(self):
        # VAE for encoding/decoding frames; all weights are frozen for inference.
        vae_model = (
            AutoencoderKL.from_pretrained(self.config.vae_path, subfolder="vae")
            .to(self.device, self.dtype)
            .requires_grad_(False)
        )

        # A2M motion model: use the pose-conditioned variant when enable_pose is set.
        a2m_config = OmegaConf.load(self.config.a2m_config_path)
        if self.config.enable_pose:
            a2m_model = (
                A2MModel_CrossAtten_Audio_PosePre(**a2m_config["model"])
                .to(self.device, self.dtype)
                .requires_grad_(False)
            )
        else:
            a2m_model = (
                A2MModel_CrossAtten_Audio(**a2m_config["model"])
                .to(self.device, self.dtype)
                .requires_grad_(False)
            )
        load_model(a2m_model, self.config.a2m_ckpt_path)

        # AMD model, restored from its config and safetensors checkpoint.
        amd_model = (
            AMDModel.from_config(AMDModel.load_config(self.config.amd_config_path))
            .to(self.device, self.dtype)
            .requires_grad_(False)
        )
        load_model(amd_model, self.config.amd_ckpt_path)

        # DWpose keypoint detector and Whisper-based audio feature extractor.
        dwpose_model = DWposeDetector().to(self.device)
        whisper_model = WhisperAudioProcessor(
            16000,  # sample rate (Hz)
            30,     # chunk length (seconds)
            self.config.whisper_model_path,
            os.path.dirname(self.config.audio_separator_model_file),
            os.path.basename(self.config.audio_separator_model_file),
            cache_dir=self.config.cache_dir,
            device=self.device,
        )

        self.pipeline = A2M_Pipeline(
            amd_model,
            a2m_model,
            vae_model,
            dwpose_model,
            whisper_model,
            amd_sample_steps=self.config.amd_sample_steps,
            a2m_sample_steps=self.config.a2m_sample_steps,
            output_dir=self.config.output_dir,
            enable_pose=self.config.enable_pose,
        )

    def infer(self, audio_path: str, refimg_path: str):
        """Generate a video from a single audio clip and a reference image."""
        video = self.pipeline.run(audio_path, refimg_path)
        return video

    def eval(
        self,
        audio_emb_dir: str,
        dwpose_dir: str,
        ref_img_dir: str,
        num_frames: int = 96,
        audio_dir: Optional[str] = None,
    ):
        """Run batched evaluation over precomputed Whisper embeddings, DWpose frames, and reference images."""
        evalset = A2MEvalDataset(
            audio_emb_dir,
            dwpose_dir,
            ref_img_dir,
            num_frames,
            random_audio=False,
            random_dwpose=False,
            audio_dir=audio_dir,
            # num_evals=4,
            # audio_suffix="wav"
        )
        evalloader = DataLoader(
            evalset,
            batch_size=12,
            shuffle=False,
            drop_last=True,
            collate_fn=evalset.collate,
            num_workers=16,
        )
        self.pipeline.eval(evalloader)


if __name__ == "__main__":
    # TODO add argparse here
    # config_path = "/mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/config/inference/a2m_wpose.yaml"
    config_path = "/mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/config/inference/a2m.yaml"
    audio_path = "/mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/demo/audio21.wav"
    refimg_path = "/mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/demo/face36.jpg"
    audio_emb_dir = "/mnt/pfs-gv8sxa/tts/dhg/zqy/data/FaceVid_240h/whisper_embs"
    dwpose_dir = "/mnt/pfs-gv8sxa/tts/dhg/zqy/data/FaceVid_240h/firstframes/fromdwpose"
    ref_img_dir = "/mnt/pfs-gv8sxa/tts/dhg/zqy/data/FaceVid_240h/firstframes/fromvideo"
    audio_dir = "/mnt/pfs-gv8sxa/tts/dhg/zqy/data/FaceVid_240h/audios"

    config = OmegaConf.load(config_path)
    inferencer = a2m_inferencer(config, torch.device("cuda:0"), torch.float16)
    # inferencer.infer(audio_path, refimg_path)
    inferencer.eval(audio_emb_dir, dwpose_dir, ref_img_dir, 96, audio_dir)