import os
from typing import Optional

import numpy as np
import torch
from torch.utils.data import DataLoader
from omegaconf import OmegaConf
from safetensors.torch import load_model
from diffusers.models import AutoencoderKL

from pipeline.whisper import WhisperAudioProcessor
from pipeline.dwpose import DWposeDetector
from pipeline.utils import A2MEvalDataset
from pipeline.a2m_pipeline import A2M_Pipeline
from model.model_A2M import A2MModel_CrossAtten_Audio_PosePre, A2MModel_CrossAtten_Audio
from model.model_AMD import AMDModel
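# Config keys this script reads from the inference YAML (all referenced below):
# vae_path, a2m_config_path, a2m_ckpt_path, amd_config_path, amd_ckpt_path,
# enable_pose, whisper_model_path, audio_separator_model_file, cache_dir,
# amd_sample_steps, a2m_sample_steps, output_dir.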
class a2m_inferencer:
    """Loads the frozen VAE, A2M, AMD, DWpose, and Whisper models and wires
    them into an A2M_Pipeline for single-clip inference and batch evaluation."""

    def __init__(self, config, device, dtype):
        self.config = config
        self.device = device
        self.dtype = dtype
        self.setup()

    def setup(self):
        # Frozen VAE used by the pipeline to decode latents.
        vae_model = (
            AutoencoderKL.from_pretrained(self.config.vae_path, subfolder="vae")
            .to(self.device, self.dtype)
            .requires_grad_(False)
        )

        # Audio-to-motion model; the pose-conditioned variant is selected when
        # pose guidance is enabled.
        a2m_config = OmegaConf.load(self.config.a2m_config_path)
        if self.config.enable_pose:
            a2m_model = A2MModel_CrossAtten_Audio_PosePre(**a2m_config["model"])
        else:
            a2m_model = A2MModel_CrossAtten_Audio(**a2m_config["model"])
        a2m_model = a2m_model.to(self.device, self.dtype).requires_grad_(False)
        load_model(a2m_model, self.config.a2m_ckpt_path)

        # Motion-to-video (AMD) model.
        amd_model = (
            AMDModel.from_config(AMDModel.load_config(self.config.amd_config_path))
            .to(self.device, self.dtype)
            .requires_grad_(False)
        )
        load_model(amd_model, self.config.amd_ckpt_path)

        # Pose detector and audio feature extractor; 16 kHz / 30 s are
        # Whisper's standard sampling rate and segment length.
        dwpose_model = DWposeDetector().to(self.device)
        whisper_model = WhisperAudioProcessor(
            16000,
            30,
            self.config.whisper_model_path,
            os.path.dirname(self.config.audio_separator_model_file),
            os.path.basename(self.config.audio_separator_model_file),
            cache_dir=self.config.cache_dir,
            device=self.device,
        )

        # Assemble the full audio-to-video pipeline.
        self.pipeline = A2M_Pipeline(
            amd_model,
            a2m_model,
            vae_model,
            dwpose_model,
            whisper_model,
            amd_sample_steps=self.config.amd_sample_steps,
            a2m_sample_steps=self.config.a2m_sample_steps,
            output_dir=self.config.output_dir,
            enable_pose=self.config.enable_pose,
        )

    def infer(self, audio_path: str, refimg_path: str):
        # Generate a video for a single audio clip / reference image pair.
        video = self.pipeline.run(audio_path, refimg_path)
        return video

    def eval(self, audio_emb_dir: str, dwpose_dir: str, ref_img_dir: str,
             num_frames: int = 96, audio_dir: Optional[str] = None):
        # Batch evaluation over precomputed Whisper embeddings, DWpose frames,
        # and reference images.
        evalset = A2MEvalDataset(
            audio_emb_dir,
            dwpose_dir,
            ref_img_dir,
            num_frames,
            random_audio=False,
            random_dwpose=False,
            audio_dir=audio_dir,
        )
        evalloader = DataLoader(
            evalset,
            batch_size=12,
            shuffle=False,
            drop_last=True,
            collate_fn=evalset.collate,
            num_workers=16,
        )
        self.pipeline.eval(evalloader)


if __name__ == "__main__":
    config_path = "/mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/config/inference/a2m.yaml"
    audio_path = "/mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/demo/audio21.wav"
    refimg_path = "/mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/demo/face36.jpg"
    audio_emb_dir = "/mnt/pfs-gv8sxa/tts/dhg/zqy/data/FaceVid_240h/whisper_embs"
    dwpose_dir = "/mnt/pfs-gv8sxa/tts/dhg/zqy/data/FaceVid_240h/firstframes/fromdwpose"
    ref_img_dir = "/mnt/pfs-gv8sxa/tts/dhg/zqy/data/FaceVid_240h/firstframes/fromvideo"
    audio_dir = "/mnt/pfs-gv8sxa/tts/dhg/zqy/data/FaceVid_240h/audios"

    config = OmegaConf.load(config_path)
    inferencer = a2m_inferencer(config, torch.device("cuda:0"), torch.float16)

    inferencer.eval(
        audio_emb_dir,
        dwpose_dir,
        ref_img_dir,
        96,
        audio_dir,
    )
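
    # Single-clip inference (minimal sketch): `infer` runs the pipeline on one
    # audio/reference-image pair and returns the generated video. The demo paths
    # above (audio_path, refimg_path) are otherwise unused; uncomment to try them.
    # video = inferencer.infer(audio_path, refimg_path)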