# semo/scripts/p2v_demo.py
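"""Demo script for the P2M pipeline: pose-driven video generation.

Loads a frozen VAE, the pose-conditioned P2M model, the AMD model, and a
DWPose detector, then animates a single reference image using the pose
sequence extracted from a driving video. An optional batch-evaluation path
runs the pipeline over a directory of reference images and precomputed
DWPose dictionaries.
"""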
import argparse
from typing import Optional

import torch
from diffusers.models import AutoencoderKL
from omegaconf import OmegaConf
from safetensors.torch import load_model
from torch.utils.data import DataLoader

from model.model_A2M import A2MModel_CrossAtten_Pose
from model.model_AMD import AMDModel
from pipeline.dwpose import DWposeDetector
from pipeline.p2m_pipeline import P2M_Pipeline
from pipeline.utils import P2MEvalDataset
class P2MInferencer:
    """Loads the frozen models and wraps the P2M pipeline for inference and evaluation."""

    def __init__(self, config, device: torch.device, dtype: torch.dtype):
        self.config = config
        self.device = device
        self.dtype = dtype
        self.setup()
    def setup(self):
        # Frozen VAE for encoding images to latents and decoding results.
        vae_model = AutoencoderKL.from_pretrained(self.config.vae_path, subfolder="vae").to(self.device, self.dtype).requires_grad_(False)

        # Pose-conditioned P2M model, restored from a safetensors checkpoint.
        p2m_config = OmegaConf.load(self.config.p2m_config_path)
        p2m_model = A2MModel_CrossAtten_Pose(**p2m_config["model"]).to(self.device, self.dtype).requires_grad_(False)
        load_model(p2m_model, self.config.p2m_ckpt_path)

        # AMD model, built from its config and restored from a safetensors checkpoint.
        amd_model = AMDModel.from_config(AMDModel.load_config(self.config.amd_config_path)).to(self.device, self.dtype).requires_grad_(False)
        load_model(amd_model, self.config.amd_ckpt_path)

        # DWPose detector used to extract poses from the driving video.
        dwpose_model = DWposeDetector().to(self.device)

        self.pipeline = P2M_Pipeline(
            amd_model,
            p2m_model,
            vae_model,
            dwpose_model,
            amd_sample_steps=self.config.amd_sample_steps,
            p2m_sample_steps=self.config.p2m_sample_steps,
            output_dir=self.config.output_dir,
        )
    def infer(self, refimg_path: str, driven_video_path: str, audio_path: Optional[str] = None):
        """Animate the reference image with poses extracted from the driving video."""
        video = self.pipeline.run(refimg_path, driven_video_path, audio_path)
        return video
    def eval(self, ref_img_dir: str, dwpose_dict_dir: str, num_frames: int = 96):
        """Run batch evaluation over reference images and precomputed DWPose dicts."""
        evalset = P2MEvalDataset(
            ref_img_dir,
            dwpose_dict_dir,
            num_frames,
            random_dwpose=True,
        )
        evalloader = DataLoader(
            evalset,
            batch_size=12,
            shuffle=False,
            drop_last=True,
            collate_fn=evalset.collate,
            num_workers=16,
        )
        self.pipeline.eval(evalloader)
if __name__ == "__main__":
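    # Minimal argparse wiring (a sketch: these flag names are placeholders and
    # the defaults simply reproduce the demo's original hard-coded paths).
    parser = argparse.ArgumentParser(description="P2M pose-driven video demo")
    parser.add_argument("--config_path", default="/mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/config/inference/p2m.yaml")
    parser.add_argument("--refimg_path", default="/mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/demo/face36.jpg")
    parser.add_argument("--driven_video_path", default="/mnt/pfs-gv8sxa/tts/dhg/zqy/data/FaceVid_240h/videos/21.mp4")
    args = parser.parse_args()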
    # Alternative config kept from the original demo:
    # config_path = "/mnt/pfs-gv8sxa/tts/dhg/zqy/code/AMD2/config/inference/a2m_wpose.yaml"
    config_path = args.config_path
    refimg_path = args.refimg_path
    driven_video_path = args.driven_video_path

    # Inputs for the optional batch evaluation below (audio_dir is unused here).
    ref_img_dir = "/mnt/pfs-gv8sxa/tts/dhg/zqy/data/FaceVid_240h/firstframes/fromvideo"
    dwpose_dict_dir = "/mnt/pfs-gv8sxa/tts/dhg/zqy/data/FaceVid_240h/dwpose_facebody_dict"
    audio_dir = "/mnt/pfs-gv8sxa/tts/dhg/zqy/data/FaceVid_240h/audios"

    config = OmegaConf.load(config_path)
    inferencer = P2MInferencer(config, torch.device("cuda:0"), torch.float32)
    video = inferencer.infer(refimg_path, driven_video_path, None)
    # Optional batch evaluation:
    # inferencer.eval(
    #     ref_img_dir,
    #     dwpose_dict_dir,
    #     96,
    # )