Spaces:
Paused
Paused
| #!/usr/bin/env python | |
| # Copyright (c) Xuangeng Chu (xg.chu@outlook.com) | |
| import os | |
| import math | |
| import torch | |
| import torch.nn as nn | |
| from .data_stats import ALLTALKEMICA_MEAN, ALLTALKEMICA_STD | |
class StyleEncoder(nn.Module):
    """Encode a motion-coefficient sequence into a single pooled feature vector.

    Pipeline: normalise the input with the registered mean/std buffers,
    project every frame from 106 to 128 channels, add positional encoding,
    run a 4-layer Transformer encoder and average over the time axis.
    """

    def __init__(self):
        super().__init__()
        # Training configuration these hyper-parameters were taken from:
        # Namespace(mode='train', iter=100000, rot_repr='aa', no_head_pose=False,
        #   feature_dim=128, n_heads=4, n_layers=4, mlp_ratio=4, n_motions=100,
        #   fps=25, data_root=PosixPath('datasets/HDTF_THHQ/lmdb'),
        #   stats_file=PosixPath('stats_train.npz'), batch_size=32, num_workers=4,
        #   exp_name='head-L4H4_T0.1_BS32', max_iter=100000, lr=0.0001,
        #   temperature=0.1, save_iter=2000, val_iter=2000, log_iter=50,
        #   log_smooth_win=50)
        feature_dim = 128
        motion_dim = 106

        # NOTE: sub-modules are created in a fixed order (layer, PE, projection,
        # stack) so that seeded weight initialisation stays reproducible.
        layer_template = nn.TransformerEncoderLayer(
            d_model=feature_dim,
            nhead=4,
            dim_feedforward=512,
            activation='gelu',
            batch_first=True,
        )
        self.PE = PositionalEncoding(feature_dim)

        frame_projection = nn.Linear(motion_dim, feature_dim)
        transformer_stack = nn.TransformerEncoder(layer_template, num_layers=4)
        self.encoder = nn.ModuleDict({
            'motion_proj': frame_projection,
            'transformer': transformer_stack,
        })

        # Dataset statistics are stored as buffers so they follow the module
        # across devices and are saved with its state dict.
        for buffer_name, stats in (
            ("motion_mean", ALLTALKEMICA_MEAN),
            ("motion_std", ALLTALKEMICA_STD),
        ):
            self.register_buffer(buffer_name, torch.tensor(stats).float())

    def forward(self, motion_coef):
        """
        :param motion_coef: (batch_size, seq_len, motion_coef_dim); the last
            dimension must be 106 to match the input projection
        :return: (batch_size, feature_dim) time-averaged features
        """
        # The unpacking doubles as a rank check: a non 3-D input fails here.
        _n_batch, _n_frames, _n_coef = motion_coef.shape

        # Normalise, then lift every frame into the Transformer width.
        tokens = self.encoder['motion_proj'](self.norm_with_stats(motion_coef))
        encoded = self.encoder['transformer'](self.PE(tokens))  # (N, L, feat_dim)

        # Mean pooling over the sequence axis -> (N, feat_dim)
        return encoded.mean(dim=1)

    def norm_with_stats(self, motion_coef):
        """Standardise ``motion_coef`` with the registered mean/std buffers.

        The input tensor is not modified; a new tensor is returned.
        """
        centered = motion_coef.clone() - self.motion_mean
        return centered / self.motion_std
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch-first sequence.

    A ``(1, max_len, d_model)`` table is precomputed and registered as the
    buffer ``pe``; even channels hold sines and odd channels hold cosines.

    NOTE(review): ``forward`` previously indexed the table with
    ``pe[:, seq_len, :]``, i.e. it added the single row at position
    ``seq_len`` to every timestep (and raised ``IndexError`` when
    ``seq_len == max_len``). It now adds the first ``seq_len`` rows, one per
    timestep. Checkpoints trained with the old behaviour presumably expect
    that constant offset -- re-validate them after applying this fix.

    :param d_model: channel width of the sequences to encode
    :param dropout: dropout probability applied after the addition
    :param max_len: number of positions available in the table
    """

    def __init__(self, d_model, dropout=0.1, max_len=600):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        # vanilla sinusoidal encoding
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one cosine channel fewer than sine
        # channels, so trim the frequencies (a no-op when d_model is even).
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, d_model), broadcasts over the batch
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        :param x: (batch_size, seq_len, d_model)
        :return: ``x`` plus the encodings of positions ``0 .. seq_len - 1``,
            passed through dropout; same shape as ``x``
        :raises IndexError: if ``seq_len`` exceeds the table length
        """
        seq_len = x.shape[1]
        table_len = self.pe.shape[1]
        if seq_len > table_len:
            raise IndexError(
                f"sequence length {seq_len} exceeds positional table length {table_len}"
            )
        # Slice (not index) so that every timestep receives its own encoding.
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)