| """ |
| SVCCondAdapter: replaces F5-TTS's text conditioning pathway with SVC features. |
| |
| F5-TTS text path: char_tokens (B, T) → embed + ConvNeXt → (B, T_mel, text_dim) |
| SVC replacement: PPG/HuBERT/F0 (B, T_feat, D) → project → (B, T_mel, text_dim) |
| |
| The output shape matches F5-TTS's text_dim so the DiT sees no change. |
| Default text_dim=512 for F5-TTS Base (model_dim=1024). |
| """ |
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
|
|
|
|
class SVCCondAdapter(nn.Module):
    """Adapter that swaps F5-TTS's text conditioning for SVC features.

    Resamples frame-level PPG, HuBERT, and F0 features to the mel frame
    rate, concatenates them, and projects to ``out_dim`` (F5-TTS's
    ``text_dim``), then adds a time-broadcast speaker embedding.  Output
    shape is ``(B, target_len, out_dim)`` so the downstream DiT sees the
    same conditioning shape as the original text pathway.
    """

    def __init__(
        self,
        ppg_dim: int = 1280,
        hubert_dim: int = 256,
        f0_dim: int = 1,
        spk_dim: int = 256,
        out_dim: int = 512,
        feat_sr: float = 50.0,
        mel_sr: float = 93.75,
    ):
        """Build the projection layers.

        Args:
            ppg_dim: channel size of the PPG features.
            hubert_dim: channel size of the HuBERT features.
            f0_dim: channel size of the F0 track (usually 1).
            spk_dim: speaker-embedding size.
            out_dim: output size; must equal the F5-TTS ``text_dim``.
            feat_sr: feature frame rate in Hz (informational; see note).
            mel_sr: mel frame rate in Hz (informational; see note).
        """
        super().__init__()
        # NOTE: feat_sr / mel_sr are stored for callers that want to derive
        # target_len from durations; forward() takes target_len explicitly
        # and does not read them.
        self.feat_sr = feat_sr
        self.mel_sr = mel_sr

        feat_in = ppg_dim + hubert_dim + f0_dim
        self.content_proj = nn.Sequential(
            nn.Linear(feat_in, out_dim * 2),
            nn.SiLU(),
            nn.Linear(out_dim * 2, out_dim),
        )
        self.spk_proj = nn.Linear(spk_dim, out_dim)

        # Near-zero init so the adapter starts as a tiny perturbation,
        # preserving the pretrained DiT's behavior early in fine-tuning.
        nn.init.normal_(self.content_proj[-1].weight, std=0.01)
        nn.init.zeros_(self.content_proj[-1].bias)
        nn.init.normal_(self.spk_proj.weight, std=0.01)
        nn.init.zeros_(self.spk_proj.bias)

    @staticmethod
    def _resample(feat: torch.Tensor, target_len: int) -> torch.Tensor:
        """Linearly resample ``(B, T, D)`` features to ``(B, target_len, D)``."""
        if feat.size(1) == target_len:
            # Identity under linear/align_corners=False at equal size;
            # skip the two transposes and the kernel launch.
            return feat
        feat = feat.transpose(1, 2)  # (B, D, T) layout for F.interpolate
        feat = F.interpolate(feat, size=target_len, mode="linear", align_corners=False)
        return feat.transpose(1, 2)

    def forward(
        self,
        ppg: torch.Tensor,
        hubert: torch.Tensor,
        f0: torch.Tensor,
        spk: torch.Tensor,
        target_len: int,
    ) -> torch.Tensor:
        """Fuse SVC features into an F5-TTS-shaped conditioning tensor.

        Args:
            ppg: ``(B, T_ppg, ppg_dim)`` phonetic posteriorgrams.
            hubert: ``(B, T_hub, hubert_dim)`` HuBERT features.
            f0: ``(B, T_f0, f0_dim)`` or ``(B, T_f0)`` F0 track.
            spk: ``(B, spk_dim)`` speaker embedding.
            target_len: number of mel frames to align to.

        Returns:
            ``(B, target_len, out_dim)`` conditioning tensor.
        """
        # Accept a (B, T) F0 track as well as (B, T, 1).
        if f0.dim() == 2:
            f0 = f0.unsqueeze(-1)

        # Resample each stream independently so the inputs need not share a
        # common frame count.  Linear interpolation is per-channel, so for
        # equal-length inputs this is exactly equivalent to the original
        # concat-then-interpolate ordering.
        streams = [self._resample(x, target_len) for x in (ppg, hubert, f0)]
        feat = torch.cat(streams, dim=-1)

        out = self.content_proj(feat)
        # Broadcast the speaker embedding across the time axis.
        out = out + self.spk_proj(spk).unsqueeze(1)
        return out
|
|