Upload 15 files
- configs/_base_/archs/diff_svc.py +52 -0
- configs/_base_/archs/diff_svc_v2.py +65 -0
- configs/_base_/datasets/audio_folder.py +27 -0
- configs/_base_/schedulers/step.py +13 -0
- configs/_base_/schedulers/warmup_cosine.py +24 -0
- configs/_base_/schedulers/warmup_cosine_finetune.py +24 -0
- configs/_base_/trainers/base.py +34 -0
- configs/svc_cn_hubert_soft.py +13 -0
- configs/svc_cn_hubert_soft_finetune.py +77 -0
- configs/svc_cn_hubert_soft_finetune_crepe.py +77 -0
- configs/svc_hubert_soft.py +21 -0
- configs/svc_hubert_soft_diff_svc.py +58 -0
- configs/svc_hubert_soft_multi_speakers.py +37 -0
- configs/svs_baseline.py +113 -0
- configs/train_my_config.py +4 -0
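
Note: these configs compose through mmengine/mmcv-style `_base_` inheritance: a leaf config lists its parents in `_base_`, overrides individual keys, and marks a dict with `_delete_=True` to replace the inherited value wholesale instead of merging into it. A minimal sketch of loading such a config, assuming an mmengine-style Config loader (fish-diffusion's actual training entrypoint may wrap this differently):

    from mmengine.config import Config

    # Keys merge across the _base_ chain: model comes from _base_/archs/,
    # trainer from _base_/trainers/, the optimizer from _base_/schedulers/.
    cfg = Config.fromfile("configs/svc_hubert_soft.py")
    print(cfg.model["type"])         # "DiffSVC"
    print(cfg.trainer["max_steps"])  # 300000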
configs/_base_/archs/diff_svc.py
ADDED
@@ -0,0 +1,52 @@
+from fish_diffusion.utils.pitch import pitch_to_scale
+
+sampling_rate = 44100
+mel_channels = 128
+hidden_size = 256
+
+model = dict(
+    type="DiffSVC",
+    diffusion=dict(
+        type="GaussianDiffusion",
+        mel_channels=mel_channels,
+        noise_schedule="linear",
+        timesteps=1000,
+        max_beta=0.01,
+        s=0.008,
+        noise_loss="smoothed-l1",
+        denoiser=dict(
+            type="WaveNetDenoiser",
+            mel_channels=mel_channels,
+            d_encoder=hidden_size,
+            residual_channels=512,
+            residual_layers=20,
+        ),
+        spec_stats_path="dataset/stats.json",
+        sampler_interval=10,
+    ),
+    text_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=256,
+        output_size=hidden_size,
+    ),
+    speaker_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=10,
+        output_size=hidden_size,
+        use_embedding=True,
+    ),
+    pitch_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=1,
+        output_size=hidden_size,
+        use_embedding=False,
+        preprocessing=pitch_to_scale,
+    ),
+    vocoder=dict(
+        type="NsfHifiGAN",
+        checkpoint_path="checkpoints/nsf_hifigan/model",
+        sampling_rate=sampling_rate,
+        mel_channels=mel_channels,
+        use_natural_log=True,
+    ),
+)
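
Note: the pitch encoder above takes a single continuous value (input_size=1, use_embedding=False), so pitch_to_scale presumably squashes f0 in Hz down to a bounded scalar before projection. A rough sketch of that idea; the actual implementation lives in fish_diffusion.utils.pitch, and the formula and f0_min/f0_max bounds below are assumptions:

    import torch

    def pitch_to_scale(f0: torch.Tensor, f0_min: float = 40.0, f0_max: float = 1100.0) -> torch.Tensor:
        # Map f0 in [f0_min, f0_max] linearly onto [0, 1];
        # unvoiced frames (f0 == 0) are kept at 0.
        scaled = (f0 - f0_min) / (f0_max - f0_min)
        scaled[f0 <= 0] = 0.0
        return scaled.clamp(0.0, 1.0)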
configs/_base_/archs/diff_svc_v2.py
ADDED
@@ -0,0 +1,65 @@
+"""
+DiffSVC architecture with WaveNet denoiser and NSF-HiFiGAN vocoder.
+
+Compared to v1, this version:
+- No longer needs spec stats.
+- Adds a dilation cycle to the WaveNet denoiser.
+- Uses the log10 mel spectrogram.
+- Matches the DiffSinger architecture more closely.
+"""
+
+from fish_diffusion.utils.pitch import pitch_to_scale
+
+sampling_rate = 44100
+mel_channels = 128
+hidden_size = 256
+
+model = dict(
+    type="DiffSVC",
+    diffusion=dict(
+        type="GaussianDiffusion",
+        mel_channels=mel_channels,
+        noise_schedule="linear",
+        timesteps=1000,
+        max_beta=0.01,
+        s=0.008,
+        noise_loss="l1",
+        denoiser=dict(
+            type="WaveNetDenoiser",
+            mel_channels=mel_channels,
+            d_encoder=hidden_size,
+            residual_channels=512,
+            residual_layers=20,
+            dilation_cycle=4,
+            use_linear_bias=True,
+        ),
+        sampler_interval=10,
+        spec_min=[-5],
+        spec_max=[0],
+    ),
+    text_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=256,
+        output_size=hidden_size,
+    ),
+    speaker_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=10,
+        output_size=hidden_size,
+        use_embedding=True,
+    ),
+    pitch_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=1,
+        output_size=hidden_size,
+        use_embedding=False,
+        preprocessing=pitch_to_scale,
+    ),
+    vocoder=dict(
+        type="NsfHifiGAN",
+        checkpoint_path="checkpoints/nsf_hifigan/model",
+        sampling_rate=sampling_rate,
+        mel_channels=mel_channels,
+        use_natural_log=False,
+    ),
+)
configs/_base_/datasets/audio_folder.py
ADDED
@@ -0,0 +1,27 @@
+dataset = dict(
+    train=dict(
+        type="AudioFolderDataset",
+        path="dataset/train",
+        speaker_id=0,
+    ),
+    valid=dict(
+        type="AudioFolderDataset",
+        path="dataset/valid",
+        speaker_id=0,
+    ),
+)
+
+dataloader = dict(
+    train=dict(
+        batch_size=16,
+        shuffle=True,
+        num_workers=2,
+        persistent_workers=True,
+    ),
+    valid=dict(
+        batch_size=2,
+        shuffle=False,
+        num_workers=2,
+        persistent_workers=True,
+    ),
+)
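
Note: the dataloader dicts mirror torch.utils.data.DataLoader keyword arguments, so they are presumably splatted straight into the loader. A sketch under that assumption, with a placeholder dataset standing in for whatever the AudioFolderDataset registry entry builds:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    train_dataset = TensorDataset(torch.zeros(32, 1))  # placeholder dataset
    train_loader = DataLoader(
        train_dataset, batch_size=16, shuffle=True, num_workers=2, persistent_workers=True
    )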
configs/_base_/schedulers/step.py
ADDED
@@ -0,0 +1,13 @@
+optimizer = dict(
+    type="AdamW",
+    lr=8e-4,
+    weight_decay=1e-2,
+    betas=(0.9, 0.98),
+    eps=1e-9,
+)
+
+scheduler = dict(
+    type="StepLR",
+    step_size=50000,
+    gamma=0.5,
+)
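
Note: both dicts map one-to-one onto standard PyTorch objects, so the resulting schedule is easy to read off: the learning rate starts at 8e-4 and halves every 50k optimizer steps (8e-4 -> 4e-4 -> 2e-4 -> ...). The plain-PyTorch equivalent, with a placeholder module:

    import torch

    model = torch.nn.Linear(1, 1)  # placeholder module
    opt = torch.optim.AdamW(
        model.parameters(), lr=8e-4, weight_decay=1e-2, betas=(0.9, 0.98), eps=1e-9
    )
    sched = torch.optim.lr_scheduler.StepLR(opt, step_size=50000, gamma=0.5)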
configs/_base_/schedulers/warmup_cosine.py
ADDED
@@ -0,0 +1,24 @@
+from fish_diffusion.schedulers.warmup_cosine_scheduler import (
+    LambdaWarmUpCosineScheduler,
+)
+
+lambda_func = LambdaWarmUpCosineScheduler(
+    warm_up_steps=1000,
+    lr_min=1e-4,
+    lr_max=8e-4,
+    lr_start=1e-5,
+    max_decay_steps=150000,
+)
+
+optimizer = dict(
+    type="AdamW",
+    lr=1.0,
+    weight_decay=1e-2,
+    betas=(0.9, 0.98),
+    eps=1e-9,
+)
+
+scheduler = dict(
+    type="LambdaLR",
+    lr_lambda=lambda_func,
+)
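
Note: the optimizer's base lr is 1.0, so under LambdaLR the lambda's return value is the absolute learning rate rather than a multiplier. The shape of the schedule, re-derived as a sketch (the real LambdaWarmUpCosineScheduler lives in fish_diffusion.schedulers and may differ in details): linear warmup from lr_start to lr_max over warm_up_steps, then cosine decay to lr_min, flat after max_decay_steps.

    import math

    def warmup_cosine(step: int, warm_up_steps: int = 1000, lr_min: float = 1e-4,
                      lr_max: float = 8e-4, lr_start: float = 1e-5,
                      max_decay_steps: int = 150000) -> float:
        if step < warm_up_steps:
            # Linear warmup: lr_start -> lr_max
            return lr_start + (lr_max - lr_start) * step / warm_up_steps
        # Cosine decay: lr_max -> lr_min, clamped once max_decay_steps is reached
        t = min((step - warm_up_steps) / (max_decay_steps - warm_up_steps), 1.0)
        return lr_min + 0.5 * (lr_max - lr_min) * (1.0 + math.cos(math.pi * t))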
configs/_base_/schedulers/warmup_cosine_finetune.py
ADDED
@@ -0,0 +1,24 @@
+from fish_diffusion.schedulers.warmup_cosine_scheduler import (
+    LambdaWarmUpCosineScheduler,
+)
+
+lambda_func = LambdaWarmUpCosineScheduler(
+    warm_up_steps=1000,
+    lr_min=1e-4,
+    lr_max=4e-4,
+    lr_start=1e-5,
+    max_decay_steps=5000,
+)
+
+optimizer = dict(
+    type="AdamW",
+    lr=1.0,
+    weight_decay=1e-2,
+    betas=(0.9, 0.98),
+    eps=1e-9,
+)
+
+scheduler = dict(
+    type="LambdaLR",
+    lr_lambda=lambda_func,
+)
configs/_base_/trainers/base.py
ADDED
@@ -0,0 +1,34 @@
+import sys
+
+import torch
+from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
+from pytorch_lightning.strategies import DDPStrategy
+
+trainer = dict(
+    accelerator="gpu",
+    devices=-1,
+    gradient_clip_val=0.5,
+    log_every_n_steps=10,
+    val_check_interval=5000,
+    check_val_every_n_epoch=None,
+    max_steps=300000,
+    # Warning: If you are training the model with fs2 (and see nan), you should either use bf16 or fp32
+    precision=16,
+    callbacks=[
+        ModelCheckpoint(
+            filename="{epoch}-{step}-{valid_loss:.2f}",
+            every_n_train_steps=5000,
+            save_top_k=-1,
+        ),
+        LearningRateMonitor(logging_interval="step"),
+    ],
+)
+
+# Use DDP for multi-gpu training
+if torch.cuda.is_available() and torch.cuda.device_count() > 1:
+    # Use gloo for windows
+    process_group_backend = "nccl" if sys.platform != "win32" else "gloo"
+
+    trainer["strategy"] = DDPStrategy(
+        find_unused_parameters=True, process_group_backend=process_group_backend
+    )
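
Note: the trainer dict matches pytorch_lightning.Trainer keyword arguments, so it is presumably consumed roughly as below; the actual call site is in the repo's training script, which is not part of this commit.

    import pytorch_lightning as pl

    # `trainer` is the dict defined above (including the DDP strategy when set).
    pl_trainer = pl.Trainer(**trainer)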
configs/svc_cn_hubert_soft.py
ADDED
@@ -0,0 +1,13 @@
+from fish_diffusion.datasets.audio_folder import AudioFolderDataset
+
+_base_ = [
+    "./svc_hubert_soft.py",
+]
+
+preprocessing = dict(
+    text_features_extractor=dict(
+        _delete_=True,
+        type="ChineseHubertSoft",
+        pretrained=True,
+    ),
+)
configs/svc_cn_hubert_soft_finetune.py
ADDED
@@ -0,0 +1,77 @@
+from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
+
+from fish_diffusion.datasets.audio_folder import AudioFolderDataset
+
+_base_ = [
+    "./_base_/archs/diff_svc_v2.py",
+    "./_base_/trainers/base.py",
+    "./_base_/schedulers/warmup_cosine_finetune.py",
+    "./_base_/datasets/audio_folder.py",
+]
+
+speaker_mapping = {
+    "Placeholder": 0,
+}
+
+dataset = dict(
+    train=dict(
+        _delete_=True,  # Delete the default train dataset
+        type="ConcatDataset",
+        datasets=[
+            dict(
+                type="AudioFolderDataset",
+                path="dataset/train",
+                speaker_id=speaker_mapping["Placeholder"],
+            ),
+        ],
+        # Are there any other ways to do this?
+        collate_fn=AudioFolderDataset.collate_fn,
+    ),
+    valid=dict(
+        _delete_=True,  # Delete the default valid dataset
+        type="ConcatDataset",
+        datasets=[
+            dict(
+                type="AudioFolderDataset",
+                path="dataset/valid",
+                speaker_id=speaker_mapping["Placeholder"],
+            ),
+        ],
+        collate_fn=AudioFolderDataset.collate_fn,
+    ),
+)
+
+model = dict(
+    speaker_encoder=dict(
+        input_size=len(speaker_mapping),
+    ),
+    text_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=256,
+        output_size=256,
+    ),
+)
+
+preprocessing = dict(
+    text_features_extractor=dict(
+        type="ChineseHubertSoft",
+        pretrained=True,
+        gate_size=25,
+    ),
+    pitch_extractor=dict(
+        type="ParselMouthPitchExtractor",
+    ),
+)
+
+# Validate every 1000 steps; checkpoints are still saved every 5000 steps.
+trainer = dict(
+    val_check_interval=1000,
+    callbacks=[
+        ModelCheckpoint(
+            filename="{epoch}-{step}-{valid_loss:.2f}",
+            every_n_train_steps=5000,
+            save_top_k=-1,
+        ),
+        LearningRateMonitor(logging_interval="step"),
+    ],
+)
configs/svc_cn_hubert_soft_finetune_crepe.py
ADDED
@@ -0,0 +1,77 @@
+from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
+
+from fish_diffusion.datasets.audio_folder import AudioFolderDataset
+
+_base_ = [
+    "./_base_/archs/diff_svc_v2.py",
+    "./_base_/trainers/base.py",
+    "./_base_/schedulers/warmup_cosine_finetune.py",
+    "./_base_/datasets/audio_folder.py",
+]
+
+speaker_mapping = {
+    "Placeholder": 0,
+}
+
+dataset = dict(
+    train=dict(
+        _delete_=True,  # Delete the default train dataset
+        type="ConcatDataset",
+        datasets=[
+            dict(
+                type="AudioFolderDataset",
+                path="dataset/train",
+                speaker_id=speaker_mapping["Placeholder"],
+            ),
+        ],
+        # Are there any other ways to do this?
+        collate_fn=AudioFolderDataset.collate_fn,
+    ),
+    valid=dict(
+        _delete_=True,  # Delete the default valid dataset
+        type="ConcatDataset",
+        datasets=[
+            dict(
+                type="AudioFolderDataset",
+                path="dataset/valid",
+                speaker_id=speaker_mapping["Placeholder"],
+            ),
+        ],
+        collate_fn=AudioFolderDataset.collate_fn,
+    ),
+)
+
+model = dict(
+    speaker_encoder=dict(
+        input_size=len(speaker_mapping),
+    ),
+    text_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=256,
+        output_size=256,
+    ),
+)
+
+preprocessing = dict(
+    text_features_extractor=dict(
+        type="ChineseHubertSoft",
+        pretrained=True,
+        gate_size=25,
+    ),
+    pitch_extractor=dict(
+        type="CrepePitchExtractor",
+    ),
+)
+
+# Validate every 1000 steps; checkpoints are still saved every 5000 steps.
+trainer = dict(
+    val_check_interval=1000,
+    callbacks=[
+        ModelCheckpoint(
+            filename="{epoch}-{step}-{valid_loss:.2f}",
+            every_n_train_steps=5000,
+            save_top_k=-1,
+        ),
+        LearningRateMonitor(logging_interval="step"),
+    ],
+)
configs/svc_hubert_soft.py
ADDED
@@ -0,0 +1,21 @@
+# Warning: This config had a breaking change on Feb 12, 2023.
+# It updates the arch from diff_svc to diff_svc_v2 and switches to the cosine scheduler.
+
+_base_ = [
+    "./_base_/archs/diff_svc_v2.py",
+    "./_base_/trainers/base.py",
+    "./_base_/schedulers/warmup_cosine.py",
+    "./_base_/datasets/audio_folder.py",
+]
+
+
+preprocessing = dict(
+    text_features_extractor=dict(
+        type="HubertSoft",
+    ),
+    pitch_extractor=dict(
+        # ParselMouth is much faster than Crepe;
+        # however, Crepe may perform better in some cases.
+        type="ParselMouthPitchExtractor",
+    ),
+)
configs/svc_hubert_soft_diff_svc.py
ADDED
@@ -0,0 +1,58 @@
+from functools import partial
+
+import numpy as np
+
+from fish_diffusion.utils.pitch import pitch_to_coarse
+
+_base_ = [
+    "./_base_/archs/diff_svc_v2.py",
+    "./_base_/trainers/base.py",
+    "./_base_/schedulers/step.py",
+    "./_base_/datasets/audio_folder.py",
+]
+
+hidden_size = 256
+
+model = dict(
+    type="DiffSVC",
+    speaker_encoder=dict(
+        _delete_=True,
+        # This is currently not used; all params will be zeroed.
+        type="NaiveProjectionEncoder",
+        input_size=10,
+        output_size=hidden_size,
+        use_embedding=True,
+    ),
+    pitch_encoder=dict(
+        _delete_=True,
+        type="NaiveProjectionEncoder",
+        input_size=300,
+        output_size=hidden_size,
+        use_embedding=True,
+        # The pretrained model uses a 40.0 Hz minimum pitch, so pin f0_mel_min to it:
+        preprocessing=partial(
+            pitch_to_coarse, f0_mel_min=1127 * np.log(1 + 40.0 / 700)
+        ),
+    ),
+    text_encoder=dict(
+        _delete_=True,
+        type="IdentityEncoder",
+    ),
+    diffusion=dict(
+        denoiser=dict(
+            residual_channels=384,
+        ),
+    ),
+)
+
+preprocessing = dict(
+    # You need to choose either "parselmouth" or "crepe" for pitch_extractor
+    pitch_extractor=dict(
+        type="CrepePitchExtractor",
+        f0_min=40.0,
+        f0_max=1100.0,
+    ),
+    text_features_extractor=dict(
+        type="HubertSoft",
+    ),
+)
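
Note: unlike the continuous pitch scale used by the other configs, this one embeds a coarse pitch index (input_size=300, use_embedding=True). pitch_to_coarse presumably quantizes mel-scale pitch into integer bins, and the partial() pins f0_mel_min to the mel value of 40 Hz (1127 * ln(1 + 40/700) ≈ 62.6) so the bins line up with the pretrained diff-svc checkpoint. A sketch of that quantization; the bin count and f0_mel_max below are assumptions, and the real code is in fish_diffusion.utils.pitch:

    import numpy as np

    def pitch_to_coarse(f0, f0_mel_min, f0_mel_max=1127 * np.log(1 + 1100.0 / 700),
                        f0_bin=300):
        f0_mel = 1127 * np.log(1 + np.asarray(f0, dtype=np.float64) / 700)  # Hz -> mel
        voiced = f0_mel > 0
        # Place voiced mels linearly into [1, f0_bin - 1]; unvoiced frames stay at bin 1.
        f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * (f0_bin - 2) / (
            f0_mel_max - f0_mel_min
        ) + 1
        return np.rint(np.clip(f0_mel, 1, f0_bin - 1)).astype(np.int64)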
configs/svc_hubert_soft_multi_speakers.py
ADDED
@@ -0,0 +1,37 @@
+from fish_diffusion.datasets.audio_folder import AudioFolderDataset
+
+_base_ = [
+    "./svc_hubert_soft.py",
+]
+
+dataset = dict(
+    train=dict(
+        _delete_=True,  # Delete the default train dataset
+        type="ConcatDataset",
+        datasets=[
+            dict(
+                type="AudioFolderDataset",
+                path="dataset/speaker_0",
+                speaker_id=0,
+            ),
+            dict(
+                type="AudioFolderDataset",
+                path="dataset/speaker_1",
+                speaker_id=1,
+            ),
+        ],
+        # Are there any other ways to do this?
+        collate_fn=AudioFolderDataset.collate_fn,
+    ),
+    valid=dict(
+        type="AudioFolderDataset",
+        path="dataset/valid",
+        speaker_id=0,
+    ),
+)
+
+model = dict(
+    speaker_encoder=dict(
+        input_size=2,  # 2 speakers
+    ),
+)
configs/svs_baseline.py
ADDED
@@ -0,0 +1,113 @@
+# Warning: This config is under development and subject to change.
+
+_base_ = [
+    "./_base_/archs/diff_svc_v2.py",
+    "./_base_/trainers/base.py",
+    "./_base_/schedulers/warmup_cosine.py",
+    "./_base_/datasets/audio_folder.py",
+]
+
+phonemes = [
+    "AP",
+    "SP",
+    "E",
+    "En",
+    "a",
+    "ai",
+    "an",
+    "ang",
+    "ao",
+    "b",
+    "c",
+    "ch",
+    "d",
+    "e",
+    "ei",
+    "en",
+    "eng",
+    "er",
+    "f",
+    "g",
+    "h",
+    "i",
+    "i0",
+    "ia",
+    "ian",
+    "iang",
+    "iao",
+    "ie",
+    "in",
+    "ing",
+    "iong",
+    "ir",
+    "iu",
+    "j",
+    "k",
+    "l",
+    "m",
+    "n",
+    "o",
+    "ong",
+    "ou",
+    "p",
+    "q",
+    "r",
+    "s",
+    "sh",
+    "t",
+    "u",
+    "ua",
+    "uai",
+    "uan",
+    "uang",
+    "ui",
+    "un",
+    "uo",
+    "v",
+    "van",
+    "ve",
+    "vn",
+    "w",
+    "x",
+    "y",
+    "z",
+    "zh",
+]
+
+preprocessing = dict(
+    text_features_extractor=dict(
+        type="OpenCpopTranscriptionToPhonemesDuration",
+        phonemes=phonemes,
+        transcription_path="dataset/transcriptions.txt",
+    ),
+    pitch_extractor=dict(
+        type="ParselMouthPitchExtractor",
+    ),
+)
+
+model = dict(
+    type="DiffSinger",
+    text_encoder=dict(
+        _delete_=True,
+        type="NaiveProjectionEncoder",
+        input_size=len(phonemes) * 2 + 2,
+        output_size=256,
+    ),
+    diffusion=dict(
+        max_beta=0.02,
+    ),
+)
+
+dataset = dict(
+    _delete_=True,
+    train=dict(
+        type="AudioFolderDataset",
+        path="dataset/diff-singer/train",
+        speaker_id=0,
+    ),
+    valid=dict(
+        type="AudioFolderDataset",
+        path="dataset/diff-singer/valid",
+        speaker_id=0,
+    ),
+)
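
Note: with the 64 phonemes listed above, the text encoder's input_size works out to 64 * 2 + 2 = 130. The factor of two and the extra scalars are presumably dictated by the OpenCpopTranscriptionToPhonemesDuration feature layout (e.g. paired phoneme one-hots plus duration features), which is defined elsewhere in the repo; the arithmetic itself is all this config fixes:

    # Sanity check of the derived encoder width (feature-layout guess, not confirmed):
    phoneme_count = 64                    # len(phonemes) above
    input_size = phoneme_count * 2 + 2    # == 130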
configs/train_my_config.py
ADDED
@@ -0,0 +1,4 @@
+_base_ = [
+    # Use a forward slash: a backslash path (".\svc_...") breaks outside Windows.
+    "./svc_cn_hubert_soft_finetune.py",
+]