shawnpi committed on
Commit
1cd928a
·
verified ·
1 Parent(s): 250b5b6

Upload 753 files

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +20 -0
  2. configs/hifigan_config.json +38 -0
  3. configs/hq_svc_infer.yaml +25 -0
  4. demo/singing/1.wav +3 -0
  5. demo/singing/2.wav +3 -0
  6. demo/speech/1.wav +3 -0
  7. demo/speech/2.wav +3 -0
  8. images/kon-new.gif +3 -0
  9. logger/__init__.py +0 -0
  10. logger/__pycache__/__init__.cpython-310.pyc +0 -0
  11. logger/__pycache__/__init__.cpython-38.pyc +0 -0
  12. logger/__pycache__/__init__.cpython-39.pyc +0 -0
  13. logger/__pycache__/saver.cpython-38.pyc +0 -0
  14. logger/__pycache__/saver.cpython-39.pyc +0 -0
  15. logger/__pycache__/utils.cpython-310.pyc +0 -0
  16. logger/__pycache__/utils.cpython-38.pyc +0 -0
  17. logger/__pycache__/utils.cpython-39.pyc +0 -0
  18. logger/saver.py +150 -0
  19. logger/utils.py +128 -0
  20. utils/Amphion/.github/CODE_OF_CONDUCT.md +132 -0
  21. utils/Amphion/.github/CONTRIBUTING.md +77 -0
  22. utils/Amphion/.github/ISSUE_TEMPLATE/bug_report.md +32 -0
  23. utils/Amphion/.github/ISSUE_TEMPLATE/docs_feedback.md +17 -0
  24. utils/Amphion/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  25. utils/Amphion/.github/ISSUE_TEMPLATE/help_wanted.md +32 -0
  26. utils/Amphion/.github/pull_request_template.md +32 -0
  27. utils/Amphion/.github/workflows/check_format.yml +12 -0
  28. utils/Amphion/.gitignore +64 -0
  29. utils/Amphion/Dockerfile +64 -0
  30. utils/Amphion/LICENSE +21 -0
  31. utils/Amphion/README.md +163 -0
  32. utils/Amphion/__init__.py +0 -0
  33. utils/Amphion/__pycache__/__init__.cpython-38.pyc +0 -0
  34. utils/Amphion/__pycache__/__init__.cpython-39.pyc +0 -0
  35. utils/Amphion/bins/calc_metrics.py +268 -0
  36. utils/Amphion/bins/svc/inference.py +265 -0
  37. utils/Amphion/bins/svc/preprocess.py +183 -0
  38. utils/Amphion/bins/svc/train.py +111 -0
  39. utils/Amphion/bins/tta/inference.py +94 -0
  40. utils/Amphion/bins/tta/preprocess.py +195 -0
  41. utils/Amphion/bins/tta/train_tta.py +77 -0
  42. utils/Amphion/bins/tts/inference.py +167 -0
  43. utils/Amphion/bins/tts/preprocess.py +244 -0
  44. utils/Amphion/bins/tts/train.py +111 -0
  45. utils/Amphion/bins/vocoder/inference.py +115 -0
  46. utils/Amphion/bins/vocoder/preprocess.py +151 -0
  47. utils/Amphion/bins/vocoder/train.py +93 -0
  48. utils/Amphion/config/audioldm.json +92 -0
  49. utils/Amphion/config/autoencoderkl.json +69 -0
  50. utils/Amphion/config/base.json +185 -0
.gitattributes CHANGED
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ demo/singing/1.wav filter=lfs diff=lfs merge=lfs -text
37
+ demo/singing/2.wav filter=lfs diff=lfs merge=lfs -text
38
+ demo/speech/1.wav filter=lfs diff=lfs merge=lfs -text
39
+ demo/speech/2.wav filter=lfs diff=lfs merge=lfs -text
40
+ images/kon-new.gif filter=lfs diff=lfs merge=lfs -text
41
+ utils/Amphion/egs/tts/VALLE/prompt_examples/260_123440_000010_000004.wav filter=lfs diff=lfs merge=lfs -text
42
+ utils/Amphion/egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.wav filter=lfs diff=lfs merge=lfs -text
43
+ utils/Amphion/egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.wav filter=lfs diff=lfs merge=lfs -text
44
+ utils/Amphion/egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.wav filter=lfs diff=lfs merge=lfs -text
45
+ utils/Amphion/imgs/ns3/ns3_facodec.png filter=lfs diff=lfs merge=lfs -text
46
+ utils/Amphion/imgs/ns3/ns3_overview.png filter=lfs diff=lfs merge=lfs -text
47
+ utils/Amphion/imgs/svc/DiffComoSVC.png filter=lfs diff=lfs merge=lfs -text
48
+ utils/Amphion/imgs/svc/MultipleContentsSVC.png filter=lfs diff=lfs merge=lfs -text
49
+ utils/Amphion/imgs/svc/pipeline.png filter=lfs diff=lfs merge=lfs -text
50
+ utils/Amphion/imgs/visualization/SingVisio_demo.png filter=lfs diff=lfs merge=lfs -text
51
+ utils/Amphion/imgs/visualization/SingVisio_system.png filter=lfs diff=lfs merge=lfs -text
52
+ utils/Amphion/imgs/vocoder/diffusion/pipeline.png filter=lfs diff=lfs merge=lfs -text
53
+ utils/Amphion/imgs/vocoder/gan/MSSBCQTD.png filter=lfs diff=lfs merge=lfs -text
54
+ utils/Amphion/visualization/SingVisio/System_Introduction_of_SingVisio.pdf filter=lfs diff=lfs merge=lfs -text
55
+ utils/pretrain/nsf_hifigan/model filter=lfs diff=lfs merge=lfs -text
configs/hifigan_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "resblock": "1",
3
+ "num_gpus": 4,
4
+ "batch_size": 10,
5
+ "learning_rate": 0.0002,
6
+ "adam_b1": 0.8,
7
+ "adam_b2": 0.99,
8
+ "lr_decay": 0.999,
9
+ "seed": 1234,
10
+
11
+ "upsample_rates": [ 8, 8, 2, 2, 2],
12
+ "upsample_kernel_sizes": [16,16, 4, 4, 4],
13
+ "upsample_initial_channel": 512,
14
+ "resblock_kernel_sizes": [3,7,11],
15
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16
+ "discriminator_periods": [3, 5, 7, 11, 17, 23, 37],
17
+
18
+ "segment_size": 16384,
19
+ "num_mels": 128,
20
+ "num_freq": 1025,
21
+ "n_fft" : 2048,
22
+ "hop_size": 512,
23
+ "win_size": 2048,
24
+
25
+ "sampling_rate": 44100,
26
+
27
+ "fmin": 40,
28
+ "fmax": 16000,
29
+ "fmax_for_loss": null,
30
+
31
+ "num_workers": 16,
32
+
33
+ "dist_config": {
34
+ "dist_backend": "nccl",
35
+ "dist_url": "tcp://localhost:54321",
36
+ "world_size": 1
37
+ }
38
+ }
configs/hq_svc_infer.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 推理环境配置
2
+ device: 'cuda'
3
+
4
+ # 模型路径
5
+ model_path: utils/pretrain/250000_step_val_loss_0.50.pth
6
+
7
+ # 核心模型参数 (必须保留,用于初始化网络结构)
8
+ use_tfm: True
9
+ mode: [film_mlp, infonce, pred_f0]
10
+
11
+ # 音频处理参数
12
+ sample_rate: 44100
13
+ encoder_sr: 16000
14
+ vocoder: 'nsf-hifigan'
15
+ hop_size: 256 # 建议保留,部分模型初始化需要显式指定步长
16
+
17
+ # 扩散模型推理设置
18
+ infer_speedup: 10
19
+ infer_method: 'dpm-solver'
20
+
21
+ # 特征提取配置
22
+ f0_extractor: rmvpe
23
+ block_size: 512
24
+ f0_min: 60
25
+ f0_max: 1200
demo/singing/1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c879c6ee5921229c98ba264869a1a6e502a6de197b3506e68a9de7771992a8a
3
+ size 900764
demo/singing/2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:961a715b14292591938047d0ad8448a7a10ff6cf3437f0cec44eb1539fc84216
3
+ size 877004
demo/speech/1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88def6269f9d345d9dc374f5797ff8f7997fd6cfc87e69077246d9b25b18c8d1
3
+ size 293804
demo/speech/2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bca0e22549e1ff358002b672b5301f0275d39b9eeced7cb527ea64a001f572a3
3
+ size 733868
images/kon-new.gif ADDED

Git LFS Details

  • SHA256: 8dcd62795f2818d6bdc085b405ef61c28614ffee98359dbd7b3cfe56faf0856e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.6 MB
logger/__init__.py ADDED
File without changes
logger/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (136 Bytes). View file
 
logger/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (147 Bytes). View file
 
logger/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (134 Bytes). View file
 
logger/__pycache__/saver.cpython-38.pyc ADDED
Binary file (3.72 kB). View file
 
logger/__pycache__/saver.cpython-39.pyc ADDED
Binary file (3.73 kB). View file
 
logger/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.97 kB). View file
 
logger/__pycache__/utils.cpython-38.pyc ADDED
Binary file (3.91 kB). View file
 
logger/__pycache__/utils.cpython-39.pyc ADDED
Binary file (3.93 kB). View file
 
logger/saver.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ author: wayn391@mastertones
3
+ '''
4
+
5
+ import os
6
+ import json
7
+ import time
8
+ import yaml
9
+ import datetime
10
+ import torch
11
+ import matplotlib.pyplot as plt
12
+ from . import utils
13
+ import numpy as np
14
+ from torch.utils.tensorboard import SummaryWriter
15
+ class Saver(object):
16
+ def __init__(
17
+ self,
18
+ args,
19
+ initial_global_step=0):
20
+
21
+ # cold start
22
+ self.global_step = initial_global_step
23
+ self.init_time = time.time()
24
+ self.last_time = time.time()
25
+ self.log_dir = args.log_dir
26
+ self.sample_rate = args.sample_rate
27
+
28
+ # ckpt
29
+ os.makedirs(self.log_dir, exist_ok=True)
30
+
31
+ # writer
32
+ self.writer = SummaryWriter(self.log_dir)
33
+
34
+
35
+ def log_info(self, msg):
36
+ '''log method'''
37
+ if isinstance(msg, dict):
38
+ msg_list = []
39
+ for k, v in msg.items():
40
+ tmp_str = ''
41
+ if isinstance(v, int):
42
+ tmp_str = '{}: {:,}'.format(k, v)
43
+ else:
44
+ tmp_str = '{}: {}'.format(k, v)
45
+
46
+ msg_list.append(tmp_str)
47
+ msg_str = '\n'.join(msg_list)
48
+ else:
49
+ msg_str = msg
50
+
51
+ # display
52
+ print(msg_str)
53
+
54
+ # save
55
+ with open(self.path_log_info, 'a') as fp:
56
+ fp.write(msg_str+'\n')
57
+
58
+ def log_value(self, dict):
59
+ for k, v in dict.items():
60
+ self.writer.add_scalar(k, v, self.global_step)
61
+
62
+ def log_spec(self, name, spec, vmin=-14, vmax=3.5):
63
+ # 检查 spec 是否为 Tensor,并转换为 numpy
64
+ if isinstance(spec, torch.Tensor):
65
+ spec = spec.cpu().numpy()
66
+
67
+ # 为 spec 绘制图像
68
+ fig = plt.figure(figsize=(12, 6))
69
+ # font_path = 'SimHei' # 或者字体的绝对路径
70
+ # font_prop = FontProperties(fname=font_path, size=14)
71
+ plt.imshow(spec, aspect='auto', vmin=vmin, vmax=vmax)
72
+ plt.colorbar()
73
+ # plt.title(name, fontproperties=font_prop)
74
+ plt.gca().invert_yaxis() # 反转y轴
75
+ plt.tight_layout()
76
+
77
+ # 将图像添加到 TensorBoard
78
+ self.writer.add_figure(name, fig, self.global_step)
79
+
80
+ # 关闭图形以释放资源
81
+ plt.close(fig)
82
+
83
+ def log_audio(self, dict):
84
+ for k, v in dict.items():
85
+ self.writer.add_audio(k, v, global_step=self.global_step, sample_rate=self.sample_rate)
86
+
87
+ def get_interval_time(self, update=True):
88
+ cur_time = time.time()
89
+ time_interval = cur_time - self.last_time
90
+ if update:
91
+ self.last_time = cur_time
92
+ return time_interval
93
+
94
+ def get_total_time(self, to_str=True):
95
+ total_time = time.time() - self.init_time
96
+ if to_str:
97
+ total_time = str(datetime.timedelta(
98
+ seconds=total_time))[:-5]
99
+ return total_time
100
+
101
+ def save_model(
102
+ self,
103
+ model,
104
+ optimizer,
105
+ name='model',
106
+ postfix='',
107
+ to_json=False):
108
+ # os.makedirs(os.path.join(self.expdir), exist_ok=True)
109
+ # path
110
+ if postfix:
111
+ postfix = '_' + postfix
112
+ path_pt = os.path.join(
113
+ self.log_dir , name+postfix+'.pt')
114
+
115
+ # check
116
+ print(' [*] model checkpoint saved: {}'.format(path_pt))
117
+
118
+ # save
119
+ if optimizer is not None:
120
+ torch.save({
121
+ 'global_step': self.global_step,
122
+ 'model': model.state_dict(),
123
+ 'optimizer': optimizer.state_dict()}, path_pt)
124
+ else:
125
+ torch.save({
126
+ 'global_step': self.global_step,
127
+ 'model': model.state_dict()}, path_pt)
128
+
129
+ # to json
130
+ # if to_json:
131
+ # path_json = os.path.join(
132
+ # self.expdir , name+'.json')
133
+ # utils.to_json(path_params, path_json)
134
+
135
+ def delete_model(self, name='model', postfix=''):
136
+ # path
137
+ if postfix:
138
+ postfix = '_' + postfix
139
+ path_pt = os.path.join(
140
+ self.expdir , name+postfix+'.pt')
141
+
142
+ # delete
143
+ if os.path.exists(path_pt):
144
+ os.remove(path_pt)
145
+ print(' [*] model checkpoint deleted: {}'.format(path_pt))
146
+
147
+ def global_step_increment(self):
148
+ self.global_step += 1
149
+
150
+
logger/utils.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import json
4
+ import pickle
5
+ import torch
6
+ import shutil
7
+
8
+ def traverse_dir(
9
+ root_dir,
10
+ extensions,
11
+ amount=None,
12
+ str_include=None,
13
+ str_exclude=None,
14
+ is_pure=False,
15
+ is_sort=False,
16
+ is_ext=True):
17
+
18
+ file_list = []
19
+ cnt = 0
20
+ for root, _, files in os.walk(root_dir):
21
+ for file in files:
22
+ if any([file.endswith(f".{ext}") for ext in extensions]):
23
+ # path
24
+ mix_path = os.path.join(root, file)
25
+ pure_path = mix_path[len(root_dir)+1:] if is_pure else mix_path
26
+
27
+ # amount
28
+ if (amount is not None) and (cnt == amount):
29
+ if is_sort:
30
+ file_list.sort()
31
+ return file_list
32
+
33
+ # check string
34
+ if (str_include is not None) and (str_include not in pure_path):
35
+ continue
36
+ if (str_exclude is not None) and (str_exclude in pure_path):
37
+ continue
38
+
39
+ if not is_ext:
40
+ ext = pure_path.split('.')[-1]
41
+ pure_path = pure_path[:-(len(ext)+1)]
42
+ file_list.append(pure_path)
43
+ cnt += 1
44
+ if is_sort:
45
+ file_list.sort()
46
+ return file_list
47
+
48
+
49
+
50
+ class DotDict(dict):
51
+ def __getattr__(*args):
52
+ val = dict.get(*args)
53
+ return DotDict(val) if type(val) is dict else val
54
+
55
+ __setattr__ = dict.__setitem__
56
+ __delattr__ = dict.__delitem__
57
+
58
+
59
+ def get_network_paras_amount(model_dict):
60
+ info = dict()
61
+ for model_name, model in model_dict.items():
62
+ # all_params = sum(p.numel() for p in model.parameters())
63
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
64
+
65
+ info[model_name] = trainable_params
66
+ return info
67
+
68
+
69
+ def load_config(path_config):
70
+ with open(path_config, "r") as config:
71
+ args = yaml.safe_load(config)
72
+ args = DotDict(args)
73
+ # print(args)
74
+ return args
75
+
76
+ def save_config(path_config, output_dir):
77
+ file_name = path_config.split('/')[-1]
78
+ output_path = os.path.join(output_dir, file_name)
79
+ shutil.copy(path_config, output_path)
80
+ print(f" [*] Save config to {output_path}")
81
+
82
+ def to_json(path_params, path_json):
83
+ params = torch.load(path_params, map_location=torch.device('cpu'))
84
+ raw_state_dict = {}
85
+ for k, v in params.items():
86
+ val = v.flatten().numpy().tolist()
87
+ raw_state_dict[k] = val
88
+
89
+ with open(path_json, 'w') as outfile:
90
+ json.dump(raw_state_dict, outfile,indent= "\t")
91
+
92
+
93
+ def convert_tensor_to_numpy(tensor, is_squeeze=True):
94
+ if is_squeeze:
95
+ tensor = tensor.squeeze()
96
+ if tensor.requires_grad:
97
+ tensor = tensor.detach()
98
+ if tensor.is_cuda:
99
+ tensor = tensor.cpu()
100
+ return tensor.numpy()
101
+
102
+
103
+ def load_model(
104
+ expdir,
105
+ model,
106
+ optimizer,
107
+ name='model',
108
+ postfix='',
109
+ device='cpu'):
110
+ if postfix == '':
111
+ postfix = '_' + postfix
112
+ path = os.path.join(expdir, name+postfix)
113
+ path_pt = traverse_dir(expdir, ['pt'], is_ext=False)
114
+ global_step = 0
115
+ if len(path_pt) > 0:
116
+ steps = [s[len(path):] for s in path_pt]
117
+ maxstep = max([int(s) if s.isdigit() else 0 for s in steps])
118
+ if maxstep >= 0:
119
+ path_pt = path+str(maxstep)+'.pt'
120
+ else:
121
+ path_pt = path+'best.pt'
122
+ print(' [*] restoring model from', path_pt)
123
+ ckpt = torch.load(path_pt, map_location=torch.device(device))
124
+ global_step = ckpt['global_step']
125
+ model.load_state_dict(ckpt['model'], strict=False)
126
+ if ckpt.get('optimizer') != None:
127
+ optimizer.load_state_dict(ckpt['optimizer'])
128
+ return global_step, model, optimizer
utils/Amphion/.github/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Contributor Covenant Code of Conduct
3
+
4
+ ## Our Pledge
5
+
6
+ We as members, contributors, and leaders pledge to make participation in our
7
+ community a harassment-free experience for everyone, regardless of age, body
8
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
9
+ identity and expression, level of experience, education, socio-economic status,
10
+ nationality, personal appearance, race, caste, color, religion, or sexual
11
+ identity and orientation.
12
+
13
+ We pledge to act and interact in ways that contribute to an open, welcoming,
14
+ diverse, inclusive, and healthy community.
15
+
16
+ ## Our Standards
17
+
18
+ Examples of behavior that contributes to a positive environment for our
19
+ community include:
20
+
21
+ * Demonstrating empathy and kindness toward other people
22
+ * Being respectful of differing opinions, viewpoints, and experiences
23
+ * Giving and gracefully accepting constructive feedback
24
+ * Accepting responsibility and apologizing to those affected by our mistakes,
25
+ and learning from the experience
26
+ * Focusing on what is best not just for us as individuals, but for the overall
27
+ community
28
+
29
+ Examples of unacceptable behavior include:
30
+
31
+ * The use of sexualized language or imagery, and sexual attention or advances of
32
+ any kind
33
+ * Trolling, insulting or derogatory comments, and personal or political attacks
34
+ * Public or private harassment
35
+ * Publishing others' private information, such as a physical or email address,
36
+ without their explicit permission
37
+ * Other conduct which could reasonably be considered inappropriate in a
38
+ professional setting
39
+
40
+ ## Enforcement Responsibilities
41
+
42
+ Community leaders are responsible for clarifying and enforcing our standards of
43
+ acceptable behavior and will take appropriate and fair corrective action in
44
+ response to any behavior that they deem inappropriate, threatening, offensive,
45
+ or harmful.
46
+
47
+ Community leaders have the right and responsibility to remove, edit, or reject
48
+ comments, commits, code, wiki edits, issues, and other contributions that are
49
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
50
+ decisions when appropriate.
51
+
52
+ ## Scope
53
+
54
+ This Code of Conduct applies within all community spaces, and also applies when
55
+ an individual is officially representing the community in public spaces.
56
+ Examples of representing our community include using an official email address,
57
+ posting via an official social media account, or acting as an appointed
58
+ representative at an online or offline event.
59
+
60
+ ## Enforcement
61
+
62
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
63
+ reported to the community leaders responsible for enforcement.
64
+ All complaints will be reviewed and investigated promptly and fairly.
65
+
66
+ All community leaders are obligated to respect the privacy and security of the
67
+ reporter of any incident.
68
+
69
+ ## Enforcement Guidelines
70
+
71
+ Community leaders will follow these Community Impact Guidelines in determining
72
+ the consequences for any action they deem in violation of this Code of Conduct:
73
+
74
+ ### 1. Correction
75
+
76
+ **Community Impact**: Use of inappropriate language or other behavior deemed
77
+ unprofessional or unwelcome in the community.
78
+
79
+ **Consequence**: A private, written warning from community leaders, providing
80
+ clarity around the nature of the violation and an explanation of why the
81
+ behavior was inappropriate. A public apology may be requested.
82
+
83
+ ### 2. Warning
84
+
85
+ **Community Impact**: A violation through a single incident or series of
86
+ actions.
87
+
88
+ **Consequence**: A warning with consequences for continued behavior. No
89
+ interaction with the people involved, including unsolicited interaction with
90
+ those enforcing the Code of Conduct, for a specified period of time. This
91
+ includes avoiding interactions in community spaces as well as external channels
92
+ like social media. Violating these terms may lead to a temporary or permanent
93
+ ban.
94
+
95
+ ### 3. Temporary Ban
96
+
97
+ **Community Impact**: A serious violation of community standards, including
98
+ sustained inappropriate behavior.
99
+
100
+ **Consequence**: A temporary ban from any sort of interaction or public
101
+ communication with the community for a specified period of time. No public or
102
+ private interaction with the people involved, including unsolicited interaction
103
+ with those enforcing the Code of Conduct, is allowed during this period.
104
+ Violating these terms may lead to a permanent ban.
105
+
106
+ ### 4. Permanent Ban
107
+
108
+ **Community Impact**: Demonstrating a pattern of violation of community
109
+ standards, including sustained inappropriate behavior, harassment of an
110
+ individual, or aggression toward or disparagement of classes of individuals.
111
+
112
+ **Consequence**: A permanent ban from any sort of public interaction within the
113
+ community.
114
+
115
+ ## Attribution
116
+
117
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118
+ version 2.1, available at
119
+ [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
120
+
121
+ Community Impact Guidelines were inspired by
122
+ [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
123
+
124
+ For answers to common questions about this code of conduct, see the FAQ at
125
+ [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
126
+ [https://www.contributor-covenant.org/translations][translations].
127
+
128
+ [homepage]: https://www.contributor-covenant.org
129
+ [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
130
+ [Mozilla CoC]: https://github.com/mozilla/diversity
131
+ [FAQ]: https://www.contributor-covenant.org/faq
132
+ [translations]: https://www.contributor-covenant.org/translations
utils/Amphion/.github/CONTRIBUTING.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Welcome to the Amphion Community!
2
+
3
+ We greatly appreciate your interest in contributing to Amphion. Your involvement plays a pivotal role in our collective growth, and we are dedicated to nurturing a cooperative and inclusive space for all contributors. To ensure a respectful and productive atmosphere, all contributors must adhere to the Amphion [Code of Conduct](CODE_OF_CONDUCT.md).
4
+
5
+ ## Contributions
6
+
7
+ All kinds of contributions are welcome, including but not limited to:
8
+ - **Issue Reporting**: Report bugs or suggest features through GitHub Issues.
9
+ - **Bug Fixes**: Identify and rectify software issues to boost functionality.
10
+ - **Developing New Features**: Bring innovation and impactful enhancements to Amphion.
11
+ - **Implementing New Checkpoints**: Introduce checkpoints to optimize workflows.
12
+
13
+ - **Recipe Contributions**: Share your unique and practical coding solutions.
14
+ - **Diverse Contributions**: Your participation isn't limited! Contribute to documentation, community support, and more.
15
+
16
+ ## How to Contribute
17
+ 1. **Fork the Repository**: Start by forking the Amphion repository on GitHub.
18
+ 2. **Clone Your Fork**: Localize your fork on your development machine.
19
+ 3. **Create a Branch**: Initiate a new branch for your changes.
20
+ 4. **Test Your Changes**: Ensure compatibility and non-disruption of your updates.
21
+ 5. **Commit Your Changes**: Make small, focused commits with clear descriptions.
22
+ 6. **Update Your Fork**: Upload your modifications to your GitHub fork.
23
+ 7. **Open a Pull Request**: Suggest a pull request from your fork to the main Amphion repository with our [Pull Request Template](pull_request_template.md).
24
+ 8. **Participate in Code Reviews**: Collaborate with reviewers and address their feedback.
25
+
26
+ ## Coding Standards
27
+ - **License Headers**: Each new code file should include license headers.
28
+ - **Style Consistency**: Align with the project's existing coding style.
29
+ - **Code Quality**: Aim for clarity, maintainability, and efficiency.
30
+ - **Clear Commenting**: Describe the purpose and usage of each function and other crucial code segments.
31
+ - **Code Formatting**:
32
+ - Install 'black' formatter: `pip install black`.
33
+ - Format files: `black file.py`.
34
+ - Format directories: `black directory/`.
35
+
36
+ ## Contributor Agreement
37
+ By contributing to Amphion, you agree to abide by our Code of Conduct, and the Developer Certificate of Origin, Version 1.1:
38
+
39
+ ```
40
+ Developer Certificate of Origin
41
+ Version 1.1
42
+
43
+ Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
44
+
45
+ Everyone is permitted to copy and distribute verbatim copies of this
46
+ license document, but changing it is not allowed.
47
+
48
+
49
+ Developer's Certificate of Origin 1.1
50
+
51
+ By making a contribution to this project, I certify that:
52
+
53
+ (a) The contribution was created in whole or in part by me and I
54
+ have the right to submit it under the open source license
55
+ indicated in the file; or
56
+
57
+ (b) The contribution is based upon previous work that, to the best
58
+ of my knowledge, is covered under an appropriate open source
59
+ license and I have the right under that license to submit that
60
+ work with modifications, whether created in whole or in part
61
+ by me, under the same open source license (unless I am
62
+ permitted to submit under a different license), as indicated
63
+ in the file; or
64
+
65
+ (c) The contribution was provided directly to me by some other
66
+ person who certified (a), (b) or (c) and I have not modified
67
+ it.
68
+
69
+ (d) I understand and agree that this project and the contribution
70
+ are public and that a record of the contribution (including all
71
+ personal information I submit with it, including my sign-off) is
72
+ maintained indefinitely and may be redistributed consistent with
73
+ this project or the open source license(s) involved.
74
+ ```
75
+
76
+ ## Need Help?
77
+ For any queries or support, feel free to open an issue for community discussions and help.
utils/Amphion/.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Bug report
3
+ about: Create a report to help us improve Amphion.
4
+ title: "[BUG]: "
5
+ labels: 'bug'
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ ## Describe the bug
11
+ (A clear and concise description of what the bug is.)
12
+
13
+ ## How To Reproduce
14
+ Steps to reproduce the behavior:
15
+ 1. Config/File changes: ...
16
+ 2. Run command: ...
17
+ 3. See error: ...
18
+
19
+ ## Expected behavior
20
+ (A clear and concise description of what you expected to happen.)
21
+
22
+ ## Screenshots
23
+ (If applicable, add screenshots to help explain your problem.)
24
+
25
+ ## Environment Information
26
+ - Operating System: [e.g. Ubuntu 20.04.5 LTS]
27
+ - Python Version: [e.g. Python 3.9.15]
28
+ - Driver & CUDA Version: [e.g. Driver 470.103.01 & CUDA 11.4]
29
+ - Error Messages and Logs: [If applicable, provide any error messages or relevant log outputs]
30
+
31
+ ## Additional context
32
+ (Add any other context about the problem here.)
utils/Amphion/.github/ISSUE_TEMPLATE/docs_feedback.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Docs feedback
3
+ about: Improve documentation about Amphion.
4
+ title: "[Docs]: "
5
+ labels: 'documentation'
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ ## Documentation Reference
11
+ (Path/Link to the documentation file)
12
+
13
+ ## Feedback on documentation
14
+ (Your suggestions to the documentation. e.g., accuracy, complex explanations, structural organization, practical examples, technical reliability, and consistency)
15
+
16
+ ## Additional context
17
+ (Add any other context or screenshots about the documentation here.)
utils/Amphion/.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea for Amphion.
4
+ title: "[Feature]: "
5
+ labels: 'enhancement'
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ ## Is your feature request related to a problem? Please describe.
11
+ (A clear and concise description of what the problem is.)
12
+
13
+ ## Describe the solution you'd like
14
+ (A clear and concise description of what you want to happen.)
15
+
16
+ ## Describe alternatives you've considered
17
+ (A clear and concise description of any alternative solutions or features you've considered.)
18
+
19
+ ## Additional context
20
+ (Add any other context or screenshots about the feature request here.)
utils/Amphion/.github/ISSUE_TEMPLATE/help_wanted.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Help wanted
3
+ about: Want help from Amphion team.
4
+ title: "[Help]: "
5
+ labels: 'help wanted'
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ ## Problem Overview
11
+ (Briefly and clearly describe the issue you're facing and seeking help with.)
12
+
13
+ ## Steps Taken
14
+ (Detail your attempts to resolve the issue, including any relevant steps or processes.)
15
+ 1. Config/File changes: ...
16
+ 2. Run command: ...
17
+ 3. See errors: ...
18
+
19
+ ## Expected Outcome
20
+ (A clear and concise description of what you expected to happen.)
21
+
22
+ ## Screenshots
23
+ (If applicable, add screenshots to help explain your problem.)
24
+
25
+ ## Environment Information
26
+ - Operating System: [e.g. Ubuntu 20.04.5 LTS]
27
+ - Python Version: [e.g. Python 3.9.15]
28
+ - Driver & CUDA Version: [e.g. Driver 470.103.01 & CUDA 11.4]
29
+ - Error Messages and Logs: [If applicable, provide any error messages or relevant log outputs]
30
+
31
+ ## Additional context
32
+ (Add any other context about the problem here.)
utils/Amphion/.github/pull_request_template.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## ✨ Description
3
+
4
+ [Please describe the background, purpose, changes made, and how to test this PR]
5
+
6
+ ## 🚧 Related Issues
7
+
8
+ [List the issue numbers related to this PR]
9
+
10
+ ## 👨‍💻 Changes Proposed
11
+
12
+ - [ ] change1
13
+ - [ ] ...
14
+
15
+ ## 🧑‍🤝‍🧑 Who Can Review?
16
+
17
+ [Please use the '@' symbol to mention any community member who is free to review the PR once the tests have passed. Feel free to tag members or contributors who might be interested in your PR.]
18
+
19
+ ## 🛠 TODO
20
+
21
+ - [ ] task1
22
+ - [ ] ...
23
+
24
+ ## ✅ Checklist
25
+
26
+ - [ ] Code has been reviewed
27
+ - [ ] Code complies with the project's code standards and best practices
28
+ - [ ] Code has passed all tests
29
+ - [ ] Code does not affect the normal use of existing features
30
+ - [ ] Code has been commented properly
31
+ - [ ] Documentation has been updated (if applicable)
32
+ - [ ] Demo/checkpoint has been attached (if applicable)
utils/Amphion/.github/workflows/check_format.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Check Format
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ CheckCodeFormat:
7
+ runs-on: ubuntu-latest
8
+ steps:
9
+ - uses: actions/checkout@v3
10
+ - uses: psf/black@stable
11
+ with:
12
+ options: "--check --diff --color"
utils/Amphion/.gitignore ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Mac OS files
2
+ .DS_Store
3
+
4
+ # IDEs
5
+ .idea
6
+ .vs
7
+ .vscode
8
+ .cache
9
+
10
+ # GitHub files
11
+ .github
12
+
13
+ # Byte-compiled / optimized / DLL / cached files
14
+ __pycache__/
15
+ *.py[cod]
16
+ *$py.class
17
+ *.pyc
18
+ .temp
19
+ *.c
20
+ *.so
21
+ *.o
22
+
23
+ # Developing mode
24
+ _*.sh
25
+ _*.json
26
+ *.lst
27
+ yard*
28
+ *.out
29
+ evaluation/evalset_selection
30
+ mfa
31
+ egs/svc/*wavmark
32
+ egs/svc/custom
33
+ egs/svc/*/dev*
34
+ egs/svc/dev_exp_config.json
35
+ egs/svc/dev
36
+ bins/svc/demo*
37
+ bins/svc/preprocess_custom.py
38
+ data
39
+ ckpts
40
+
41
+ # Data and ckpt
42
+ *.pkl
43
+ *.pt
44
+ *.npy
45
+ *.npz
46
+ *.tar.gz
47
+ *.ckpt
48
+ *.wav
49
+ *.flac
50
+ pretrained/wenet/*conformer_exp
51
+ pretrained/bigvgan/args.json
52
+ !egs/tts/VALLE/prompt_examples/*.wav
53
+
54
+ # Runtime data dirs
55
+ processed_data
56
+ data
57
+ model_ckpt
58
+ logs
59
+ *.ipynb
60
+ *.lst
61
+ source_audio
62
+ result
63
+ conversion_results
64
+ get_available_gpu.py
utils/Amphion/Dockerfile ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # Other version: https://hub.docker.com/r/nvidia/cuda/tags
7
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu18.04
8
+
9
+ ARG DEBIAN_FRONTEND=noninteractive
10
+ ARG PYTORCH='2.0.0'
11
+ ARG CUDA='cu118'
12
+ ARG SHELL='/bin/bash'
13
+ ARG MINICONDA='Miniconda3-py39_23.3.1-0-Linux-x86_64.sh'
14
+
15
+ ENV LANG=en_US.UTF-8 PYTHONIOENCODING=utf-8 PYTHONDONTWRITEBYTECODE=1 CUDA_HOME=/usr/local/cuda CONDA_HOME=/opt/conda SHELL=${SHELL}
16
+ ENV PATH=$CONDA_HOME/bin:$CUDA_HOME/bin:$PATH \
17
+ LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH \
18
+ LIBRARY_PATH=$CUDA_HOME/lib64:$LIBRARY_PATH \
19
+ CONDA_PREFIX=$CONDA_HOME \
20
+ NCCL_HOME=$CUDA_HOME
21
+
22
+ # Install ubuntu packages
23
+ RUN sed -i 's/archive.ubuntu.com/mirrors.cloud.tencent.com/g' /etc/apt/sources.list \
24
+ && sed -i 's/security.ubuntu.com/mirrors.cloud.tencent.com/g' /etc/apt/sources.list \
25
+ && rm /etc/apt/sources.list.d/cuda.list \
26
+ && apt-get update \
27
+ && apt-get -y install \
28
+ python3-pip ffmpeg git less wget libsm6 libxext6 libxrender-dev \
29
+ build-essential cmake pkg-config libx11-dev libatlas-base-dev \
30
+ libgtk-3-dev libboost-python-dev vim libgl1-mesa-glx \
31
+ libaio-dev software-properties-common tmux \
32
+ espeak-ng
33
+
34
+ # Install miniconda with python 3.9
35
+ USER root
36
+ # COPY Miniconda3-py39_23.3.1-0-Linux-x86_64.sh /root/anaconda.sh
37
+ RUN wget -t 0 -c -O /tmp/anaconda.sh https://repo.anaconda.com/miniconda/${MINICONDA} \
38
+ && mv /tmp/anaconda.sh /root/anaconda.sh \
39
+ && ${SHELL} /root/anaconda.sh -b -p $CONDA_HOME \
40
+ && rm /root/anaconda.sh
41
+
42
+ RUN conda create -y --name amphion python=3.9.15
43
+
44
+ WORKDIR /app
45
+ COPY env.sh env.sh
46
+ RUN chmod +x ./env.sh
47
+
48
+ RUN ["conda", "run", "-n", "amphion", "-vvv", "--no-capture-output", "./env.sh"]
49
+
50
+ RUN conda init \
51
+ && echo "\nconda activate amphion\n" >> ~/.bashrc
52
+
53
+ CMD ["/bin/bash"]
54
+
55
+ # *** Build ***
56
+ # docker build -t realamphion/amphion .
57
+
58
+ # *** Run ***
59
+ # cd Amphion
60
+ # docker run --runtime=nvidia --gpus all -it -v .:/app -v /mnt:/mnt_host realamphion/amphion
61
+
62
+ # *** Push and release ***
63
+ # docker login
64
+ # docker push realamphion/amphion
utils/Amphion/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Amphion
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
utils/Amphion/README.md ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Amphion: An Open-Source Audio, Music, and Speech Generation Toolkit
2
+
3
+ <div>
4
+ <a href="https://arxiv.org/abs/2312.09911"><img src="https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg"></a>
5
+ <a href="https://huggingface.co/amphion"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Amphion-pink"></a>
6
+ <a href="https://openxlab.org.cn/usercenter/Amphion"><img src="https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg"></a>
7
+ <a href="egs/tts/README.md"><img src="https://img.shields.io/badge/README-TTS-blue"></a>
8
+ <a href="egs/svc/README.md"><img src="https://img.shields.io/badge/README-SVC-blue"></a>
9
+ <a href="egs/tta/README.md"><img src="https://img.shields.io/badge/README-TTA-blue"></a>
10
+ <a href="egs/vocoder/README.md"><img src="https://img.shields.io/badge/README-Vocoder-purple"></a>
11
+ <a href="egs/metrics/README.md"><img src="https://img.shields.io/badge/README-Evaluation-yellow"></a>
12
+ <a href="LICENSE"><img src="https://img.shields.io/badge/LICENSE-MIT-red"></a>
13
+ </div>
14
+ <br>
15
+
16
+ **Amphion (/æmˈfaɪən/) is a toolkit for Audio, Music, and Speech Generation.** Its purpose is to support reproducible research and help junior researchers and engineers get started in the field of audio, music, and speech generation research and development. Amphion offers a unique feature: **visualizations** of classic models or architectures. We believe that these visualizations are beneficial for junior researchers and engineers who wish to gain a better understanding of the model.
17
+
18
+ **The North-Star objective of Amphion is to offer a platform for studying the conversion of any inputs into audio.** Amphion is designed to support individual generation tasks, including but not limited to,
19
+
20
+ - **TTS**: Text to Speech (⛳ supported)
21
+ - **SVS**: Singing Voice Synthesis (👨‍💻 developing)
22
+ - **VC**: Voice Conversion (👨‍💻 developing)
23
+ - **SVC**: Singing Voice Conversion (⛳ supported)
24
+ - **TTA**: Text to Audio (⛳ supported)
25
+ - **TTM**: Text to Music (👨‍💻 developing)
26
+ - more…
27
+
28
+ In addition to the specific generation tasks, Amphion also includes several **vocoders** and **evaluation metrics**. A vocoder is an important module for producing high-quality audio signals, while evaluation metrics are critical for ensuring consistent metrics in generation tasks.
29
+
30
+ Here is the Amphion v0.1 demo, whose voice, audio effects, and singing voice are generated by our models. Just enjoy it!
31
+
32
+ [amphion-v0.1-en](https://github.com/open-mmlab/Amphion/assets/24860155/7fcdcea5-3d95-4b31-bd93-4b4da734ef9b
33
+ )
34
+
35
+ ## 🚀 News
36
+ - **2024/03/12**: Amphion now support **NaturalSpeech3 FACodec** and release pretrained checkpoints. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2403.03100) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-model-yellow)](https://huggingface.co/amphion/naturalspeech3_facodec) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-demo-pink)](https://huggingface.co/spaces/amphion/naturalspeech3_facodec) [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](models/codec/ns3_codec/README.md)
37
+ - **2024/02/22**: The first Amphion visualization tool, **SingVisio**, release. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2402.12660) [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/SingVisio) [![Video](https://img.shields.io/badge/Video-Demo-orange)](https://github.com/open-mmlab/Amphion/assets/33707885/0a6e39e8-d5f1-4288-b0f8-32da5a2d6e96) [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](egs/visualization/SingVisio/README.md)
38
+ - **2023/12/18**: Amphion v0.1 release. [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2312.09911) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Amphion-pink)](https://huggingface.co/amphion) [![youtube](https://img.shields.io/badge/YouTube-Demo-red)](https://www.youtube.com/watch?v=1aw0HhcggvQ) [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](https://github.com/open-mmlab/Amphion/pull/39)
39
+ - **2023/11/28**: Amphion alpha release. [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](https://github.com/open-mmlab/Amphion/pull/2)
40
+
41
+ ## ⭐ Key Features
42
+
43
+ ### TTS: Text to Speech
44
+
45
+ - Amphion achieves state-of-the-art performance when compared with existing open-source repositories on text-to-speech (TTS) systems. It supports the following models or architectures:
46
+ - [FastSpeech2](https://arxiv.org/abs/2006.04558): A non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks.
47
+ - [VITS](https://arxiv.org/abs/2106.06103): An end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning
48
+ - [Vall-E](https://arxiv.org/abs/2301.02111): A zero-shot TTS architecture that uses a neural codec language model with discrete codes.
49
+ - [NaturalSpeech2](https://arxiv.org/abs/2304.09116): An architecture for TTS that utilizes a latent diffusion model to generate natural-sounding voices.
50
+
51
+ ### SVC: Singing Voice Conversion
52
+
53
+ - Amphion supports multiple content-based features from various pretrained models, including [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec). Their specific roles in SVC have been investigated in our NeurIPS 2023 workshop paper. [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160) [![code](https://img.shields.io/badge/README-Code-red)](egs/svc/MultipleContentsSVC)
54
+ - Amphion implements several state-of-the-art model architectures, including diffusion-, transformer-, VAE- and flow-based models. The diffusion-based architecture uses [Bidirectional dilated CNN](https://openreview.net/pdf?id=a-xFK8Ymz5J) as a backend and supports several sampling algorithms such as [DDPM](https://arxiv.org/pdf/2006.11239.pdf), [DDIM](https://arxiv.org/pdf/2010.02502.pdf), and [PNDM](https://arxiv.org/pdf/2202.09778.pdf). Additionally, it supports single-step inference based on the [Consistency Model](https://openreview.net/pdf?id=FmqFfMTNnv).
55
+
56
+ ### TTA: Text to Audio
57
+
58
+ - Amphion supports the TTA with a latent diffusion model. It is designed like [AudioLDM](https://arxiv.org/abs/2301.12503), [Make-an-Audio](https://arxiv.org/abs/2301.12661), and [AUDIT](https://arxiv.org/abs/2304.00830). It is also the official implementation of the text-to-audio generation part of our NeurIPS 2023 paper. [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2304.00830) [![code](https://img.shields.io/badge/README-Code-red)](egs/tta/RECIPE.md)
59
+
60
+ ### Vocoder
61
+
62
+ - Amphion supports various widely-used neural vocoders, including:
63
+ - GAN-based vocoders: [MelGAN](https://arxiv.org/abs/1910.06711), [HiFi-GAN](https://arxiv.org/abs/2010.05646), [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts), [BigVGAN](https://arxiv.org/abs/2206.04658), [APNet](https://arxiv.org/abs/2305.07952).
64
+ - Flow-based vocoders: [WaveGlow](https://arxiv.org/abs/1811.00002).
65
+ - Diffusion-based vocoders: [Diffwave](https://arxiv.org/abs/2009.09761).
66
+ - Auto-regressive based vocoders: [WaveNet](https://arxiv.org/abs/1609.03499), [WaveRNN](https://arxiv.org/abs/1802.08435v1).
67
+ - Amphion provides the official implementation of [Multi-Scale Constant-Q Transform Discriminator](https://arxiv.org/abs/2311.14957) (our ICASSP 2024 paper). It can be used to enhance any architecture GAN-based vocoders during training, and keep the inference stage (such as memory or speed) unchanged. [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2311.14957) [![code](https://img.shields.io/badge/README-Code-red)](egs/vocoder/gan/tfr_enhanced_hifigan)
68
+
69
+ ### Evaluation
70
+
71
+ Amphion provides a comprehensive objective evaluation of the generated audio. The evaluation metrics contain:
72
+
73
+ - **F0 Modeling**: F0 Pearson Coefficients, F0 Periodicity Root Mean Square Error, F0 Root Mean Square Error, Voiced/Unvoiced F1 Score, etc.
74
+ - **Energy Modeling**: Energy Root Mean Square Error, Energy Pearson Coefficients, etc.
75
+ - **Intelligibility**: Character/Word Error Rate, which can be calculated based on [Whisper](https://github.com/openai/whisper) and more.
76
+ - **Spectrogram Distortion**: Frechet Audio Distance (FAD), Mel Cepstral Distortion (MCD), Multi-Resolution STFT Distance (MSTFT), Perceptual Evaluation of Speech Quality (PESQ), Short Time Objective Intelligibility (STOI), etc.
77
+ - **Speaker Similarity**: Cosine similarity, which can be calculated based on [RawNet3](https://github.com/Jungjee/RawNet), [Resemblyzer](https://github.com/resemble-ai/Resemblyzer), [WeSpeaker](https://github.com/wenet-e2e/wespeaker), [WavLM](https://github.com/microsoft/unilm/tree/master/wavlm) and more.
78
+
79
+ ### Datasets
80
+
81
+ Amphion unifies the data preprocess of the open-source datasets including [AudioCaps](https://audiocaps.github.io/), [LibriTTS](https://www.openslr.org/60/), [LJSpeech](https://keithito.com/LJ-Speech-Dataset/), [M4Singer](https://github.com/M4Singer/M4Singer), [Opencpop](https://wenet.org.cn/opencpop/), [OpenSinger](https://github.com/Multi-Singer/Multi-Singer.github.io), [SVCC](http://vc-challenge.org/), [VCTK](https://datashare.ed.ac.uk/handle/10283/3443), and more. The supported dataset list can be seen [here](egs/datasets/README.md) (updating).
82
+
83
+ ### Visualization
84
+
85
+ Amphion provides visualization tools to interactively illustrate the internal processing mechanism of classic models. This provides an invaluable resource for educational purposes and for facilitating understandable research.
86
+
87
+ Currently, Amphion supports [SingVisio](egs/visualization/SingVisio/README.md), a visualization tool of the diffusion model for singing voice conversion. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2402.12660) [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/SingVisio) [![Video](https://img.shields.io/badge/Video-Demo-orange)](https://github.com/open-mmlab/Amphion/assets/33707885/0a6e39e8-d5f1-4288-b0f8-32da5a2d6e96)
88
+
89
+
90
+ ## 📀 Installation
91
+
92
+ Amphion can be installed through either Setup Installer or Docker Image.
93
+
94
+ ### Setup Installer
95
+
96
+ ```bash
97
+ git clone https://github.com/open-mmlab/Amphion.git
98
+ cd Amphion
99
+
100
+ # Install Python Environment
101
+ conda create --name amphion python=3.9.15
102
+ conda activate amphion
103
+
104
+ # Install Python Packages Dependencies
105
+ sh env.sh
106
+ ```
107
+
108
+ ### Docker Image
109
+
110
+ 1. Install [Docker](https://docs.docker.com/get-docker/), [NVIDIA Driver](https://www.nvidia.com/download/index.aspx), [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html), and [CUDA](https://developer.nvidia.com/cuda-downloads).
111
+
112
+ 2. Run the following commands:
113
+ ```bash
114
+ git clone https://github.com/open-mmlab/Amphion.git
115
+ cd Amphion
116
+
117
+ docker pull realamphion/amphion
118
+ docker run --runtime=nvidia --gpus all -it -v .:/app realamphion/amphion
119
+ ```
120
+ Mount dataset by argument `-v` is necessary when using Docker. Please refer to [Mount dataset in Docker container](egs/datasets/docker.md) and [Docker Docs](https://docs.docker.com/engine/reference/commandline/container_run/#volume) for more details.
121
+
122
+
123
+ ## 🐍 Usage in Python
124
+
125
+ We detail the instructions of different tasks in the following recipes:
126
+
127
+ - [Text to Speech (TTS)](egs/tts/README.md)
128
+ - [Singing Voice Conversion (SVC)](egs/svc/README.md)
129
+ - [Text to Audio (TTA)](egs/tta/README.md)
130
+ - [Vocoder](egs/vocoder/README.md)
131
+ - [Evaluation](egs/metrics/README.md)
132
+ - [Visualization](egs/visualization/README.md)
133
+
134
+ ## 👨‍💻 Contributing
135
+ We appreciate all contributions to improve Amphion. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guideline.
136
+
137
+ ## 🙏 Acknowledgement
138
+
139
+
140
+ - [ming024's FastSpeech2](https://github.com/ming024/FastSpeech2) and [jaywalnut310's VITS](https://github.com/jaywalnut310/vits) for model architecture code.
141
+ - [lifeiteng's VALL-E](https://github.com/lifeiteng/vall-e) for training pipeline and model architecture design.
142
+ - [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), [ContentVec](https://github.com/auspicious3000/contentvec), and [RawNet3](https://github.com/Jungjee/RawNet) for pretrained models and inference code.
143
+ - [HiFi-GAN](https://github.com/jik876/hifi-gan) for GAN-based Vocoder's architecture design and training strategy.
144
+ - [Encodec](https://github.com/facebookresearch/encodec) for well-organized GAN Discriminator's architecture and basic blocks.
145
+ - [Latent Diffusion](https://github.com/CompVis/latent-diffusion) for model architecture design.
146
+ - [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS) for preparing the MFA tools.
147
+
148
+
149
+ ## ©️ License
150
+
151
+ Amphion is under the [MIT License](LICENSE). It is free for both research and commercial use cases.
152
+
153
+ ## 📚 Citations
154
+
155
+ ```bibtex
156
+ @article{zhang2023amphion,
157
+ title={Amphion: An Open-Source Audio, Music and Speech Generation Toolkit},
158
+ author={Xueyao Zhang and Liumeng Xue and Yicheng Gu and Yuancheng Wang and Haorui He and Chaoren Wang and Xi Chen and Zihao Fang and Haopeng Chen and Junan Zhang and Tze Ying Tang and Lexiao Zou and Mingxuan Wang and Jun Han and Kai Chen and Haizhou Li and Zhizheng Wu},
159
+ journal={arXiv},
160
+ year={2024},
161
+ volume={abs/2312.09911}
162
+ }
163
+ ```
utils/Amphion/__init__.py ADDED
File without changes
utils/Amphion/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (154 Bytes). View file
 
utils/Amphion/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (150 Bytes). View file
 
utils/Amphion/bins/calc_metrics.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import sys
8
+ import numpy as np
9
+ import json
10
+ import argparse
11
+ import whisper
12
+ import torch
13
+
14
+ from glob import glob
15
+ from tqdm import tqdm
16
+ from collections import defaultdict
17
+
18
+
19
+ from evaluation.metrics.energy.energy_rmse import extract_energy_rmse
20
+ from evaluation.metrics.energy.energy_pearson_coefficients import (
21
+ extract_energy_pearson_coeffcients,
22
+ )
23
+ from evaluation.metrics.f0.f0_pearson_coefficients import extract_fpc
24
+ from evaluation.metrics.f0.f0_periodicity_rmse import extract_f0_periodicity_rmse
25
+ from evaluation.metrics.f0.f0_rmse import extract_f0rmse
26
+ from evaluation.metrics.f0.v_uv_f1 import extract_f1_v_uv
27
+ from evaluation.metrics.intelligibility.character_error_rate import extract_cer
28
+ from evaluation.metrics.intelligibility.word_error_rate import extract_wer
29
+ from evaluation.metrics.similarity.speaker_similarity import extract_similarity
30
+ from evaluation.metrics.spectrogram.frechet_distance import extract_fad
31
+ from evaluation.metrics.spectrogram.mel_cepstral_distortion import extract_mcd
32
+ from evaluation.metrics.spectrogram.multi_resolution_stft_distance import extract_mstft
33
+ from evaluation.metrics.spectrogram.pesq import extract_pesq
34
+ from evaluation.metrics.spectrogram.scale_invariant_signal_to_distortion_ratio import (
35
+ extract_si_sdr,
36
+ )
37
+ from evaluation.metrics.spectrogram.scale_invariant_signal_to_noise_ratio import (
38
+ extract_si_snr,
39
+ )
40
+ from evaluation.metrics.spectrogram.short_time_objective_intelligibility import (
41
+ extract_stoi,
42
+ )
43
+
44
+ METRIC_FUNC = {
45
+ "energy_rmse": extract_energy_rmse,
46
+ "energy_pc": extract_energy_pearson_coeffcients,
47
+ "fpc": extract_fpc,
48
+ "f0_periodicity_rmse": extract_f0_periodicity_rmse,
49
+ "f0rmse": extract_f0rmse,
50
+ "v_uv_f1": extract_f1_v_uv,
51
+ "cer": extract_cer,
52
+ "wer": extract_wer,
53
+ "similarity": extract_similarity,
54
+ "fad": extract_fad,
55
+ "mcd": extract_mcd,
56
+ "mstft": extract_mstft,
57
+ "pesq": extract_pesq,
58
+ "si_sdr": extract_si_sdr,
59
+ "si_snr": extract_si_snr,
60
+ "stoi": extract_stoi,
61
+ }
62
+
63
+
64
+ def calc_metric(
65
+ ref_dir,
66
+ deg_dir,
67
+ dump_dir,
68
+ metrics,
69
+ **kwargs,
70
+ ):
71
+ result = defaultdict()
72
+
73
+ for metric in tqdm(metrics):
74
+ if metric in ["fad", "similarity"]:
75
+ result[metric] = str(METRIC_FUNC[metric](ref_dir, deg_dir, kwargs=kwargs))
76
+ continue
77
+
78
+ audios_ref = []
79
+ audios_deg = []
80
+
81
+ files = glob(deg_dir + "/*.wav")
82
+
83
+ for file in files:
84
+ audios_deg.append(file)
85
+ uid = file.split("/")[-1].split(".wav")[0]
86
+ file_gt = ref_dir + "/{}.wav".format(uid)
87
+ audios_ref.append(file_gt)
88
+
89
+ if metric in ["wer", "cer"] and kwargs["intelligibility_mode"] == "gt_content":
90
+ ltr_path = kwargs["ltr_path"]
91
+ tmpltrs = {}
92
+ with open(ltr_path, "r") as f:
93
+ for line in f:
94
+ paras = line.replace("\n", "").split("|")
95
+ paras[1] = paras[1].replace(" ", "")
96
+ paras[1] = paras[1].replace(".", "")
97
+ paras[1] = paras[1].replace("'", "")
98
+ paras[1] = paras[1].replace("-", "")
99
+ paras[1] = paras[1].replace(",", "")
100
+ paras[1] = paras[1].replace("!", "")
101
+ paras[1] = paras[1].lower()
102
+ tmpltrs[paras[0]] = paras[1]
103
+ ltrs = []
104
+ files = glob(ref_dir + "/*.wav")
105
+ for file in files:
106
+ ltrs.append(tmpltrs[os.path.basename(file)])
107
+
108
+ if metric in ["v_uv_f1"]:
109
+ tp_total = 0
110
+ fp_total = 0
111
+ fn_total = 0
112
+
113
+ for i in tqdm(range(len(audios_ref))):
114
+ audio_ref = audios_ref[i]
115
+ audio_deg = audios_deg[i]
116
+ tp, fp, fn = METRIC_FUNC[metric](audio_ref, audio_deg, kwargs=kwargs)
117
+ tp_total += tp
118
+ fp_total += fp
119
+ fn_total += fn
120
+
121
+ result[metric] = str(tp_total / (tp_total + (fp_total + fn_total) / 2))
122
+ else:
123
+ scores = []
124
+ for i in tqdm(range(len(audios_ref))):
125
+ audio_ref = audios_ref[i]
126
+ audio_deg = audios_deg[i]
127
+
128
+ if metric in ["wer", "cer"]:
129
+ model = whisper.load_model("large")
130
+ mode = kwargs["intelligibility_mode"]
131
+ if torch.cuda.is_available():
132
+ device = torch.device("cuda")
133
+ model = model.to(device)
134
+
135
+ if mode == "gt_audio":
136
+ kwargs["audio_ref"] = audio_ref
137
+ kwargs["audio_deg"] = audio_deg
138
+ score = METRIC_FUNC[metric](
139
+ model,
140
+ kwargs=kwargs,
141
+ )
142
+ elif mode == "gt_content":
143
+ kwargs["content_gt"] = ltrs[i]
144
+ kwargs["audio_deg"] = audio_deg
145
+ score = METRIC_FUNC[metric](
146
+ model,
147
+ kwargs=kwargs,
148
+ )
149
+ else:
150
+ score = METRIC_FUNC[metric](
151
+ audio_ref,
152
+ audio_deg,
153
+ kwargs=kwargs,
154
+ )
155
+ if not np.isnan(score):
156
+ scores.append(score)
157
+
158
+ scores = np.array(scores)
159
+ result["{}".format(metric)] = str(np.mean(scores))
160
+
161
+ data = json.dumps(result, indent=4)
162
+
163
+ with open(os.path.join(dump_dir, "result.json"), "w", newline="\n") as f:
164
+ f.write(data)
165
+
166
+
167
+ if __name__ == "__main__":
168
+ parser = argparse.ArgumentParser()
169
+ parser.add_argument(
170
+ "--ref_dir",
171
+ type=str,
172
+ help="Path to the reference audio folder.",
173
+ )
174
+ parser.add_argument(
175
+ "--deg_dir",
176
+ type=str,
177
+ help="Path to the test audio folder.",
178
+ )
179
+ parser.add_argument(
180
+ "--dump_dir",
181
+ type=str,
182
+ help="Path to dump the results.",
183
+ )
184
+ parser.add_argument(
185
+ "--metrics",
186
+ nargs="+",
187
+ help="Metrics used to evaluate.",
188
+ )
189
+ parser.add_argument(
190
+ "--fs",
191
+ type=str,
192
+ default="None",
193
+ help="(Optional) Sampling rate",
194
+ )
195
+ parser.add_argument(
196
+ "--align_method",
197
+ type=str,
198
+ default="dtw",
199
+ help="(Optional) Method for aligning feature length. ['cut', 'dtw']",
200
+ )
201
+
202
+ parser.add_argument(
203
+ "--db_scale",
204
+ type=str,
205
+ default="True",
206
+ help="(Optional) Wether or not computing energy related metrics in db scale.",
207
+ )
208
+ parser.add_argument(
209
+ "--f0_subtract_mean",
210
+ type=str,
211
+ default="True",
212
+ help="(Optional) Wether or not computing f0 related metrics with mean value subtracted.",
213
+ )
214
+
215
+ parser.add_argument(
216
+ "--similarity_model",
217
+ type=str,
218
+ default="wavlm",
219
+ help="(Optional)The model for computing speaker similarity. ['rawnet', 'wavlm', 'resemblyzer']",
220
+ )
221
+ parser.add_argument(
222
+ "--similarity_mode",
223
+ type=str,
224
+ default="pairwith",
225
+ help="(Optional)The method of calculating similarity, where set to overall means computing \
226
+ the speaker similarity between two folder of audios content freely, and set to pairwith means \
227
+ computing the speaker similarity between a seires of paired gt/pred audios",
228
+ )
229
+
230
+ parser.add_argument(
231
+ "--ltr_path",
232
+ type=str,
233
+ default="None",
234
+ help="(Optional)Path to the transcription file,Note that the format in the transcription \
235
+ file is 'file name|transcription'",
236
+ )
237
+ parser.add_argument(
238
+ "--intelligibility_mode",
239
+ type=str,
240
+ default="gt_audio",
241
+ help="(Optional)The method of calculating WER and CER, where set to gt_audio means selecting \
242
+ the recognition content of the reference audio as the target, and set to gt_content means \
243
+ using transcription as the target",
244
+ )
245
+ parser.add_argument(
246
+ "--language",
247
+ type=str,
248
+ default="english",
249
+ help="(Optional)['english','chinese']",
250
+ )
251
+
252
+ args = parser.parse_args()
253
+
254
+ calc_metric(
255
+ args.ref_dir,
256
+ args.deg_dir,
257
+ args.dump_dir,
258
+ args.metrics,
259
+ fs=int(args.fs) if args.fs != "None" else None,
260
+ method=args.align_method,
261
+ db_scale=True if args.db_scale == "True" else False,
262
+ need_mean=True if args.f0_subtract_mean == "True" else False,
263
+ model_name=args.similarity_model,
264
+ similarity_mode=args.similarity_mode,
265
+ ltr_path=args.ltr_path,
266
+ intelligibility_mode=args.intelligibility_mode,
267
+ language=args.language,
268
+ )
utils/Amphion/bins/svc/inference.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+ import glob
9
+ from tqdm import tqdm
10
+ import json
11
+ import torch
12
+ import time
13
+
14
+ from models.svc.diffusion.diffusion_inference import DiffusionInference
15
+ from models.svc.comosvc.comosvc_inference import ComoSVCInference
16
+ from models.svc.transformer.transformer_inference import TransformerInference
17
+ from models.svc.vits.vits_inference import VitsInference
18
+ from utils.util import load_config
19
+ from utils.audio_slicer import split_audio, merge_segments_encodec
20
+ from processors import acoustic_extractor, content_extractor
21
+
22
+
23
+ def build_inference(args, cfg, infer_type="from_dataset"):
24
+ supported_inference = {
25
+ "DiffWaveNetSVC": DiffusionInference,
26
+ "DiffComoSVC": ComoSVCInference,
27
+ "TransformerSVC": TransformerInference,
28
+ "VitsSVC": VitsInference,
29
+ }
30
+
31
+ inference_class = supported_inference[cfg.model_type]
32
+ return inference_class(args, cfg, infer_type)
33
+
34
+
35
+ def prepare_for_audio_file(args, cfg, num_workers=1):
36
+ preprocess_path = cfg.preprocess.processed_dir
37
+ audio_name = cfg.inference.source_audio_name
38
+ temp_audio_dir = os.path.join(preprocess_path, audio_name)
39
+
40
+ ### eval file
41
+ t = time.time()
42
+ eval_file = prepare_source_eval_file(cfg, temp_audio_dir, audio_name)
43
+ args.source = eval_file
44
+ with open(eval_file, "r") as f:
45
+ metadata = json.load(f)
46
+ print("Prepare for meta eval data: {:.1f}s".format(time.time() - t))
47
+
48
+ ### acoustic features
49
+ t = time.time()
50
+ acoustic_extractor.extract_utt_acoustic_features_serial(
51
+ metadata, temp_audio_dir, cfg
52
+ )
53
+ if cfg.preprocess.use_min_max_norm_mel == True:
54
+ acoustic_extractor.cal_mel_min_max(
55
+ dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
56
+ )
57
+ acoustic_extractor.cal_pitch_statistics_svc(
58
+ dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
59
+ )
60
+ print("Prepare for acoustic features: {:.1f}s".format(time.time() - t))
61
+
62
+ ### content features
63
+ t = time.time()
64
+ content_extractor.extract_utt_content_features_dataloader(
65
+ cfg, metadata, num_workers
66
+ )
67
+ print("Prepare for content features: {:.1f}s".format(time.time() - t))
68
+ return args, cfg, temp_audio_dir
69
+
70
+
71
def merge_for_audio_segments(audio_files, args, cfg):
    """Stitch converted segments back into one wav and delete the parts.

    Args:
        audio_files (list): paths of the per-segment wav files, in order.
        args: parsed CLI arguments (``output_dir``, ``target_singer``).
        cfg: configuration object (sample rate, overlap duration, source name).
    """
    audio_name = cfg.inference.source_audio_name
    target_singer_name = args.target_singer

    out_wav = os.path.join(
        args.output_dir, "{}_{}.wav".format(audio_name, target_singer_name)
    )
    merge_segments_encodec(
        wav_files=audio_files,
        fs=cfg.preprocess.sample_rate,
        output_path=out_wav,
        overlap_duration=cfg.inference.segments_overlap_duration,
    )

    # The per-segment files are intermediate artifacts; clean them up.
    for tmp_file in audio_files:
        os.remove(tmp_file)
86
+
87
+
88
def prepare_source_eval_file(cfg, temp_audio_dir, audio_name):
    """Split the source audio into overlapping chunks and dump ``eval.json``.

    Args:
        cfg: configuration object (source path, sample rate, segment limits).
        temp_audio_dir (str): cache directory for this audio.
        audio_name (str): pseudo dataset/singer name for the chunks.

    Returns:
        str: path of the written metadata (eval) file.
    """
    chunks = split_audio(
        wav_file=cfg.inference.source_audio_path,
        target_sr=cfg.preprocess.sample_rate,
        output_dir=os.path.join(temp_audio_dir, "wavs"),
        max_duration_of_segment=cfg.inference.segments_max_duration,
        overlap_duration=cfg.inference.segments_overlap_duration,
    )

    # Tag every chunk with the pseudo dataset/singer name and a running index.
    metadata = []
    for idx, chunk in enumerate(chunks):
        chunk["index"] = idx
        chunk["Dataset"] = audio_name
        chunk["Singer"] = audio_name
        chunk["Uid"] = "{}_{}".format(audio_name, chunk["Uid"])
        metadata.append(chunk)

    eval_file = os.path.join(temp_audio_dir, "eval.json")
    with open(eval_file, "w") as f:
        json.dump(metadata, f, indent=4, ensure_ascii=False, sort_keys=True)

    return eval_file
114
+
115
+
116
def cuda_relevant(deterministic=False):
    """Set global CUDA/cuDNN behavior flags for this process.

    Args:
        deterministic (bool): if True, force reproducible kernel selection at
            the cost of disabling cuDNN autotuning.
    """
    torch.cuda.empty_cache()
    # Enable TF32 math on Ampere-and-newer GPUs.
    for backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
        backend.allow_tf32 = True
    torch.backends.cudnn.enabled = True
    # Benchmark autotuning and deterministic kernels are mutually exclusive.
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
126
+
127
+
128
def infer(args, cfg, infer_type):
    """Build the inference pipeline and run it, reporting wall-clock times.

    Args:
        args: parsed CLI arguments.
        cfg: configuration object.
        infer_type (str): "from_dataset" or "from_file".

    Returns:
        The list of output audio files produced by the model's inference.
    """
    start = time.time()
    trainer = build_inference(args, cfg, infer_type)
    print("Model Init: {:.1f}s".format(time.time() - start))

    start = time.time()
    output_audio_files = trainer.inference()
    print("Model inference: {:.1f}s".format(time.time() - start))
    return output_audio_files
139
+
140
+
141
def build_parser():
    r"""Build argument parser for inference.py.
    Anything else should be put in an extra config YAML file.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="JSON/YAML file for configurations.",
    )
    parser.add_argument(
        "--acoustics_dir",
        type=str,
        help="Acoustics model checkpoint directory. If a directory is given, "
        "search for the latest checkpoint dir in the directory. If a specific "
        "checkpoint dir is given, directly load the checkpoint.",
    )
    parser.add_argument(
        "--vocoder_dir",
        type=str,
        required=True,
        help="Vocoder checkpoint directory. Searching behavior is the same as "
        "the acoustics one.",
    )
    parser.add_argument(
        "--target_singer",
        type=str,
        required=True,
        help="convert to a specific singer (e.g. --target_singers singer_id).",
    )
    # NOTE(review): no ``type=`` is given, so CLI-provided values arrive as
    # strings while the default is the int 0 — downstream must accept both.
    parser.add_argument(
        "--trans_key",
        default=0,
        help="0: no pitch shift; autoshift: pitch shift; int: key shift.",
    )
    parser.add_argument(
        "--source",
        type=str,
        default="source_audio",
        help="Source audio file or directory. If a JSON file is given, "
        "inference from dataset is applied. If a directory is given, "
        "inference from all wav/flac/mp3 audio files in the directory is applied. "
        "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="conversion_results",
        help="Output directory. Default: ./conversion_results",
    )
    parser.add_argument(
        "--log_level",
        type=str,
        default="warning",
        help="Logging level. Default: warning",
    )
    # NOTE(review): ``action="store_true"`` combined with ``default=True``
    # makes this flag a no-op — it can never become False from the CLI, so
    # caches are always kept. Confirm intent before changing the default.
    parser.add_argument(
        "--keep_cache",
        action="store_true",
        default=True,
        help="Keep cache files. Only applicable to inference from files.",
    )
    parser.add_argument(
        "--diffusion_inference_steps",
        type=int,
        default=1000,
        help="Number of inference steps. Only applicable to diffusion inference.",
    )
    return parser
212
+
213
+
214
def main():
    """Entry point: run SVC on raw audio files or on a prepared dataset.

    If ``--source`` is a directory, every wav/flac/mp3 file in it is split,
    converted, and merged back; otherwise inference runs on the dataset
    described by the given eval file.
    """
    ### Parse arguments and config
    args = build_parser().parse_args()
    cfg = load_config(args.config)

    # CUDA settings
    cuda_relevant()

    if os.path.isdir(args.source):
        ### Infer from file
        import shutil  # local import: only needed for cache cleanup

        # Get all the source audio files (.wav, .flac, .mp3)
        source_audio_dir = args.source
        audio_list = []
        for suffix in ["wav", "flac", "mp3"]:
            audio_list += glob.glob(
                os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True
            )
        print("There are {} source audios: ".format(len(audio_list)))

        # Infer for every file as dataset
        output_root_path = args.output_dir
        for audio_path in tqdm(audio_list):
            # Use os.path so this also works with Windows path separators
            # (the original split("/") broke on backslash paths).
            audio_name = os.path.splitext(os.path.basename(audio_path))[0]
            args.output_dir = os.path.join(output_root_path, audio_name)
            print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name))

            cfg.inference.source_audio_path = audio_path
            cfg.inference.source_audio_name = audio_name
            cfg.inference.segments_max_duration = 10.0
            cfg.inference.segments_overlap_duration = 1.0

            # Prepare metadata and features
            args, cfg, cache_dir = prepare_for_audio_file(args, cfg)

            # Infer from file
            output_audio_files = infer(args, cfg, infer_type="from_file")

            # Merge the split segments
            merge_for_audio_segments(output_audio_files, args, cfg)

            # Keep or remove caches
            if not args.keep_cache:
                # os.removedirs() only deletes empty directories and raised
                # OSError on the populated cache dir; rmtree removes it fully.
                shutil.rmtree(cache_dir)

    else:
        ### Infer from dataset
        infer(args, cfg, infer_type="from_dataset")
262
+
263
+
264
+ if __name__ == "__main__":
265
+ main()
utils/Amphion/bins/svc/preprocess.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ from multiprocessing import cpu_count
14
+
15
+
16
+ from utils.util import load_config
17
+ from preprocessors.processor import preprocess_dataset
18
+ from preprocessors.metadata import cal_metadata
19
+ from processors import acoustic_extractor, content_extractor, data_augment
20
+
21
+
22
def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
    """Extract acoustic features for every utterance of ``dataset``.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files
        cfg (dict): dictionary that stores configurations
        n_workers (int, optional): reserved for parallel extraction. Defaults to 1.
    """
    # Evaluation-only datasets carry just a test split.
    splits = ["test"] if "eval" in dataset else ["train", "test"]
    dataset_output = os.path.join(output_path, dataset)

    metadata = []
    for split in splits:
        with open(os.path.join(dataset_output, "{}.json".format(split)), "r") as f:
            metadata.extend(json.load(f))

    # Parallel extraction exists but is disabled upstream:
    # acoustic_extractor.extract_utt_acoustic_features_parallel(
    #     metadata, dataset_output, cfg, n_workers=n_workers
    # )
    acoustic_extractor.extract_utt_acoustic_features_serial(
        metadata, dataset_output, cfg
    )
46
+
47
+
48
def extract_content_features(dataset, output_path, cfg, num_workers=1):
    """Extract content features of utterances in the dataset.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files
        cfg (dict): dictionary that stores configurations
        num_workers (int): workers for the extraction dataloader
    """
    splits = ["test"] if "eval" in dataset else ["train", "test"]
    dataset_output = os.path.join(output_path, dataset)

    metadata = []
    for split in splits:
        with open(os.path.join(dataset_output, "{}.json".format(split)), "r") as f:
            metadata.extend(json.load(f))

    content_extractor.extract_utt_content_features_dataloader(
        cfg, metadata, num_workers
    )
67
+
68
+
69
def preprocess(cfg, args):
    """Preprocess raw data of single or multiple datasets (in cfg.dataset).

    Pipeline: split train/test, optionally augment, dump metadata, extract
    acoustic features (or link them from the source dataset for augmented
    copies), then extract content features.

    Args:
        cfg (dict): dictionary that stores configurations
        args (ArgumentParser): specify the configuration file and num_workers
    """
    # Specify the output root path to save the processed data
    output_path = cfg.preprocess.processed_dir
    os.makedirs(output_path, exist_ok=True)

    ## Split train and test sets
    for dataset in cfg.dataset:
        print("Preprocess {}...".format(dataset))
        preprocess_dataset(
            dataset,
            cfg.dataset_path[dataset],
            output_path,
            cfg.preprocess,
            cfg.task_type,
            is_custom_dataset=dataset in cfg.use_custom_dataset,
        )

    # Data augmentation: create new wav files with pitch shift, formant shift,
    # equalizer, time stretch
    try:
        assert isinstance(
            cfg.preprocess.data_augment, list
        ), "Please provide a list of datasets need to be augmented."
        if len(cfg.preprocess.data_augment) > 0:
            new_datasets_list = []
            for dataset in cfg.preprocess.data_augment:
                new_datasets = data_augment.augment_dataset(cfg, dataset)
                new_datasets_list.extend(new_datasets)
            cfg.dataset.extend(new_datasets_list)
            print("Augmentation datasets: ", cfg.dataset)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate; a missing ``data_augment`` config entry simply
        # means no augmentation is requested.
        print("No Data Augmentation.")

    # Dump metadata of datasets (singers, train/test durations, etc.)
    cal_metadata(cfg)

    ## Prepare the acoustic features
    for dataset in cfg.dataset:
        # Skip augmented datasets which do not need to extract acoustic
        # features; they are copied from the original dataset below.
        # (Fixed a chained-comparison typo: the condition previously read
        # ``"equalizer" in dataset in dataset``.)
        if (
            "pitch_shift" in dataset
            or "formant_shift" in dataset
            or "equalizer" in dataset
        ):
            continue
        print(
            "Extracting acoustic features for {} using {} workers ...".format(
                dataset, args.num_workers
            )
        )
        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
        # Calculate the statistics of acoustic features
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics_svc(dataset, output_path, cfg)

    # Copy acoustic features for augmented datasets by creating soft-links
    for dataset in cfg.dataset:
        if "pitch_shift" in dataset:
            src_dataset = dataset.replace("_pitch_shift", "")
        elif "formant_shift" in dataset:
            src_dataset = dataset.replace("_formant_shift", "")
        elif "equalizer" in dataset:
            src_dataset = dataset.replace("_equalizer", "")
        else:
            continue
        src_dataset_dir = os.path.join(output_path, src_dataset)
        dataset_dir = os.path.join(output_path, dataset)
        metadata = []
        for split in ["test"] if "eval" in dataset else ["train", "test"]:
            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
            with open(metadata_file_path, "r") as f:
                metadata.extend(json.load(f))
        print("Copying acoustic features for {}...".format(dataset))
        acoustic_extractor.copy_acoustic_features(
            metadata, dataset_dir, src_dataset_dir, cfg
        )
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)

    # Prepare the content features
    for dataset in cfg.dataset:
        print("Extracting content features for {}...".format(dataset))
        extract_content_features(dataset, output_path, cfg, args.num_workers)
166
+
167
+
168
def main():
    """CLI wrapper: parse arguments, load the config, and run preprocessing."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", default="config.json", help="json files for configurations."
    )
    parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
    # NOTE(review): argparse ``type=bool`` treats any non-empty string as
    # True (bool("False") is True) — confirm callers only rely on the default.
    parser.add_argument("--prepare_alignment", type=bool, default=False)

    args = parser.parse_args()
    preprocess(load_config(args.config), args)
180
+
181
+
182
+ if __name__ == "__main__":
183
+ main()
utils/Amphion/bins/svc/train.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+
8
+ import torch
9
+
10
+ from models.svc.diffusion.diffusion_trainer import DiffusionTrainer
11
+ from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer
12
+ from models.svc.transformer.transformer_trainer import TransformerTrainer
13
+ from models.svc.vits.vits_trainer import VitsSVCTrainer
14
+ from utils.util import load_config
15
+
16
+
17
def build_trainer(args, cfg):
    """Instantiate the SVC trainer class selected by ``cfg.model_type``.

    Raises:
        KeyError: if ``cfg.model_type`` is not a supported SVC model.
    """
    trainer_classes = {
        "DiffWaveNetSVC": DiffusionTrainer,
        "DiffComoSVC": ComoSVCTrainer,
        "TransformerSVC": TransformerTrainer,
        "VitsSVC": VitsSVCTrainer,
    }
    return trainer_classes[cfg.model_type](args, cfg)
28
+
29
+
30
def cuda_relevant(deterministic=False):
    """Set global CUDA/cuDNN behavior flags for this process.

    Args:
        deterministic (bool): if True, force reproducible kernel selection at
            the cost of disabling cuDNN autotuning.
    """
    torch.cuda.empty_cache()
    # Enable TF32 math on Ampere-and-newer GPUs.
    for backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
        backend.allow_tf32 = True
    torch.backends.cudnn.enabled = True
    # Benchmark autotuning and deterministic kernels are mutually exclusive.
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
40
+
41
+
42
def main():
    """Parse CLI args, expand augmented dataset names, and launch SVC training."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        default="config.json",
        help="json files for configurations.",
        required=True,
    )
    parser.add_argument(
        "--exp_name",
        type=str,
        default="exp_name",
        help="A specific name to note the experiment",
        required=True,
    )
    parser.add_argument(
        "--resume",
        action="store_true",
        help="If specified, to resume from the existing checkpoint.",
    )
    parser.add_argument(
        "--resume_from_ckpt_path",
        type=str,
        default="",
        help="The specific checkpoint path that you want to resume from.",
    )
    parser.add_argument(
        "--resume_type",
        type=str,
        default="",
        help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights",
    )

    parser.add_argument(
        "--log_level", default="warning", help="logging level (debug, info, warning)"
    )
    args = parser.parse_args()
    cfg = load_config(args.config)

    # Data Augmentation
    # For each source dataset listed in ``cfg.preprocess.data_augment``,
    # register the derived dataset names (e.g. "<name>_pitch_shift") that the
    # preprocessing stage produced, honoring the per-augmentation switches.
    if (
        type(cfg.preprocess.data_augment) == list
        and len(cfg.preprocess.data_augment) > 0
    ):
        new_datasets_list = []
        for dataset in cfg.preprocess.data_augment:
            new_datasets = [
                f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
                (
                    f"{dataset}_formant_shift"
                    if cfg.preprocess.use_formant_shift
                    else None
                ),
                f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
                f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
            ]
            # Drop augmentations that are disabled in the config.
            new_datasets_list.extend(filter(None, new_datasets))
        cfg.dataset.extend(new_datasets_list)

    # CUDA settings
    cuda_relevant()

    # Build trainer
    trainer = build_trainer(args, cfg)

    trainer.train_loop()
108
+
109
+
110
+ if __name__ == "__main__":
111
+ main()
utils/Amphion/bins/tta/inference.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ from argparse import ArgumentParser
8
+ import os
9
+
10
+ from models.tta.ldm.audioldm_inference import AudioLDMInference
11
+ from utils.util import save_config, load_model_config, load_config
12
+ import numpy as np
13
+ import torch
14
+
15
+
16
def build_inference(args, cfg):
    """Instantiate the TTA inference class selected by ``cfg.model_type``.

    Raises:
        KeyError: if ``cfg.model_type`` is not a supported TTA model.
    """
    inference_classes = {
        "AudioLDM": AudioLDMInference,
    }
    return inference_classes[cfg.model_type](args, cfg)
24
+
25
+
26
def build_parser():
    """Build the CLI argument parser for text-to-audio inference."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="JSON/YAML file for configurations.",
    )
    parser.add_argument(
        "--text",
        help="Text to be synthesized",
        type=str,
        default="Text to be synthesized.",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
    )
    parser.add_argument(
        "--vocoder_path", type=str, help="Checkpoint path of the vocoder"
    )
    parser.add_argument(
        "--vocoder_config_path", type=str, help="Config path of the vocoder"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="Output dir for saving generated results",
    )
    parser.add_argument(
        "--num_steps",
        type=int,
        default=200,
        help="The total number of denosing steps",
    )
    parser.add_argument(
        "--guidance_scale",
        type=float,
        default=4.0,
        help="The scale of classifer free guidance",
    )
    # NOTE(review): main() later overwrites this int with a torch.device.
    parser.add_argument("--local_rank", default=-1, type=int)
    return parser
71
+
72
+
73
def main():
    """Parse CLI arguments and run text-to-audio (AudioLDM) inference."""
    # Parse arguments
    args = build_parser().parse_args()
    # args, infer_type = formulate_parser(args)

    # Parse config
    cfg = load_config(args.config)
    # NOTE(review): ``local_rank`` is repurposed to hold a torch.device here,
    # overwriting the integer CLI value — verify downstream expects a device.
    if torch.cuda.is_available():
        args.local_rank = torch.device("cuda")
    else:
        args.local_rank = torch.device("cpu")
    print("args: ", args)

    # Build inference
    inferencer = build_inference(args, cfg)

    # Run inference
    inferencer.inference()
91
+
92
+
93
+ if __name__ == "__main__":
94
+ main()
utils/Amphion/bins/tta/preprocess.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ import pyworld as pw
14
+ from multiprocessing import cpu_count
15
+
16
+
17
+ from utils.util import load_config
18
+ from preprocessors.processor import preprocess_dataset, prepare_align
19
+ from preprocessors.metadata import cal_metadata
20
+ from processors import acoustic_extractor, content_extractor, data_augment
21
+
22
+
23
def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
    """Extract acoustic features for every utterance of ``dataset``.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files
        cfg (dict): dictionary that stores configurations
        n_workers (int, optional): reserved for parallel extraction. Defaults to 1.
    """
    # Evaluation-only datasets carry just a test split.
    splits = ["test"] if "eval" in dataset else ["train", "test"]
    dataset_output = os.path.join(output_path, dataset)

    metadata = []
    for split in splits:
        with open(os.path.join(dataset_output, "{}.json".format(split)), "r") as f:
            metadata.extend(json.load(f))

    # Parallel extraction exists but is disabled upstream:
    # acoustic_extractor.extract_utt_acoustic_features_parallel(
    #     metadata, dataset_output, cfg, n_workers=n_workers
    # )
    acoustic_extractor.extract_utt_acoustic_features_serial(
        metadata, dataset_output, cfg
    )
46
+
47
+
48
def extract_content_features(dataset, output_path, cfg, num_workers=1):
    """Extract content features of utterances in the dataset.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files
        cfg (dict): dictionary that stores configurations
        num_workers (int): workers for the extraction dataloader
    """
    splits = ["test"] if "eval" in dataset else ["train", "test"]
    dataset_output = os.path.join(output_path, dataset)

    metadata = []
    for split in splits:
        with open(os.path.join(dataset_output, "{}.json".format(split)), "r") as f:
            metadata.extend(json.load(f))

    content_extractor.extract_utt_content_features_dataloader(
        cfg, metadata, num_workers
    )
67
+
68
+
69
def preprocess(cfg, args):
    """Preprocess raw data of single or multiple datasets (in cfg.dataset).

    Pipeline: (optionally) prepare MFA alignment, split train/test,
    optionally augment, dump metadata, extract acoustic features (or link
    them from the source dataset for augmented copies), then extract
    content features.

    Args:
        cfg (dict): dictionary that stores configurations
        args (ArgumentParser): specify the configuration file and num_workers
    """
    # Specify the output root path to save the processed data
    output_path = cfg.preprocess.processed_dir
    os.makedirs(output_path, exist_ok=True)

    ## Split train and test sets
    for dataset in cfg.dataset:
        print("Preprocess {}...".format(dataset))

        if args.prepare_alignment:
            ## Prepare alignment with MFA
            print("Prepare alignment {}...".format(dataset))
            prepare_align(
                dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
            )
        preprocess_dataset(
            dataset,
            cfg.dataset_path[dataset],
            output_path,
            cfg.preprocess,
            cfg.task_type,
            is_custom_dataset=dataset in cfg.use_custom_dataset,
        )

    # Data augmentation: create new wav files with pitch shift, formant shift,
    # equalizer, time stretch
    try:
        assert isinstance(
            cfg.preprocess.data_augment, list
        ), "Please provide a list of datasets need to be augmented."
        if len(cfg.preprocess.data_augment) > 0:
            new_datasets_list = []
            for dataset in cfg.preprocess.data_augment:
                new_datasets = data_augment.augment_dataset(cfg, dataset)
                new_datasets_list.extend(new_datasets)
            cfg.dataset.extend(new_datasets_list)
            print("Augmentation datasets: ", cfg.dataset)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate; a missing ``data_augment`` config entry simply
        # means no augmentation is requested.
        print("No Data Augmentation.")

    # Dump metadata of datasets (singers, train/test durations, etc.)
    cal_metadata(cfg)

    ## Prepare the acoustic features
    for dataset in cfg.dataset:
        # Skip augmented datasets which do not need to extract acoustic
        # features; they are copied from the original dataset below.
        # (Fixed a chained-comparison typo: the condition previously read
        # ``"equalizer" in dataset in dataset``.)
        if (
            "pitch_shift" in dataset
            or "formant_shift" in dataset
            or "equalizer" in dataset
        ):
            continue
        print(
            "Extracting acoustic features for {} using {} workers ...".format(
                dataset, args.num_workers
            )
        )
        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
        # Calculate the statistics of acoustic features
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
        if cfg.preprocess.extract_energy:
            acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)

        if cfg.preprocess.align_mel_duration:
            acoustic_extractor.align_duration_mel(dataset, output_path, cfg)

    # Copy acoustic features for augmented datasets by creating soft-links
    for dataset in cfg.dataset:
        if "pitch_shift" in dataset:
            src_dataset = dataset.replace("_pitch_shift", "")
        elif "formant_shift" in dataset:
            src_dataset = dataset.replace("_formant_shift", "")
        elif "equalizer" in dataset:
            src_dataset = dataset.replace("_equalizer", "")
        else:
            continue
        src_dataset_dir = os.path.join(output_path, src_dataset)
        dataset_dir = os.path.join(output_path, dataset)
        metadata = []
        for split in ["test"] if "eval" in dataset else ["train", "test"]:
            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
            with open(metadata_file_path, "r") as f:
                metadata.extend(json.load(f))
        print("Copying acoustic features for {}...".format(dataset))
        acoustic_extractor.copy_acoustic_features(
            metadata, dataset_dir, src_dataset_dir, cfg
        )
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)

    # Prepare the content features
    for dataset in cfg.dataset:
        print("Extracting content features for {}...".format(dataset))
        extract_content_features(dataset, output_path, cfg, args.num_workers)
178
+
179
+
180
def main():
    """CLI wrapper: parse arguments, load the config, and run preprocessing."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", default="config.json", help="json files for configurations."
    )
    parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
    # NOTE(review): argparse ``type=bool`` treats any non-empty string as
    # True (bool("False") is True) — confirm callers only rely on the default.
    parser.add_argument("--prepare_alignment", type=bool, default=False)

    args = parser.parse_args()
    preprocess(load_config(args.config), args)
192
+
193
+
194
+ if __name__ == "__main__":
195
+ main()
utils/Amphion/bins/tta/train_tta.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+ import torch
9
+
10
+ from models.tta.autoencoder.autoencoder_trainer import AutoencoderKLTrainer
11
+ from models.tta.ldm.audioldm_trainer import AudioLDMTrainer
12
+ from utils.util import load_config
13
+
14
+
15
def build_trainer(args, cfg):
    """Instantiate the TTA trainer class selected by ``cfg.model_type``.

    Raises:
        KeyError: if ``cfg.model_type`` is not a supported TTA model.
    """
    trainer_classes = {
        "AutoencoderKL": AutoencoderKLTrainer,
        "AudioLDM": AudioLDMTrainer,
    }
    return trainer_classes[cfg.model_type](args, cfg)
24
+
25
+
26
def main():
    """Parse CLI args and launch text-to-audio (TTA) training."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        default="config.json",
        help="json files for configurations.",
        required=True,
    )
    parser.add_argument(
        "--num_workers", type=int, default=6, help="Number of dataloader workers."
    )
    parser.add_argument(
        "--exp_name",
        type=str,
        default="exp_name",
        help="A specific name to note the experiment",
        required=True,
    )
    parser.add_argument(
        "--resume",
        type=str,
        default=None,
        # action="store_true",
        help="The model name to restore",
    )
    parser.add_argument(
        "--log_level", default="info", help="logging level (info, debug, warning)"
    )
    parser.add_argument("--stdout_interval", default=5, type=int)
    parser.add_argument("--local_rank", default=-1, type=int)
    args = parser.parse_args()
    cfg = load_config(args.config)
    cfg.exp_name = args.exp_name

    # Model saving dir
    args.log_dir = os.path.join(cfg.log_dir, args.exp_name)
    os.makedirs(args.log_dir, exist_ok=True)

    # NOTE(review): in the non-DDP path ``local_rank`` is overwritten with a
    # torch.device — confirm the trainer expects a device object here.
    if not cfg.train.ddp:
        args.local_rank = torch.device("cuda")

    # Build trainer
    trainer = build_trainer(args, cfg)

    # Restore models
    if args.resume:
        trainer.restore()
    trainer.train()
74
+
75
+
76
+ if __name__ == "__main__":
77
+ main()
utils/Amphion/bins/tts/inference.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ from argparse import ArgumentParser
8
+ import os
9
+
10
+ from models.tts.fastspeech2.fs2_inference import FastSpeech2Inference
11
+ from models.tts.vits.vits_inference import VitsInference
12
+ from models.tts.valle.valle_inference import VALLEInference
13
+ from models.tts.naturalspeech2.ns2_inference import NS2Inference
14
+ from utils.util import load_config
15
+ import torch
16
+
17
+
18
def build_inference(args, cfg):
    """Instantiate the TTS inference class selected by ``cfg.model_type``.

    Raises:
        KeyError: if ``cfg.model_type`` is not a supported TTS model.
    """
    inference_classes = {
        "FastSpeech2": FastSpeech2Inference,
        "VITS": VitsInference,
        "VALLE": VALLEInference,
        "NaturalSpeech2": NS2Inference,
    }
    return inference_classes[cfg.model_type](args, cfg)
29
+
30
+
31
def cuda_relevant(deterministic=False):
    """Set global CUDA/cuDNN behavior flags for this process.

    Args:
        deterministic (bool): if True, force reproducible kernel selection at
            the cost of disabling cuDNN autotuning.
    """
    torch.cuda.empty_cache()
    # Enable TF32 math on Ampere-and-newer GPUs.
    for backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
        backend.allow_tf32 = True
    torch.backends.cudnn.enabled = True
    # Benchmark autotuning and deterministic kernels are mutually exclusive.
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
41
+
42
+
43
def build_parser():
    """Build the CLI argument parser for TTS inference (batch or single)."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="JSON/YAML file for configurations.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        help="convert from the source data",
        default=None,
    )
    parser.add_argument(
        "--testing_set",
        type=str,
        help="train, test, golden_test",
        default="test",
    )
    parser.add_argument(
        "--test_list_file",
        type=str,
        help="convert from the test list file",
        default=None,
    )
    parser.add_argument(
        "--speaker_name",
        type=str,
        default=None,
        help="speaker name for multi-speaker synthesis, for single-sentence mode only",
    )
    parser.add_argument(
        "--text",
        help="Text to be synthesized.",
        type=str,
        default="",
    )
    parser.add_argument(
        "--vocoder_dir",
        type=str,
        default=None,
        help="Vocoder checkpoint directory. Searching behavior is the same as "
        "the acoustics one.",
    )
    parser.add_argument(
        "--acoustics_dir",
        type=str,
        default=None,
        help="Acoustic model checkpoint directory. If a directory is given, "
        "search for the latest checkpoint dir in the directory. If a specific "
        "checkpoint dir is given, directly load the checkpoint.",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default=None,
        help="Acoustic model checkpoint directory. If a directory is given, "
        "search for the latest checkpoint dir in the directory. If a specific "
        "checkpoint dir is given, directly load the checkpoint.",
    )
    parser.add_argument(
        "--mode",
        type=str,
        choices=["batch", "single"],
        required=True,
        help="Synthesize a whole dataset or a single sentence",
    )
    parser.add_argument(
        "--log_level",
        type=str,
        default="warning",
        help="Logging level. Default: warning",
    )
    # Prosody controls below only apply to models that expose them
    # (e.g. FastSpeech2-style variance adaptors).
    parser.add_argument(
        "--pitch_control",
        type=float,
        default=1.0,
        help="control the pitch of the whole utterance, larger value for higher pitch",
    )
    parser.add_argument(
        "--energy_control",
        type=float,
        default=1.0,
        help="control the energy of the whole utterance, larger value for larger volume",
    )
    parser.add_argument(
        "--duration_control",
        type=float,
        default=1.0,
        help="control the speed of the whole utterance, larger value for slower speaking rate",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="Output dir for saving generated results",
    )
    return parser
143
+
144
+
145
def main():
    """Parse CLI arguments (incl. model-specific ones) and run TTS inference."""
    # Parse arguments
    parser = build_parser()
    # Let model-specific inference classes register their extra flags.
    VALLEInference.add_arguments(parser)
    NS2Inference.add_arguments(parser)
    args = parser.parse_args()
    print(args)

    # Parse config
    cfg = load_config(args.config)

    # CUDA settings
    cuda_relevant()

    # Build inference
    inferencer = build_inference(args, cfg)

    # Run inference
    inferencer.inference()
164
+
165
+
166
+ if __name__ == "__main__":
167
+ main()
utils/Amphion/bins/tts/preprocess.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ import pyworld as pw
14
+ from multiprocessing import cpu_count
15
+
16
+
17
+ from utils.util import load_config
18
+ from preprocessors.processor import preprocess_dataset, prepare_align
19
+ from preprocessors.metadata import cal_metadata
20
+ from processors import (
21
+ acoustic_extractor,
22
+ content_extractor,
23
+ data_augment,
24
+ phone_extractor,
25
+ )
26
+
27
+
28
+ def extract_acoustic_features(dataset, output_path, cfg, dataset_types, n_workers=1):
29
+ """Extract acoustic features of utterances in the dataset
30
+
31
+ Args:
32
+ dataset (str): name of dataset, e.g. opencpop
33
+ output_path (str): directory that stores train, test and feature files of datasets
34
+ cfg (dict): dictionary that stores configurations
35
+ n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
36
+ """
37
+
38
+ metadata = []
39
+ for dataset_type in dataset_types:
40
+ dataset_output = os.path.join(output_path, dataset)
41
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
42
+ with open(dataset_file, "r") as f:
43
+ metadata.extend(json.load(f))
44
+
45
+ # acoustic_extractor.extract_utt_acoustic_features_parallel(
46
+ # metadata, dataset_output, cfg, n_workers=n_workers
47
+ # )
48
+ acoustic_extractor.extract_utt_acoustic_features_serial(
49
+ metadata, dataset_output, cfg
50
+ )
51
+
52
+
53
+ def extract_content_features(dataset, output_path, cfg, dataset_types, num_workers=1):
54
+ """Extract content features of utterances in the dataset
55
+
56
+ Args:
57
+ dataset (str): name of dataset, e.g. opencpop
58
+ output_path (str): directory that stores train, test and feature files of datasets
59
+ cfg (dict): dictionary that stores configurations
60
+ """
61
+
62
+ metadata = []
63
+ for dataset_type in dataset_types:
64
+ dataset_output = os.path.join(output_path, dataset)
65
+ # dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
66
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
67
+ with open(dataset_file, "r") as f:
68
+ metadata.extend(json.load(f))
69
+
70
+ content_extractor.extract_utt_content_features_dataloader(
71
+ cfg, metadata, num_workers
72
+ )
73
+
74
+
75
+ def extract_phonme_sequences(dataset, output_path, cfg, dataset_types):
76
+ """Extract phoneme features of utterances in the dataset
77
+
78
+ Args:
79
+ dataset (str): name of dataset, e.g. opencpop
80
+ output_path (str): directory that stores train, test and feature files of datasets
81
+ cfg (dict): dictionary that stores configurations
82
+
83
+ """
84
+
85
+ metadata = []
86
+ for dataset_type in dataset_types:
87
+ dataset_output = os.path.join(output_path, dataset)
88
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
89
+ with open(dataset_file, "r") as f:
90
+ metadata.extend(json.load(f))
91
+ phone_extractor.extract_utt_phone_sequence(dataset, cfg, metadata)
92
+
93
+
94
+ def preprocess(cfg, args):
95
+ """Preprocess raw data of single or multiple datasets (in cfg.dataset)
96
+
97
+ Args:
98
+ cfg (dict): dictionary that stores configurations
99
+ args (ArgumentParser): specify the configuration file and num_workers
100
+ """
101
+ # Specify the output root path to save the processed data
102
+ output_path = cfg.preprocess.processed_dir
103
+ os.makedirs(output_path, exist_ok=True)
104
+
105
+ # Split train and test sets
106
+ for dataset in cfg.dataset:
107
+ print("Preprocess {}...".format(dataset))
108
+
109
+ if args.prepare_alignment:
110
+ # Prepare alignment with MFA
111
+ print("Prepare alignment {}...".format(dataset))
112
+ prepare_align(
113
+ dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
114
+ )
115
+
116
+ preprocess_dataset(
117
+ dataset,
118
+ cfg.dataset_path[dataset],
119
+ output_path,
120
+ cfg.preprocess,
121
+ cfg.task_type,
122
+ is_custom_dataset=dataset in cfg.use_custom_dataset,
123
+ )
124
+
125
+ # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
126
+ try:
127
+ assert isinstance(
128
+ cfg.preprocess.data_augment, list
129
+ ), "Please provide a list of datasets need to be augmented."
130
+ if len(cfg.preprocess.data_augment) > 0:
131
+ new_datasets_list = []
132
+ for dataset in cfg.preprocess.data_augment:
133
+ new_datasets = data_augment.augment_dataset(cfg, dataset)
134
+ new_datasets_list.extend(new_datasets)
135
+ cfg.dataset.extend(new_datasets_list)
136
+ print("Augmentation datasets: ", cfg.dataset)
137
+ except:
138
+ print("No Data Augmentation.")
139
+
140
+ # json files
141
+ dataset_types = list()
142
+ dataset_types.append((cfg.preprocess.train_file).split(".")[0])
143
+ dataset_types.append((cfg.preprocess.valid_file).split(".")[0])
144
+ if "test" not in dataset_types:
145
+ dataset_types.append("test")
146
+ if "eval" in dataset:
147
+ dataset_types = ["test"]
148
+
149
+ # Dump metadata of datasets (singers, train/test durations, etc.)
150
+ cal_metadata(cfg, dataset_types)
151
+
152
+ # Prepare the acoustic features
153
+ for dataset in cfg.dataset:
154
+ # Skip augmented datasets which do not need to extract acoustic features
155
+ # We will copy acoustic features from the original dataset later
156
+ if (
157
+ "pitch_shift" in dataset
158
+ or "formant_shift" in dataset
159
+ or "equalizer" in dataset in dataset
160
+ ):
161
+ continue
162
+ print(
163
+ "Extracting acoustic features for {} using {} workers ...".format(
164
+ dataset, args.num_workers
165
+ )
166
+ )
167
+ extract_acoustic_features(
168
+ dataset, output_path, cfg, dataset_types, args.num_workers
169
+ )
170
+ # Calculate the statistics of acoustic features
171
+ if cfg.preprocess.mel_min_max_norm:
172
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
173
+
174
+ if cfg.preprocess.extract_pitch:
175
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
176
+
177
+ if cfg.preprocess.extract_energy:
178
+ acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)
179
+
180
+ if cfg.preprocess.pitch_norm:
181
+ acoustic_extractor.normalize(dataset, cfg.preprocess.pitch_dir, cfg)
182
+
183
+ if cfg.preprocess.energy_norm:
184
+ acoustic_extractor.normalize(dataset, cfg.preprocess.energy_dir, cfg)
185
+
186
+ # Copy acoustic features for augmented datasets by creating soft-links
187
+ for dataset in cfg.dataset:
188
+ if "pitch_shift" in dataset:
189
+ src_dataset = dataset.replace("_pitch_shift", "")
190
+ src_dataset_dir = os.path.join(output_path, src_dataset)
191
+ elif "formant_shift" in dataset:
192
+ src_dataset = dataset.replace("_formant_shift", "")
193
+ src_dataset_dir = os.path.join(output_path, src_dataset)
194
+ elif "equalizer" in dataset:
195
+ src_dataset = dataset.replace("_equalizer", "")
196
+ src_dataset_dir = os.path.join(output_path, src_dataset)
197
+ else:
198
+ continue
199
+ dataset_dir = os.path.join(output_path, dataset)
200
+ metadata = []
201
+ for split in ["train", "test"] if not "eval" in dataset else ["test"]:
202
+ metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
203
+ with open(metadata_file_path, "r") as f:
204
+ metadata.extend(json.load(f))
205
+ print("Copying acoustic features for {}...".format(dataset))
206
+ acoustic_extractor.copy_acoustic_features(
207
+ metadata, dataset_dir, src_dataset_dir, cfg
208
+ )
209
+ if cfg.preprocess.mel_min_max_norm:
210
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
211
+
212
+ if cfg.preprocess.extract_pitch:
213
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
214
+
215
+ # Prepare the content features
216
+ for dataset in cfg.dataset:
217
+ print("Extracting content features for {}...".format(dataset))
218
+ extract_content_features(
219
+ dataset, output_path, cfg, dataset_types, args.num_workers
220
+ )
221
+
222
+ # Prepare the phenome squences
223
+ if cfg.preprocess.extract_phone:
224
+ for dataset in cfg.dataset:
225
+ print("Extracting phoneme sequence for {}...".format(dataset))
226
+ extract_phonme_sequences(dataset, output_path, cfg, dataset_types)
227
+
228
+
229
+ def main():
230
+ parser = argparse.ArgumentParser()
231
+ parser.add_argument(
232
+ "--config", default="config.json", help="json files for configurations."
233
+ )
234
+ parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
235
+ parser.add_argument("--prepare_alignment", type=bool, default=False)
236
+
237
+ args = parser.parse_args()
238
+ cfg = load_config(args.config)
239
+
240
+ preprocess(cfg, args)
241
+
242
+
243
+ if __name__ == "__main__":
244
+ main()
utils/Amphion/bins/tts/train.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+
8
+ import torch
9
+
10
+ from models.tts.fastspeech2.fs2_trainer import FastSpeech2Trainer
11
+ from models.tts.vits.vits_trainer import VITSTrainer
12
+ from models.tts.valle.valle_trainer import VALLETrainer
13
+ from models.tts.naturalspeech2.ns2_trainer import NS2Trainer
14
+ from utils.util import load_config
15
+
16
+
17
+ def build_trainer(args, cfg):
18
+ supported_trainer = {
19
+ "FastSpeech2": FastSpeech2Trainer,
20
+ "VITS": VITSTrainer,
21
+ "VALLE": VALLETrainer,
22
+ "NaturalSpeech2": NS2Trainer,
23
+ }
24
+
25
+ trainer_class = supported_trainer[cfg.model_type]
26
+ trainer = trainer_class(args, cfg)
27
+ return trainer
28
+
29
+
30
+ def cuda_relevant(deterministic=False):
31
+ torch.cuda.empty_cache()
32
+ # TF32 on Ampere and above
33
+ torch.backends.cuda.matmul.allow_tf32 = True
34
+ torch.backends.cudnn.enabled = True
35
+ torch.backends.cudnn.allow_tf32 = True
36
+ # Deterministic
37
+ torch.backends.cudnn.deterministic = deterministic
38
+ torch.backends.cudnn.benchmark = not deterministic
39
+ torch.use_deterministic_algorithms(deterministic)
40
+
41
+
42
+ def main():
43
+ parser = argparse.ArgumentParser()
44
+ parser.add_argument(
45
+ "--config",
46
+ default="config.json",
47
+ help="json files for configurations.",
48
+ required=True,
49
+ )
50
+ parser.add_argument(
51
+ "--exp_name",
52
+ type=str,
53
+ default="exp_name",
54
+ help="A specific name to note the experiment",
55
+ required=True,
56
+ )
57
+ parser.add_argument(
58
+ "--resume", action="store_true", help="The model name to restore"
59
+ )
60
+ parser.add_argument(
61
+ "--log_level", default="warning", help="logging level (debug, info, warning)"
62
+ )
63
+ parser.add_argument(
64
+ "--resume_type",
65
+ type=str,
66
+ default="resume",
67
+ help="Resume training or finetuning.",
68
+ )
69
+ parser.add_argument(
70
+ "--checkpoint_path",
71
+ type=str,
72
+ default=None,
73
+ help="Checkpoint for resume training or finetuning.",
74
+ )
75
+
76
+ VALLETrainer.add_arguments(parser)
77
+ args = parser.parse_args()
78
+ cfg = load_config(args.config)
79
+
80
+ # Data Augmentation
81
+ if (
82
+ type(cfg.preprocess.data_augment) == list
83
+ and len(cfg.preprocess.data_augment) > 0
84
+ ):
85
+ new_datasets_list = []
86
+ for dataset in cfg.preprocess.data_augment:
87
+ new_datasets = [
88
+ f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
89
+ (
90
+ f"{dataset}_formant_shift"
91
+ if cfg.preprocess.use_formant_shift
92
+ else None
93
+ ),
94
+ f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
95
+ f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
96
+ ]
97
+ new_datasets_list.extend(filter(None, new_datasets))
98
+ cfg.dataset.extend(new_datasets_list)
99
+
100
+ # # CUDA settings
101
+ cuda_relevant()
102
+
103
+ # Build trainer
104
+ trainer = build_trainer(args, cfg)
105
+ torch.set_num_threads(1)
106
+ torch.set_num_interop_threads(1)
107
+ trainer.train_loop()
108
+
109
+
110
+ if __name__ == "__main__":
111
+ main()
utils/Amphion/bins/vocoder/inference.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+
9
+ import torch
10
+
11
+ from models.vocoders.vocoder_inference import VocoderInference
12
+ from utils.util import load_config
13
+
14
+
15
+ def build_inference(args, cfg, infer_type="infer_from_dataset"):
16
+ supported_inference = {
17
+ "GANVocoder": VocoderInference,
18
+ "DiffusionVocoder": VocoderInference,
19
+ }
20
+
21
+ inference_class = supported_inference[cfg.model_type]
22
+ return inference_class(args, cfg, infer_type)
23
+
24
+
25
+ def cuda_relevant(deterministic=False):
26
+ torch.cuda.empty_cache()
27
+ # TF32 on Ampere and above
28
+ torch.backends.cuda.matmul.allow_tf32 = True
29
+ torch.backends.cudnn.enabled = True
30
+ torch.backends.cudnn.allow_tf32 = True
31
+ # Deterministic
32
+ torch.backends.cudnn.deterministic = deterministic
33
+ torch.backends.cudnn.benchmark = not deterministic
34
+ torch.use_deterministic_algorithms(deterministic)
35
+
36
+
37
+ def build_parser():
38
+ r"""Build argument parser for inference.py.
39
+ Anything else should be put in an extra config YAML file.
40
+ """
41
+
42
+ parser = argparse.ArgumentParser()
43
+ parser.add_argument(
44
+ "--config",
45
+ type=str,
46
+ required=True,
47
+ help="JSON/YAML file for configurations.",
48
+ )
49
+ parser.add_argument(
50
+ "--infer_mode",
51
+ type=str,
52
+ required=None,
53
+ )
54
+ parser.add_argument(
55
+ "--infer_datasets",
56
+ nargs="+",
57
+ default=None,
58
+ )
59
+ parser.add_argument(
60
+ "--feature_folder",
61
+ type=str,
62
+ default=None,
63
+ )
64
+ parser.add_argument(
65
+ "--audio_folder",
66
+ type=str,
67
+ default=None,
68
+ )
69
+ parser.add_argument(
70
+ "--vocoder_dir",
71
+ type=str,
72
+ required=True,
73
+ help="Vocoder checkpoint directory. Searching behavior is the same as "
74
+ "the acoustics one.",
75
+ )
76
+ parser.add_argument(
77
+ "--output_dir",
78
+ type=str,
79
+ default="result",
80
+ help="Output directory. Default: ./result",
81
+ )
82
+ parser.add_argument(
83
+ "--log_level",
84
+ type=str,
85
+ default="warning",
86
+ help="Logging level. Default: warning",
87
+ )
88
+ parser.add_argument(
89
+ "--keep_cache",
90
+ action="store_true",
91
+ default=False,
92
+ help="Keep cache files. Only applicable to inference from files.",
93
+ )
94
+ return parser
95
+
96
+
97
+ def main():
98
+ # Parse arguments
99
+ args = build_parser().parse_args()
100
+
101
+ # Parse config
102
+ cfg = load_config(args.config)
103
+
104
+ # CUDA settings
105
+ cuda_relevant()
106
+
107
+ # Build inference
108
+ trainer = build_inference(args, cfg, args.infer_mode)
109
+
110
+ # Run inference
111
+ trainer.inference()
112
+
113
+
114
+ if __name__ == "__main__":
115
+ main()
utils/Amphion/bins/vocoder/preprocess.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ import pyworld as pw
14
+ from multiprocessing import cpu_count
15
+
16
+
17
+ from utils.util import load_config
18
+ from preprocessors.processor import preprocess_dataset, prepare_align
19
+ from preprocessors.metadata import cal_metadata
20
+ from processors import acoustic_extractor, content_extractor, data_augment
21
+
22
+
23
+ def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
24
+ """Extract acoustic features of utterances in the dataset
25
+
26
+ Args:
27
+ dataset (str): name of dataset, e.g. opencpop
28
+ output_path (str): directory that stores train, test and feature files of datasets
29
+ cfg (dict): dictionary that stores configurations
30
+ n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
31
+ """
32
+ types = ["train", "test"] if "eval" not in dataset else ["test"]
33
+ metadata = []
34
+ for dataset_type in types:
35
+ dataset_output = os.path.join(output_path, dataset)
36
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
37
+ with open(dataset_file, "r") as f:
38
+ metadata.extend(json.load(f))
39
+
40
+ acoustic_extractor.extract_utt_acoustic_features_serial(
41
+ metadata, dataset_output, cfg
42
+ )
43
+
44
+
45
+ def preprocess(cfg, args):
46
+ """Proprocess raw data of single or multiple datasets (in cfg.dataset)
47
+
48
+ Args:
49
+ cfg (dict): dictionary that stores configurations
50
+ args (ArgumentParser): specify the configuration file and num_workers
51
+ """
52
+ # Specify the output root path to save the processed data
53
+ output_path = cfg.preprocess.processed_dir
54
+ os.makedirs(output_path, exist_ok=True)
55
+
56
+ ## Split train and test sets
57
+ for dataset in cfg.dataset:
58
+ print("Preprocess {}...".format(dataset))
59
+
60
+ preprocess_dataset(
61
+ dataset,
62
+ cfg.dataset_path[dataset],
63
+ output_path,
64
+ cfg.preprocess,
65
+ cfg.task_type,
66
+ is_custom_dataset=dataset in cfg.use_custom_dataset,
67
+ )
68
+
69
+ # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
70
+ try:
71
+ assert isinstance(
72
+ cfg.preprocess.data_augment, list
73
+ ), "Please provide a list of datasets need to be augmented."
74
+ if len(cfg.preprocess.data_augment) > 0:
75
+ new_datasets_list = []
76
+ for dataset in cfg.preprocess.data_augment:
77
+ new_datasets = data_augment.augment_dataset(cfg, dataset)
78
+ new_datasets_list.extend(new_datasets)
79
+ cfg.dataset.extend(new_datasets_list)
80
+ print("Augmentation datasets: ", cfg.dataset)
81
+ except:
82
+ print("No Data Augmentation.")
83
+
84
+ # Dump metadata of datasets (singers, train/test durations, etc.)
85
+ cal_metadata(cfg)
86
+
87
+ ## Prepare the acoustic features
88
+ for dataset in cfg.dataset:
89
+ # Skip augmented datasets which do not need to extract acoustic features
90
+ # We will copy acoustic features from the original dataset later
91
+ if (
92
+ "pitch_shift" in dataset
93
+ or "formant_shift" in dataset
94
+ or "equalizer" in dataset in dataset
95
+ ):
96
+ continue
97
+ print(
98
+ "Extracting acoustic features for {} using {} workers ...".format(
99
+ dataset, args.num_workers
100
+ )
101
+ )
102
+ extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
103
+ # Calculate the statistics of acoustic features
104
+ if cfg.preprocess.mel_min_max_norm:
105
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
106
+
107
+ # Copy acoustic features for augmented datasets by creating soft-links
108
+ for dataset in cfg.dataset:
109
+ if "pitch_shift" in dataset:
110
+ src_dataset = dataset.replace("_pitch_shift", "")
111
+ src_dataset_dir = os.path.join(output_path, src_dataset)
112
+ elif "formant_shift" in dataset:
113
+ src_dataset = dataset.replace("_formant_shift", "")
114
+ src_dataset_dir = os.path.join(output_path, src_dataset)
115
+ elif "equalizer" in dataset:
116
+ src_dataset = dataset.replace("_equalizer", "")
117
+ src_dataset_dir = os.path.join(output_path, src_dataset)
118
+ else:
119
+ continue
120
+ dataset_dir = os.path.join(output_path, dataset)
121
+ metadata = []
122
+ for split in ["train", "test"] if not "eval" in dataset else ["test"]:
123
+ metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
124
+ with open(metadata_file_path, "r") as f:
125
+ metadata.extend(json.load(f))
126
+ print("Copying acoustic features for {}...".format(dataset))
127
+ acoustic_extractor.copy_acoustic_features(
128
+ metadata, dataset_dir, src_dataset_dir, cfg
129
+ )
130
+ if cfg.preprocess.mel_min_max_norm:
131
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
132
+
133
+ if cfg.preprocess.extract_pitch:
134
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
135
+
136
+
137
+ def main():
138
+ parser = argparse.ArgumentParser()
139
+ parser.add_argument(
140
+ "--config", default="config.json", help="json files for configurations."
141
+ )
142
+ parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
143
+
144
+ args = parser.parse_args()
145
+ cfg = load_config(args.config)
146
+
147
+ preprocess(cfg, args)
148
+
149
+
150
+ if __name__ == "__main__":
151
+ main()
utils/Amphion/bins/vocoder/train.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+
8
+ import torch
9
+
10
+ from models.vocoders.gan.gan_vocoder_trainer import GANVocoderTrainer
11
+ from models.vocoders.diffusion.diffusion_vocoder_trainer import DiffusionVocoderTrainer
12
+
13
+ from utils.util import load_config
14
+
15
+
16
+ def build_trainer(args, cfg):
17
+ supported_trainer = {
18
+ "GANVocoder": GANVocoderTrainer,
19
+ "DiffusionVocoder": DiffusionVocoderTrainer,
20
+ }
21
+
22
+ trainer_class = supported_trainer[cfg.model_type]
23
+ trainer = trainer_class(args, cfg)
24
+ return trainer
25
+
26
+
27
+ def cuda_relevant(deterministic=False):
28
+ torch.cuda.empty_cache()
29
+ # TF32 on Ampere and above
30
+ torch.backends.cuda.matmul.allow_tf32 = True
31
+ torch.backends.cudnn.enabled = True
32
+ torch.backends.cudnn.allow_tf32 = True
33
+ # Deterministic
34
+ torch.backends.cudnn.deterministic = deterministic
35
+ torch.backends.cudnn.benchmark = not deterministic
36
+ torch.use_deterministic_algorithms(deterministic)
37
+
38
+
39
+ def main():
40
+ parser = argparse.ArgumentParser()
41
+ parser.add_argument(
42
+ "--config",
43
+ default="config.json",
44
+ help="json files for configurations.",
45
+ required=True,
46
+ )
47
+ parser.add_argument(
48
+ "--exp_name",
49
+ type=str,
50
+ default="exp_name",
51
+ help="A specific name to note the experiment",
52
+ required=True,
53
+ )
54
+ parser.add_argument(
55
+ "--resume_type",
56
+ type=str,
57
+ help="resume for continue to train, finetune for finetuning",
58
+ )
59
+ parser.add_argument(
60
+ "--checkpoint",
61
+ type=str,
62
+ help="checkpoint to resume",
63
+ )
64
+ parser.add_argument(
65
+ "--log_level", default="warning", help="logging level (debug, info, warning)"
66
+ )
67
+ args = parser.parse_args()
68
+ cfg = load_config(args.config)
69
+
70
+ # Data Augmentation
71
+ if cfg.preprocess.data_augment:
72
+ new_datasets_list = []
73
+ for dataset in cfg.preprocess.data_augment:
74
+ new_datasets = [
75
+ # f"{dataset}_pitch_shift",
76
+ # f"{dataset}_formant_shift",
77
+ f"{dataset}_equalizer",
78
+ f"{dataset}_time_stretch",
79
+ ]
80
+ new_datasets_list.extend(new_datasets)
81
+ cfg.dataset.extend(new_datasets_list)
82
+
83
+ # CUDA settings
84
+ cuda_relevant()
85
+
86
+ # Build trainer
87
+ trainer = build_trainer(args, cfg)
88
+
89
+ trainer.train_loop()
90
+
91
+
92
+ if __name__ == "__main__":
93
+ main()
utils/Amphion/config/audioldm.json ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AudioLDM",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false,
20
+ "cond_mask_prob": 0.1
21
+ },
22
+ // model
23
+ "model": {
24
+ "audioldm": {
25
+ "image_size": 32,
26
+ "in_channels": 4,
27
+ "out_channels": 4,
28
+ "model_channels": 256,
29
+ "attention_resolutions": [
30
+ 4,
31
+ 2,
32
+ 1
33
+ ],
34
+ "num_res_blocks": 2,
35
+ "channel_mult": [
36
+ 1,
37
+ 2,
38
+ 4
39
+ ],
40
+ "num_heads": 8,
41
+ "use_spatial_transformer": true,
42
+ "transformer_depth": 1,
43
+ "context_dim": 768,
44
+ "use_checkpoint": true,
45
+ "legacy": false
46
+ },
47
+ "autoencoderkl": {
48
+ "ch": 128,
49
+ "ch_mult": [
50
+ 1,
51
+ 1,
52
+ 2,
53
+ 2,
54
+ 4
55
+ ],
56
+ "num_res_blocks": 2,
57
+ "in_channels": 1,
58
+ "z_channels": 4,
59
+ "out_ch": 1,
60
+ "double_z": true
61
+ },
62
+ "noise_scheduler": {
63
+ "num_train_timesteps": 1000,
64
+ "beta_start": 0.00085,
65
+ "beta_end": 0.012,
66
+ "beta_schedule": "scaled_linear",
67
+ "clip_sample": false,
68
+ "steps_offset": 1,
69
+ "set_alpha_to_one": false,
70
+ "skip_prk_steps": true,
71
+ "prediction_type": "epsilon"
72
+ }
73
+ },
74
+ // train
75
+ "train": {
76
+ "lronPlateau": {
77
+ "factor": 0.9,
78
+ "patience": 100,
79
+ "min_lr": 4.0e-5,
80
+ "verbose": true
81
+ },
82
+ "adam": {
83
+ "lr": 5.0e-5,
84
+ "betas": [
85
+ 0.9,
86
+ 0.999
87
+ ],
88
+ "weight_decay": 1.0e-2,
89
+ "eps": 1.0e-8
90
+ }
91
+ }
92
+ }
utils/Amphion/config/autoencoderkl.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AutoencoderKL",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false
20
+ },
21
+ // model
22
+ "model": {
23
+ "autoencoderkl": {
24
+ "ch": 128,
25
+ "ch_mult": [
26
+ 1,
27
+ 1,
28
+ 2,
29
+ 2,
30
+ 4
31
+ ],
32
+ "num_res_blocks": 2,
33
+ "in_channels": 1,
34
+ "z_channels": 4,
35
+ "out_ch": 1,
36
+ "double_z": true
37
+ },
38
+ "loss": {
39
+ "kl_weight": 1e-8,
40
+ "disc_weight": 0.5,
41
+ "disc_factor": 1.0,
42
+ "logvar_init": 0.0,
43
+ "min_adapt_d_weight": 0.0,
44
+ "max_adapt_d_weight": 10.0,
45
+ "disc_start": 50001,
46
+ "disc_in_channels": 1,
47
+ "disc_num_layers": 3,
48
+ "use_actnorm": false
49
+ }
50
+ },
51
+ // train
52
+ "train": {
53
+ "lronPlateau": {
54
+ "factor": 0.9,
55
+ "patience": 100,
56
+ "min_lr": 4.0e-5,
57
+ "verbose": true
58
+ },
59
+ "adam": {
60
+ "lr": 4.0e-4,
61
+ "betas": [
62
+ 0.9,
63
+ 0.999
64
+ ],
65
+ "weight_decay": 1.0e-2,
66
+ "eps": 1.0e-8
67
+ }
68
+ }
69
+ }
utils/Amphion/config/base.json ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "supported_model_type": [
3
+ "GANVocoder",
4
+ "Fastspeech2",
5
+ "DiffSVC",
6
+ "Transformer",
7
+ "EDM",
8
+ "CD"
9
+ ],
10
+ "task_type": "",
11
+ "dataset": [],
12
+ "use_custom_dataset": [],
13
+ "preprocess": {
14
+ "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
15
+ // trim audio silence
16
+ "data_augment": false,
17
+ "trim_silence": false,
18
+ "num_silent_frames": 8,
19
+ "trim_fft_size": 512, // fft size used in trimming
20
+ "trim_hop_size": 128, // hop size used in trimming
21
+ "trim_top_db": 30, // top db used in trimming sensitive to each dataset
22
+ // acoustic features
23
+ "extract_mel": false,
24
+ "mel_extract_mode": "",
25
+ "extract_linear_spec": false,
26
+ "extract_mcep": false,
27
+ "extract_pitch": false,
28
+ "extract_acoustic_token": false,
29
+ "pitch_remove_outlier": false,
30
+ "extract_uv": false,
31
+ "pitch_norm": false,
32
+ "extract_audio": false,
33
+ "extract_label": false,
34
+ "pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
35
+ "extract_energy": false,
36
+ "energy_remove_outlier": false,
37
+ "energy_norm": false,
38
+ "energy_extract_mode": "from_mel",
39
+ "extract_duration": false,
40
+ "extract_amplitude_phase": false,
41
+ "mel_min_max_norm": false,
42
+ // lingusitic features
43
+ "extract_phone": false,
44
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
45
+ // content features
46
+ "extract_whisper_feature": false,
47
+ "extract_contentvec_feature": false,
48
+ "extract_mert_feature": false,
49
+ "extract_wenet_feature": false,
50
+ // Settings for data preprocessing
51
+ "n_mel": 80,
52
+ "win_size": 480,
53
+ "hop_size": 120,
54
+ "sample_rate": 24000,
55
+ "n_fft": 1024,
56
+ "fmin": 0,
57
+ "fmax": 12000,
58
+ "min_level_db": -115,
59
+ "ref_level_db": 20,
60
+ "bits": 8,
61
+ // Directory names of processed data or extracted features
62
+ "processed_dir": "processed_data",
63
+ "trimmed_wav_dir": "trimmed_wavs", // directory name of silence trimed wav
64
+ "raw_data": "raw_data",
65
+ "phone_dir": "phones",
66
+ "wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
67
+ "audio_dir": "audios",
68
+ "log_amplitude_dir": "log_amplitudes",
69
+ "phase_dir": "phases",
70
+ "real_dir": "reals",
71
+ "imaginary_dir": "imaginarys",
72
+ "label_dir": "labels",
73
+ "linear_dir": "linears",
74
+ "mel_dir": "mels", // directory name of extraced mel features
75
+ "mcep_dir": "mcep", // directory name of extraced mcep features
76
+ "dur_dir": "durs",
77
+ "symbols_dict": "symbols.dict",
78
+ "lab_dir": "labs", // directory name of extraced label features
79
+ "wenet_dir": "wenet", // directory name of extraced wenet features
80
+ "contentvec_dir": "contentvec", // directory name of extraced wenet features
81
+ "pitch_dir": "pitches", // directory name of extraced pitch features
82
+ "energy_dir": "energys", // directory name of extracted energy features
83
+ "phone_pitch_dir": "phone_pitches", // directory name of extraced pitch features
84
+ "phone_energy_dir": "phone_energys", // directory name of extracted energy features
85
+ "uv_dir": "uvs", // directory name of extracted unvoiced features
86
+ "duration_dir": "duration", // ground-truth duration file
87
+ "phone_seq_file": "phone_seq_file", // phoneme sequence file
88
+ "file_lst": "file.lst",
89
+ "train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
90
+ "valid_file": "valid.json", // validattion set
91
+ "spk2id": "spk2id.json", // used for multi-speaker dataset
92
+ "utt2spk": "utt2spk", // used for multi-speaker dataset
93
+ "emo2id": "emo2id.json", // used for multi-emotion dataset
94
+ "utt2emo": "utt2emo", // used for multi-emotion dataset
95
+ // Features used for model training
96
+ "use_text": false,
97
+ "use_phone": false,
98
+ "use_phn_seq": false,
99
+ "use_lab": false,
100
+ "use_linear": false,
101
+ "use_mel": false,
102
+ "use_min_max_norm_mel": false,
103
+ "use_wav": false,
104
+ "use_phone_pitch": false,
105
+ "use_log_scale_pitch": false,
106
+ "use_phone_energy": false,
107
+ "use_phone_duration": false,
108
+ "use_log_scale_energy": false,
109
+ "use_wenet": false,
110
+ "use_dur": false,
111
+ "use_spkid": false, // True: use speaker id for multi-speaker dataset
112
+ "use_emoid": false, // True: use emotion id for multi-emotion dataset
113
+ "use_frame_pitch": false,
114
+ "use_uv": false,
115
+ "use_frame_energy": false,
116
+ "use_frame_duration": false,
117
+ "use_audio": false,
118
+ "use_label": false,
119
+ "use_one_hot": false,
120
+ "use_amplitude_phase": false,
121
+ "align_mel_duration": false
122
+ },
123
+ "train": {
124
+ "ddp": true,
125
+ "batch_size": 16,
126
+ "max_steps": 1000000,
127
+ // Trackers
128
+ "tracker": [
129
+ "tensorboard"
130
+ // "wandb",
131
+ // "cometml",
132
+ // "mlflow",
133
+ ],
134
+ "max_epoch": -1,
135
+ // -1 means no limit
136
+ "save_checkpoint_stride": [
137
+ 5,
138
+ 20
139
+ ],
140
+ // unit is epoch
141
+ "keep_last": [
142
+ 3,
143
+ -1
144
+ ],
145
+ // -1 means infinite, if one number will broadcast
146
+ "run_eval": [
147
+ false,
148
+ true
149
+ ],
150
+ // if one number will broadcast
151
+ // Fix the random seed
152
+ "random_seed": 10086,
153
+ // Optimizer
154
+ "optimizer": "AdamW",
155
+ "adamw": {
156
+ "lr": 4.0e-4
157
+ // nn model lr
158
+ },
159
+ // LR Scheduler
160
+ "scheduler": "ReduceLROnPlateau",
161
+ "reducelronplateau": {
162
+ "factor": 0.8,
163
+ "patience": 10,
164
+ // unit is epoch
165
+ "min_lr": 1.0e-4
166
+ },
167
+ // Batchsampler
168
+ "sampler": {
169
+ "holistic_shuffle": true,
170
+ "drop_last": true
171
+ },
172
+ // Dataloader
173
+ "dataloader": {
174
+ "num_worker": 32,
175
+ "pin_memory": true
176
+ },
177
+ "gradient_accumulation_step": 1,
178
+ "total_training_steps": 50000,
179
+ "save_summary_steps": 500,
180
+ "save_checkpoints_steps": 10000,
181
+ "valid_interval": 10000,
182
+ "keep_checkpoint_max": 5,
183
+ "multi_speaker_training": false // True: train multi-speaker model; False: training single-speaker model;
184
+ }
185
+ }