shawnpi committed on
Commit
1cd928a
·
verified ·
1 Parent(s): 250b5b6

Upload 753 files

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +20 -0
  2. configs/hifigan_config.json +38 -0
  3. configs/hq_svc_infer.yaml +25 -0
  4. demo/singing/1.wav +3 -0
  5. demo/singing/2.wav +3 -0
  6. demo/speech/1.wav +3 -0
  7. demo/speech/2.wav +3 -0
  8. images/kon-new.gif +3 -0
  9. logger/__init__.py +0 -0
  10. logger/__pycache__/__init__.cpython-310.pyc +0 -0
  11. logger/__pycache__/__init__.cpython-38.pyc +0 -0
  12. logger/__pycache__/__init__.cpython-39.pyc +0 -0
  13. logger/__pycache__/saver.cpython-38.pyc +0 -0
  14. logger/__pycache__/saver.cpython-39.pyc +0 -0
  15. logger/__pycache__/utils.cpython-310.pyc +0 -0
  16. logger/__pycache__/utils.cpython-38.pyc +0 -0
  17. logger/__pycache__/utils.cpython-39.pyc +0 -0
  18. logger/saver.py +150 -0
  19. logger/utils.py +128 -0
  20. utils/Amphion/.github/CODE_OF_CONDUCT.md +132 -0
  21. utils/Amphion/.github/CONTRIBUTING.md +77 -0
  22. utils/Amphion/.github/ISSUE_TEMPLATE/bug_report.md +32 -0
  23. utils/Amphion/.github/ISSUE_TEMPLATE/docs_feedback.md +17 -0
  24. utils/Amphion/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  25. utils/Amphion/.github/ISSUE_TEMPLATE/help_wanted.md +32 -0
  26. utils/Amphion/.github/pull_request_template.md +32 -0
  27. utils/Amphion/.github/workflows/check_format.yml +12 -0
  28. utils/Amphion/.gitignore +64 -0
  29. utils/Amphion/Dockerfile +64 -0
  30. utils/Amphion/LICENSE +21 -0
  31. utils/Amphion/README.md +163 -0
  32. utils/Amphion/__init__.py +0 -0
  33. utils/Amphion/__pycache__/__init__.cpython-38.pyc +0 -0
  34. utils/Amphion/__pycache__/__init__.cpython-39.pyc +0 -0
  35. utils/Amphion/bins/calc_metrics.py +268 -0
  36. utils/Amphion/bins/svc/inference.py +265 -0
  37. utils/Amphion/bins/svc/preprocess.py +183 -0
  38. utils/Amphion/bins/svc/train.py +111 -0
  39. utils/Amphion/bins/tta/inference.py +94 -0
  40. utils/Amphion/bins/tta/preprocess.py +195 -0
  41. utils/Amphion/bins/tta/train_tta.py +77 -0
  42. utils/Amphion/bins/tts/inference.py +167 -0
  43. utils/Amphion/bins/tts/preprocess.py +244 -0
  44. utils/Amphion/bins/tts/train.py +111 -0
  45. utils/Amphion/bins/vocoder/inference.py +115 -0
  46. utils/Amphion/bins/vocoder/preprocess.py +151 -0
  47. utils/Amphion/bins/vocoder/train.py +93 -0
  48. utils/Amphion/config/audioldm.json +92 -0
  49. utils/Amphion/config/autoencoderkl.json +69 -0
  50. utils/Amphion/config/base.json +185 -0
.gitattributes CHANGED
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ demo/singing/1.wav filter=lfs diff=lfs merge=lfs -text
37
+ demo/singing/2.wav filter=lfs diff=lfs merge=lfs -text
38
+ demo/speech/1.wav filter=lfs diff=lfs merge=lfs -text
39
+ demo/speech/2.wav filter=lfs diff=lfs merge=lfs -text
40
+ images/kon-new.gif filter=lfs diff=lfs merge=lfs -text
41
+ utils/Amphion/egs/tts/VALLE/prompt_examples/260_123440_000010_000004.wav filter=lfs diff=lfs merge=lfs -text
42
+ utils/Amphion/egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.wav filter=lfs diff=lfs merge=lfs -text
43
+ utils/Amphion/egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.wav filter=lfs diff=lfs merge=lfs -text
44
+ utils/Amphion/egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.wav filter=lfs diff=lfs merge=lfs -text
45
+ utils/Amphion/imgs/ns3/ns3_facodec.png filter=lfs diff=lfs merge=lfs -text
46
+ utils/Amphion/imgs/ns3/ns3_overview.png filter=lfs diff=lfs merge=lfs -text
47
+ utils/Amphion/imgs/svc/DiffComoSVC.png filter=lfs diff=lfs merge=lfs -text
48
+ utils/Amphion/imgs/svc/MultipleContentsSVC.png filter=lfs diff=lfs merge=lfs -text
49
+ utils/Amphion/imgs/svc/pipeline.png filter=lfs diff=lfs merge=lfs -text
50
+ utils/Amphion/imgs/visualization/SingVisio_demo.png filter=lfs diff=lfs merge=lfs -text
51
+ utils/Amphion/imgs/visualization/SingVisio_system.png filter=lfs diff=lfs merge=lfs -text
52
+ utils/Amphion/imgs/vocoder/diffusion/pipeline.png filter=lfs diff=lfs merge=lfs -text
53
+ utils/Amphion/imgs/vocoder/gan/MSSBCQTD.png filter=lfs diff=lfs merge=lfs -text
54
+ utils/Amphion/visualization/SingVisio/System_Introduction_of_SingVisio.pdf filter=lfs diff=lfs merge=lfs -text
55
+ utils/pretrain/nsf_hifigan/model filter=lfs diff=lfs merge=lfs -text
configs/hifigan_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "resblock": "1",
3
+ "num_gpus": 4,
4
+ "batch_size": 10,
5
+ "learning_rate": 0.0002,
6
+ "adam_b1": 0.8,
7
+ "adam_b2": 0.99,
8
+ "lr_decay": 0.999,
9
+ "seed": 1234,
10
+
11
+ "upsample_rates": [ 8, 8, 2, 2, 2],
12
+ "upsample_kernel_sizes": [16,16, 4, 4, 4],
13
+ "upsample_initial_channel": 512,
14
+ "resblock_kernel_sizes": [3,7,11],
15
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16
+ "discriminator_periods": [3, 5, 7, 11, 17, 23, 37],
17
+
18
+ "segment_size": 16384,
19
+ "num_mels": 128,
20
+ "num_freq": 1025,
21
+ "n_fft" : 2048,
22
+ "hop_size": 512,
23
+ "win_size": 2048,
24
+
25
+ "sampling_rate": 44100,
26
+
27
+ "fmin": 40,
28
+ "fmax": 16000,
29
+ "fmax_for_loss": null,
30
+
31
+ "num_workers": 16,
32
+
33
+ "dist_config": {
34
+ "dist_backend": "nccl",
35
+ "dist_url": "tcp://localhost:54321",
36
+ "world_size": 1
37
+ }
38
+ }
configs/hq_svc_infer.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 推理环境配置
2
+ device: 'cuda'
3
+
4
+ # 模型路径
5
+ model_path: utils/pretrain/250000_step_val_loss_0.50.pth
6
+
7
+ # 核心模型参数 (必须保留,用于初始化网络结构)
8
+ use_tfm: True
9
+ mode: [film_mlp, infonce, pred_f0]
10
+
11
+ # 音频处理参数
12
+ sample_rate: 44100
13
+ encoder_sr: 16000
14
+ vocoder: 'nsf-hifigan'
15
+ hop_size: 256 # 建议保留,部分模型初始化需要显式指定步长
16
+
17
+ # 扩散模型推理设置
18
+ infer_speedup: 10
19
+ infer_method: 'dpm-solver'
20
+
21
+ # 特征提取配置
22
+ f0_extractor: rmvpe
23
+ block_size: 512
24
+ f0_min: 60
25
+ f0_max: 1200
demo/singing/1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c879c6ee5921229c98ba264869a1a6e502a6de197b3506e68a9de7771992a8a
3
+ size 900764
demo/singing/2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:961a715b14292591938047d0ad8448a7a10ff6cf3437f0cec44eb1539fc84216
3
+ size 877004
demo/speech/1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88def6269f9d345d9dc374f5797ff8f7997fd6cfc87e69077246d9b25b18c8d1
3
+ size 293804
demo/speech/2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bca0e22549e1ff358002b672b5301f0275d39b9eeced7cb527ea64a001f572a3
3
+ size 733868
images/kon-new.gif ADDED

Git LFS Details

  • SHA256: 8dcd62795f2818d6bdc085b405ef61c28614ffee98359dbd7b3cfe56faf0856e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.6 MB
logger/__init__.py ADDED
File without changes
logger/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (136 Bytes). View file
 
logger/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (147 Bytes). View file
 
logger/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (134 Bytes). View file
 
logger/__pycache__/saver.cpython-38.pyc ADDED
Binary file (3.72 kB). View file
 
logger/__pycache__/saver.cpython-39.pyc ADDED
Binary file (3.73 kB). View file
 
logger/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.97 kB). View file
 
logger/__pycache__/utils.cpython-38.pyc ADDED
Binary file (3.91 kB). View file
 
logger/__pycache__/utils.cpython-39.pyc ADDED
Binary file (3.93 kB). View file
 
logger/saver.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ author: wayn391@mastertones
3
+ '''
4
+
5
+ import os
6
+ import json
7
+ import time
8
+ import yaml
9
+ import datetime
10
+ import torch
11
+ import matplotlib.pyplot as plt
12
+ from . import utils
13
+ import numpy as np
14
+ from torch.utils.tensorboard import SummaryWriter
15
+ class Saver(object):
16
+ def __init__(
17
+ self,
18
+ args,
19
+ initial_global_step=0):
20
+
21
+ # cold start
22
+ self.global_step = initial_global_step
23
+ self.init_time = time.time()
24
+ self.last_time = time.time()
25
+ self.log_dir = args.log_dir
26
+ self.sample_rate = args.sample_rate
27
+
28
+ # ckpt
29
+ os.makedirs(self.log_dir, exist_ok=True)
30
+
31
+ # writer
32
+ self.writer = SummaryWriter(self.log_dir)
33
+
34
+
35
+ def log_info(self, msg):
36
+ '''log method'''
37
+ if isinstance(msg, dict):
38
+ msg_list = []
39
+ for k, v in msg.items():
40
+ tmp_str = ''
41
+ if isinstance(v, int):
42
+ tmp_str = '{}: {:,}'.format(k, v)
43
+ else:
44
+ tmp_str = '{}: {}'.format(k, v)
45
+
46
+ msg_list.append(tmp_str)
47
+ msg_str = '\n'.join(msg_list)
48
+ else:
49
+ msg_str = msg
50
+
51
+ # display
52
+ print(msg_str)
53
+
54
+ # save
55
+ with open(self.path_log_info, 'a') as fp:
56
+ fp.write(msg_str+'\n')
57
+
58
+ def log_value(self, dict):
59
+ for k, v in dict.items():
60
+ self.writer.add_scalar(k, v, self.global_step)
61
+
62
+ def log_spec(self, name, spec, vmin=-14, vmax=3.5):
63
+ # 检查 spec 是否为 Tensor,并转换为 numpy
64
+ if isinstance(spec, torch.Tensor):
65
+ spec = spec.cpu().numpy()
66
+
67
+ # 为 spec 绘制图像
68
+ fig = plt.figure(figsize=(12, 6))
69
+ # font_path = 'SimHei' # 或者字体的绝对路径
70
+ # font_prop = FontProperties(fname=font_path, size=14)
71
+ plt.imshow(spec, aspect='auto', vmin=vmin, vmax=vmax)
72
+ plt.colorbar()
73
+ # plt.title(name, fontproperties=font_prop)
74
+ plt.gca().invert_yaxis() # 反转y轴
75
+ plt.tight_layout()
76
+
77
+ # 将图像添加到 TensorBoard
78
+ self.writer.add_figure(name, fig, self.global_step)
79
+
80
+ # 关闭图形以释放资源
81
+ plt.close(fig)
82
+
83
+ def log_audio(self, dict):
84
+ for k, v in dict.items():
85
+ self.writer.add_audio(k, v, global_step=self.global_step, sample_rate=self.sample_rate)
86
+
87
+ def get_interval_time(self, update=True):
88
+ cur_time = time.time()
89
+ time_interval = cur_time - self.last_time
90
+ if update:
91
+ self.last_time = cur_time
92
+ return time_interval
93
+
94
+ def get_total_time(self, to_str=True):
95
+ total_time = time.time() - self.init_time
96
+ if to_str:
97
+ total_time = str(datetime.timedelta(
98
+ seconds=total_time))[:-5]
99
+ return total_time
100
+
101
+ def save_model(
102
+ self,
103
+ model,
104
+ optimizer,
105
+ name='model',
106
+ postfix='',
107
+ to_json=False):
108
+ # os.makedirs(os.path.join(self.expdir), exist_ok=True)
109
+ # path
110
+ if postfix:
111
+ postfix = '_' + postfix
112
+ path_pt = os.path.join(
113
+ self.log_dir , name+postfix+'.pt')
114
+
115
+ # check
116
+ print(' [*] model checkpoint saved: {}'.format(path_pt))
117
+
118
+ # save
119
+ if optimizer is not None:
120
+ torch.save({
121
+ 'global_step': self.global_step,
122
+ 'model': model.state_dict(),
123
+ 'optimizer': optimizer.state_dict()}, path_pt)
124
+ else:
125
+ torch.save({
126
+ 'global_step': self.global_step,
127
+ 'model': model.state_dict()}, path_pt)
128
+
129
+ # to json
130
+ # if to_json:
131
+ # path_json = os.path.join(
132
+ # self.expdir , name+'.json')
133
+ # utils.to_json(path_params, path_json)
134
+
135
+ def delete_model(self, name='model', postfix=''):
136
+ # path
137
+ if postfix:
138
+ postfix = '_' + postfix
139
+ path_pt = os.path.join(
140
+ self.expdir , name+postfix+'.pt')
141
+
142
+ # delete
143
+ if os.path.exists(path_pt):
144
+ os.remove(path_pt)
145
+ print(' [*] model checkpoint deleted: {}'.format(path_pt))
146
+
147
+ def global_step_increment(self):
148
+ self.global_step += 1
149
+
150
+
logger/utils.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import json
4
+ import pickle
5
+ import torch
6
+ import shutil
7
+
8
+ def traverse_dir(
9
+ root_dir,
10
+ extensions,
11
+ amount=None,
12
+ str_include=None,
13
+ str_exclude=None,
14
+ is_pure=False,
15
+ is_sort=False,
16
+ is_ext=True):
17
+
18
+ file_list = []
19
+ cnt = 0
20
+ for root, _, files in os.walk(root_dir):
21
+ for file in files:
22
+ if any([file.endswith(f".{ext}") for ext in extensions]):
23
+ # path
24
+ mix_path = os.path.join(root, file)
25
+ pure_path = mix_path[len(root_dir)+1:] if is_pure else mix_path
26
+
27
+ # amount
28
+ if (amount is not None) and (cnt == amount):
29
+ if is_sort:
30
+ file_list.sort()
31
+ return file_list
32
+
33
+ # check string
34
+ if (str_include is not None) and (str_include not in pure_path):
35
+ continue
36
+ if (str_exclude is not None) and (str_exclude in pure_path):
37
+ continue
38
+
39
+ if not is_ext:
40
+ ext = pure_path.split('.')[-1]
41
+ pure_path = pure_path[:-(len(ext)+1)]
42
+ file_list.append(pure_path)
43
+ cnt += 1
44
+ if is_sort:
45
+ file_list.sort()
46
+ return file_list
47
+
48
+
49
+
50
+ class DotDict(dict):
51
+ def __getattr__(*args):
52
+ val = dict.get(*args)
53
+ return DotDict(val) if type(val) is dict else val
54
+
55
+ __setattr__ = dict.__setitem__
56
+ __delattr__ = dict.__delitem__
57
+
58
+
59
+ def get_network_paras_amount(model_dict):
60
+ info = dict()
61
+ for model_name, model in model_dict.items():
62
+ # all_params = sum(p.numel() for p in model.parameters())
63
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
64
+
65
+ info[model_name] = trainable_params
66
+ return info
67
+
68
+
69
+ def load_config(path_config):
70
+ with open(path_config, "r") as config:
71
+ args = yaml.safe_load(config)
72
+ args = DotDict(args)
73
+ # print(args)
74
+ return args
75
+
76
+ def save_config(path_config, output_dir):
77
+ file_name = path_config.split('/')[-1]
78
+ output_path = os.path.join(output_dir, file_name)
79
+ shutil.copy(path_config, output_path)
80
+ print(f" [*] Save config to {output_path}")
81
+
82
+ def to_json(path_params, path_json):
83
+ params = torch.load(path_params, map_location=torch.device('cpu'))
84
+ raw_state_dict = {}
85
+ for k, v in params.items():
86
+ val = v.flatten().numpy().tolist()
87
+ raw_state_dict[k] = val
88
+
89
+ with open(path_json, 'w') as outfile:
90
+ json.dump(raw_state_dict, outfile,indent= "\t")
91
+
92
+
93
+ def convert_tensor_to_numpy(tensor, is_squeeze=True):
94
+ if is_squeeze:
95
+ tensor = tensor.squeeze()
96
+ if tensor.requires_grad:
97
+ tensor = tensor.detach()
98
+ if tensor.is_cuda:
99
+ tensor = tensor.cpu()
100
+ return tensor.numpy()
101
+
102
+
103
+ def load_model(
104
+ expdir,
105
+ model,
106
+ optimizer,
107
+ name='model',
108
+ postfix='',
109
+ device='cpu'):
110
+ if postfix == '':
111
+ postfix = '_' + postfix
112
+ path = os.path.join(expdir, name+postfix)
113
+ path_pt = traverse_dir(expdir, ['pt'], is_ext=False)
114
+ global_step = 0
115
+ if len(path_pt) > 0:
116
+ steps = [s[len(path):] for s in path_pt]
117
+ maxstep = max([int(s) if s.isdigit() else 0 for s in steps])
118
+ if maxstep >= 0:
119
+ path_pt = path+str(maxstep)+'.pt'
120
+ else:
121
+ path_pt = path+'best.pt'
122
+ print(' [*] restoring model from', path_pt)
123
+ ckpt = torch.load(path_pt, map_location=torch.device(device))
124
+ global_step = ckpt['global_step']
125
+ model.load_state_dict(ckpt['model'], strict=False)
126
+ if ckpt.get('optimizer') != None:
127
+ optimizer.load_state_dict(ckpt['optimizer'])
128
+ return global_step, model, optimizer
utils/Amphion/.github/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Contributor Covenant Code of Conduct
3
+
4
+ ## Our Pledge
5
+
6
+ We as members, contributors, and leaders pledge to make participation in our
7
+ community a harassment-free experience for everyone, regardless of age, body
8
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
9
+ identity and expression, level of experience, education, socio-economic status,
10
+ nationality, personal appearance, race, caste, color, religion, or sexual
11
+ identity and orientation.
12
+
13
+ We pledge to act and interact in ways that contribute to an open, welcoming,
14
+ diverse, inclusive, and healthy community.
15
+
16
+ ## Our Standards
17
+
18
+ Examples of behavior that contributes to a positive environment for our
19
+ community include:
20
+
21
+ * Demonstrating empathy and kindness toward other people
22
+ * Being respectful of differing opinions, viewpoints, and experiences
23
+ * Giving and gracefully accepting constructive feedback
24
+ * Accepting responsibility and apologizing to those affected by our mistakes,
25
+ and learning from the experience
26
+ * Focusing on what is best not just for us as individuals, but for the overall
27
+ community
28
+
29
+ Examples of unacceptable behavior include:
30
+
31
+ * The use of sexualized language or imagery, and sexual attention or advances of
32
+ any kind
33
+ * Trolling, insulting or derogatory comments, and personal or political attacks
34
+ * Public or private harassment
35
+ * Publishing others' private information, such as a physical or email address,
36
+ without their explicit permission
37
+ * Other conduct which could reasonably be considered inappropriate in a
38
+ professional setting
39
+
40
+ ## Enforcement Responsibilities
41
+
42
+ Community leaders are responsible for clarifying and enforcing our standards of
43
+ acceptable behavior and will take appropriate and fair corrective action in
44
+ response to any behavior that they deem inappropriate, threatening, offensive,
45
+ or harmful.
46
+
47
+ Community leaders have the right and responsibility to remove, edit, or reject
48
+ comments, commits, code, wiki edits, issues, and other contributions that are
49
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
50
+ decisions when appropriate.
51
+
52
+ ## Scope
53
+
54
+ This Code of Conduct applies within all community spaces, and also applies when
55
+ an individual is officially representing the community in public spaces.
56
+ Examples of representing our community include using an official email address,
57
+ posting via an official social media account, or acting as an appointed
58
+ representative at an online or offline event.
59
+
60
+ ## Enforcement
61
+
62
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
63
+ reported to the community leaders responsible for enforcement.
64
+ All complaints will be reviewed and investigated promptly and fairly.
65
+
66
+ All community leaders are obligated to respect the privacy and security of the
67
+ reporter of any incident.
68
+
69
+ ## Enforcement Guidelines
70
+
71
+ Community leaders will follow these Community Impact Guidelines in determining
72
+ the consequences for any action they deem in violation of this Code of Conduct:
73
+
74
+ ### 1. Correction
75
+
76
+ **Community Impact**: Use of inappropriate language or other behavior deemed
77
+ unprofessional or unwelcome in the community.
78
+
79
+ **Consequence**: A private, written warning from community leaders, providing
80
+ clarity around the nature of the violation and an explanation of why the
81
+ behavior was inappropriate. A public apology may be requested.
82
+
83
+ ### 2. Warning
84
+
85
+ **Community Impact**: A violation through a single incident or series of
86
+ actions.
87
+
88
+ **Consequence**: A warning with consequences for continued behavior. No
89
+ interaction with the people involved, including unsolicited interaction with
90
+ those enforcing the Code of Conduct, for a specified period of time. This
91
+ includes avoiding interactions in community spaces as well as external channels
92
+ like social media. Violating these terms may lead to a temporary or permanent
93
+ ban.
94
+
95
+ ### 3. Temporary Ban
96
+
97
+ **Community Impact**: A serious violation of community standards, including
98
+ sustained inappropriate behavior.
99
+
100
+ **Consequence**: A temporary ban from any sort of interaction or public
101
+ communication with the community for a specified period of time. No public or
102
+ private interaction with the people involved, including unsolicited interaction
103
+ with those enforcing the Code of Conduct, is allowed during this period.
104
+ Violating these terms may lead to a permanent ban.
105
+
106
+ ### 4. Permanent Ban
107
+
108
+ **Community Impact**: Demonstrating a pattern of violation of community
109
+ standards, including sustained inappropriate behavior, harassment of an
110
+ individual, or aggression toward or disparagement of classes of individuals.
111
+
112
+ **Consequence**: A permanent ban from any sort of public interaction within the
113
+ community.
114
+
115
+ ## Attribution
116
+
117
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118
+ version 2.1, available at
119
+ [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
120
+
121
+ Community Impact Guidelines were inspired by
122
+ [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
123
+
124
+ For answers to common questions about this code of conduct, see the FAQ at
125
+ [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
126
+ [https://www.contributor-covenant.org/translations][translations].
127
+
128
+ [homepage]: https://www.contributor-covenant.org
129
+ [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
130
+ [Mozilla CoC]: https://github.com/mozilla/diversity
131
+ [FAQ]: https://www.contributor-covenant.org/faq
132
+ [translations]: https://www.contributor-covenant.org/translations
utils/Amphion/.github/CONTRIBUTING.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Welcome to the Amphion Community!
2
+
3
+ We greatly appreciate your interest in contributing to Amphion. Your involvement plays a pivotal role in our collective growth, and we are dedicated to nurturing a cooperative and inclusive space for all contributors. To ensure a respectful and productive atmosphere, all contributors must adhere to the Amphion [Code of Conduct](CODE_OF_CONDUCT.md).
4
+
5
+ ## Contributions
6
+
7
+ All kinds of contributions are welcome, including but not limited to:
8
+ - **Issue Reporting**: Report bugs or suggest features through GitHub Issues.
9
+ - **Bug Fixes**: Identify and rectify software issues to boost functionality.
10
+ - **Developing New Features**: Bring innovation and impactful enhancements to Amphion.
11
+ - **Implementing New Checkpoints**: Introduce checkpoints to optimize workflows.
12
+
13
+ - **Recipe Contributions**: Share your unique and practical coding solutions.
14
+ - **Diverse Contributions**: Your participation isn't limited! Contribute to documentation, community support, and more.
15
+
16
+ ## How to Contribute
17
+ 1. **Fork the Repository**: Start by forking the Amphion repository on GitHub.
18
+ 2. **Clone Your Fork**: Localize your fork on your development machine.
19
+ 3. **Create a Branch**: Initiate a new branch for your changes.
20
+ 4. **Test Your Changes**: Ensure compatibility and non-disruption of your updates.
21
+ 5. **Commit Your Changes**: Make small, focused commits with clear descriptions.
22
+ 6. **Update Your Fork**: Upload your modifications to your GitHub fork.
23
+ 7. **Open a Pull Request**: Suggest a pull request from your fork to the main Amphion repository with our [Pull Request Template](pull_request_template.md).
24
+ 8. **Participate in Code Reviews**: Collaborate with reviewers and address their feedback.
25
+
26
+ ## Coding Standards
27
+ - **License Headers**: Each new code file should include license headers.
28
+ - **Style Consistency**: Align with the project's existing coding style.
29
+ - **Code Quality**: Aim for clarity, maintainability, and efficiency.
30
+ - **Clear Commenting**: Describe the purpose and usage of each function and other crucial code segments.
31
+ - **Code Formatting**:
32
+ - Install 'black' formatter: `pip install black`.
33
+ - Format files: `black file.py`.
34
+ - Format directories: `black directory/`.
35
+
36
+ ## Contributor Agreement
37
+ By contributing to Amphion, you agree to abide by our Code of Conduct, and the Developer Certificate of Origin, Version 1.1:
38
+
39
+ ```
40
+ Developer Certificate of Origin
41
+ Version 1.1
42
+
43
+ Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
44
+
45
+ Everyone is permitted to copy and distribute verbatim copies of this
46
+ license document, but changing it is not allowed.
47
+
48
+
49
+ Developer's Certificate of Origin 1.1
50
+
51
+ By making a contribution to this project, I certify that:
52
+
53
+ (a) The contribution was created in whole or in part by me and I
54
+ have the right to submit it under the open source license
55
+ indicated in the file; or
56
+
57
+ (b) The contribution is based upon previous work that, to the best
58
+ of my knowledge, is covered under an appropriate open source
59
+ license and I have the right under that license to submit that
60
+ work with modifications, whether created in whole or in part
61
+ by me, under the same open source license (unless I am
62
+ permitted to submit under a different license), as indicated
63
+ in the file; or
64
+
65
+ (c) The contribution was provided directly to me by some other
66
+ person who certified (a), (b) or (c) and I have not modified
67
+ it.
68
+
69
+ (d) I understand and agree that this project and the contribution
70
+ are public and that a record of the contribution (including all
71
+ personal information I submit with it, including my sign-off) is
72
+ maintained indefinitely and may be redistributed consistent with
73
+ this project or the open source license(s) involved.
74
+ ```
75
+
76
+ ## Need Help?
77
+ For any queries or support, feel free to open an issue for community discussions and help.
utils/Amphion/.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Bug report
3
+ about: Create a report to help us improve Amphion.
4
+ title: "[BUG]: "
5
+ labels: 'bug'
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ ## Describe the bug
11
+ (A clear and concise description of what the bug is.)
12
+
13
+ ## How To Reproduce
14
+ Steps to reproduce the behavior:
15
+ 1. Config/File changes: ...
16
+ 2. Run command: ...
17
+ 3. See error: ...
18
+
19
+ ## Expected behavior
20
+ (A clear and concise description of what you expected to happen.)
21
+
22
+ ## Screenshots
23
+ (If applicable, add screenshots to help explain your problem.)
24
+
25
+ ## Environment Information
26
+ - Operating System: [e.g. Ubuntu 20.04.5 LTS]
27
+ - Python Version: [e.g. Python 3.9.15]
28
+ - Driver & CUDA Version: [e.g. Driver 470.103.01 & CUDA 11.4]
29
+ - Error Messages and Logs: [If applicable, provide any error messages or relevant log outputs]
30
+
31
+ ## Additional context
32
+ (Add any other context about the problem here.)
utils/Amphion/.github/ISSUE_TEMPLATE/docs_feedback.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Docs feedback
3
+ about: Improve documentation about Amphion.
4
+ title: "[Docs]: "
5
+ labels: 'documentation'
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ ## Documentation Reference
11
+ (Path/Link to the documentation file)
12
+
13
+ ## Feedback on documentation
14
+ (Your suggestions to the documentation. e.g., accuracy, complex explanations, structural organization, practical examples, technical reliability, and consistency)
15
+
16
+ ## Additional context
17
+ (Add any other context or screenshots about the documentation here.)
utils/Amphion/.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea for Amphion.
4
+ title: "[Feature]: "
5
+ labels: 'enhancement'
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ ## Is your feature request related to a problem? Please describe.
11
+ (A clear and concise description of what the problem is.)
12
+
13
+ ## Describe the solution you'd like
14
+ (A clear and concise description of what you want to happen.)
15
+
16
+ ## Describe alternatives you've considered
17
+ (A clear and concise description of any alternative solutions or features you've considered.)
18
+
19
+ ## Additional context
20
+ (Add any other context or screenshots about the feature request here.)
utils/Amphion/.github/ISSUE_TEMPLATE/help_wanted.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Help wanted
3
+ about: Want help from Amphion team.
4
+ title: "[Help]: "
5
+ labels: 'help wanted'
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ ## Problem Overview
11
+ (Briefly and clearly describe the issue you're facing and seeking help with.)
12
+
13
+ ## Steps Taken
14
+ (Detail your attempts to resolve the issue, including any relevant steps or processes.)
15
+ 1. Config/File changes: ...
16
+ 2. Run command: ...
17
+ 3. See errors: ...
18
+
19
+ ## Expected Outcome
20
+ (A clear and concise description of what you expected to happen.)
21
+
22
+ ## Screenshots
23
+ (If applicable, add screenshots to help explain your problem.)
24
+
25
+ ## Environment Information
26
+ - Operating System: [e.g. Ubuntu 20.04.5 LTS]
27
+ - Python Version: [e.g. Python 3.9.15]
28
+ - Driver & CUDA Version: [e.g. Driver 470.103.01 & CUDA 11.4]
29
+ - Error Messages and Logs: [If applicable, provide any error messages or relevant log outputs]
30
+
31
+ ## Additional context
32
+ (Add any other context about the problem here.)
utils/Amphion/.github/pull_request_template.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## ✨ Description
3
+
4
+ [Please describe the background, purpose, changes made, and how to test this PR]
5
+
6
+ ## 🚧 Related Issues
7
+
8
+ [List the issue numbers related to this PR]
9
+
10
+ ## 👨‍💻 Changes Proposed
11
+
12
+ - [ ] change1
13
+ - [ ] ...
14
+
15
+ ## 🧑‍🤝‍🧑 Who Can Review?
16
+
17
+ [Please use the '@' symbol to mention any community member who is free to review the PR once the tests have passed. Feel free to tag members or contributors who might be interested in your PR.]
18
+
19
+ ## 🛠 TODO
20
+
21
+ - [ ] task1
22
+ - [ ] ...
23
+
24
+ ## ✅ Checklist
25
+
26
+ - [ ] Code has been reviewed
27
+ - [ ] Code complies with the project's code standards and best practices
28
+ - [ ] Code has passed all tests
29
+ - [ ] Code does not affect the normal use of existing features
30
+ - [ ] Code has been commented properly
31
+ - [ ] Documentation has been updated (if applicable)
32
+ - [ ] Demo/checkpoint has been attached (if applicable)
utils/Amphion/.github/workflows/check_format.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Check Format
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ CheckCodeFormat:
7
+ runs-on: ubuntu-latest
8
+ steps:
9
+ - uses: actions/checkout@v3
10
+ - uses: psf/black@stable
11
+ with:
12
+ options: "--check --diff --color"
utils/Amphion/.gitignore ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Mac OS files
2
+ .DS_Store
3
+
4
+ # IDEs
5
+ .idea
6
+ .vs
7
+ .vscode
8
+ .cache
9
+
10
+ # GitHub files
11
+ .github
12
+
13
+ # Byte-compiled / optimized / DLL / cached files
14
+ __pycache__/
15
+ *.py[cod]
16
+ *$py.class
17
+ *.pyc
18
+ .temp
19
+ *.c
20
+ *.so
21
+ *.o
22
+
23
+ # Developing mode
24
+ _*.sh
25
+ _*.json
26
+ *.lst
27
+ yard*
28
+ *.out
29
+ evaluation/evalset_selection
30
+ mfa
31
+ egs/svc/*wavmark
32
+ egs/svc/custom
33
+ egs/svc/*/dev*
34
+ egs/svc/dev_exp_config.json
35
+ egs/svc/dev
36
+ bins/svc/demo*
37
+ bins/svc/preprocess_custom.py
38
+ data
39
+ ckpts
40
+
41
+ # Data and ckpt
42
+ *.pkl
43
+ *.pt
44
+ *.npy
45
+ *.npz
46
+ *.tar.gz
47
+ *.ckpt
48
+ *.wav
49
+ *.flac
50
+ pretrained/wenet/*conformer_exp
51
+ pretrained/bigvgan/args.json
52
+ !egs/tts/VALLE/prompt_examples/*.wav
53
+
54
+ # Runtime data dirs
55
+ processed_data
56
+ data
57
+ model_ckpt
58
+ logs
59
+ *.ipynb
60
+ *.lst
61
+ source_audio
62
+ result
63
+ conversion_results
64
+ get_available_gpu.py
utils/Amphion/Dockerfile ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # Other version: https://hub.docker.com/r/nvidia/cuda/tags
7
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu18.04
8
+
9
+ ARG DEBIAN_FRONTEND=noninteractive
10
+ ARG PYTORCH='2.0.0'
11
+ ARG CUDA='cu118'
12
+ ARG SHELL='/bin/bash'
13
+ ARG MINICONDA='Miniconda3-py39_23.3.1-0-Linux-x86_64.sh'
14
+
15
+ ENV LANG=en_US.UTF-8 PYTHONIOENCODING=utf-8 PYTHONDONTWRITEBYTECODE=1 CUDA_HOME=/usr/local/cuda CONDA_HOME=/opt/conda SHELL=${SHELL}
16
+ ENV PATH=$CONDA_HOME/bin:$CUDA_HOME/bin:$PATH \
17
+ LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH \
18
+ LIBRARY_PATH=$CUDA_HOME/lib64:$LIBRARY_PATH \
19
+ CONDA_PREFIX=$CONDA_HOME \
20
+ NCCL_HOME=$CUDA_HOME
21
+
22
+ # Install ubuntu packages
23
+ RUN sed -i 's/archive.ubuntu.com/mirrors.cloud.tencent.com/g' /etc/apt/sources.list \
24
+ && sed -i 's/security.ubuntu.com/mirrors.cloud.tencent.com/g' /etc/apt/sources.list \
25
+ && rm /etc/apt/sources.list.d/cuda.list \
26
+ && apt-get update \
27
+ && apt-get -y install \
28
+ python3-pip ffmpeg git less wget libsm6 libxext6 libxrender-dev \
29
+ build-essential cmake pkg-config libx11-dev libatlas-base-dev \
30
+ libgtk-3-dev libboost-python-dev vim libgl1-mesa-glx \
31
+ libaio-dev software-properties-common tmux \
32
+ espeak-ng
33
+
34
+ # Install miniconda with python 3.9
35
+ USER root
36
+ # COPY Miniconda3-py39_23.3.1-0-Linux-x86_64.sh /root/anaconda.sh
37
+ RUN wget -t 0 -c -O /tmp/anaconda.sh https://repo.anaconda.com/miniconda/${MINICONDA} \
38
+ && mv /tmp/anaconda.sh /root/anaconda.sh \
39
+ && ${SHELL} /root/anaconda.sh -b -p $CONDA_HOME \
40
+ && rm /root/anaconda.sh
41
+
42
+ RUN conda create -y --name amphion python=3.9.15
43
+
44
+ WORKDIR /app
45
+ COPY env.sh env.sh
46
+ RUN chmod +x ./env.sh
47
+
48
+ RUN ["conda", "run", "-n", "amphion", "-vvv", "--no-capture-output", "./env.sh"]
49
+
50
+ RUN conda init \
51
+ && echo "\nconda activate amphion\n" >> ~/.bashrc
52
+
53
+ CMD ["/bin/bash"]
54
+
55
+ # *** Build ***
56
+ # docker build -t realamphion/amphion .
57
+
58
+ # *** Run ***
59
+ # cd Amphion
60
+ # docker run --runtime=nvidia --gpus all -it -v .:/app -v /mnt:/mnt_host realamphion/amphion
61
+
62
+ # *** Push and release ***
63
+ # docker login
64
+ # docker push realamphion/amphion
utils/Amphion/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Amphion
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
utils/Amphion/README.md ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Amphion: An Open-Source Audio, Music, and Speech Generation Toolkit
2
+
3
+ <div>
4
+ <a href="https://arxiv.org/abs/2312.09911"><img src="https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg"></a>
5
+ <a href="https://huggingface.co/amphion"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Amphion-pink"></a>
6
+ <a href="https://openxlab.org.cn/usercenter/Amphion"><img src="https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg"></a>
7
+ <a href="egs/tts/README.md"><img src="https://img.shields.io/badge/README-TTS-blue"></a>
8
+ <a href="egs/svc/README.md"><img src="https://img.shields.io/badge/README-SVC-blue"></a>
9
+ <a href="egs/tta/README.md"><img src="https://img.shields.io/badge/README-TTA-blue"></a>
10
+ <a href="egs/vocoder/README.md"><img src="https://img.shields.io/badge/README-Vocoder-purple"></a>
11
+ <a href="egs/metrics/README.md"><img src="https://img.shields.io/badge/README-Evaluation-yellow"></a>
12
+ <a href="LICENSE"><img src="https://img.shields.io/badge/LICENSE-MIT-red"></a>
13
+ </div>
14
+ <br>
15
+
16
+ **Amphion (/æmˈfaɪən/) is a toolkit for Audio, Music, and Speech Generation.** Its purpose is to support reproducible research and help junior researchers and engineers get started in the field of audio, music, and speech generation research and development. Amphion offers a unique feature: **visualizations** of classic models or architectures. We believe that these visualizations are beneficial for junior researchers and engineers who wish to gain a better understanding of the model.
17
+
18
+ **The North-Star objective of Amphion is to offer a platform for studying the conversion of any inputs into audio.** Amphion is designed to support individual generation tasks, including but not limited to,
19
+
20
+ - **TTS**: Text to Speech (⛳ supported)
21
+ - **SVS**: Singing Voice Synthesis (👨‍💻 developing)
22
+ - **VC**: Voice Conversion (👨‍💻 developing)
23
+ - **SVC**: Singing Voice Conversion (⛳ supported)
24
+ - **TTA**: Text to Audio (⛳ supported)
25
+ - **TTM**: Text to Music (👨‍💻 developing)
26
+ - more…
27
+
28
+ In addition to the specific generation tasks, Amphion also includes several **vocoders** and **evaluation metrics**. A vocoder is an important module for producing high-quality audio signals, while evaluation metrics are critical for ensuring consistent metrics in generation tasks.
29
+
30
+ Here is the Amphion v0.1 demo, whose voice, audio effects, and singing voice are generated by our models. Just enjoy it!
31
+
32
+ [amphion-v0.1-en](https://github.com/open-mmlab/Amphion/assets/24860155/7fcdcea5-3d95-4b31-bd93-4b4da734ef9b
33
+ )
34
+
35
+ ## 🚀 News
36
+ - **2024/03/12**: Amphion now support **NaturalSpeech3 FACodec** and release pretrained checkpoints. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2403.03100) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-model-yellow)](https://huggingface.co/amphion/naturalspeech3_facodec) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-demo-pink)](https://huggingface.co/spaces/amphion/naturalspeech3_facodec) [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](models/codec/ns3_codec/README.md)
37
+ - **2024/02/22**: The first Amphion visualization tool, **SingVisio**, release. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2402.12660) [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/SingVisio) [![Video](https://img.shields.io/badge/Video-Demo-orange)](https://github.com/open-mmlab/Amphion/assets/33707885/0a6e39e8-d5f1-4288-b0f8-32da5a2d6e96) [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](egs/visualization/SingVisio/README.md)
38
+ - **2023/12/18**: Amphion v0.1 release. [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2312.09911) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Amphion-pink)](https://huggingface.co/amphion) [![youtube](https://img.shields.io/badge/YouTube-Demo-red)](https://www.youtube.com/watch?v=1aw0HhcggvQ) [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](https://github.com/open-mmlab/Amphion/pull/39)
39
+ - **2023/11/28**: Amphion alpha release. [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](https://github.com/open-mmlab/Amphion/pull/2)
40
+
41
+ ## ⭐ Key Features
42
+
43
+ ### TTS: Text to Speech
44
+
45
+ - Amphion achieves state-of-the-art performance when compared with existing open-source repositories on text-to-speech (TTS) systems. It supports the following models or architectures:
46
+ - [FastSpeech2](https://arxiv.org/abs/2006.04558): A non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks.
47
+ - [VITS](https://arxiv.org/abs/2106.06103): An end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning
48
+ - [Vall-E](https://arxiv.org/abs/2301.02111): A zero-shot TTS architecture that uses a neural codec language model with discrete codes.
49
+ - [NaturalSpeech2](https://arxiv.org/abs/2304.09116): An architecture for TTS that utilizes a latent diffusion model to generate natural-sounding voices.
50
+
51
+ ### SVC: Singing Voice Conversion
52
+
53
+ - Amphion supports multiple content-based features from various pretrained models, including [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec). Their specific roles in SVC have been investigated in our NeurIPS 2023 workshop paper. [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160) [![code](https://img.shields.io/badge/README-Code-red)](egs/svc/MultipleContentsSVC)
54
+ - Amphion implements several state-of-the-art model architectures, including diffusion-, transformer-, VAE- and flow-based models. The diffusion-based architecture uses [Bidirectional dilated CNN](https://openreview.net/pdf?id=a-xFK8Ymz5J) as a backend and supports several sampling algorithms such as [DDPM](https://arxiv.org/pdf/2006.11239.pdf), [DDIM](https://arxiv.org/pdf/2010.02502.pdf), and [PNDM](https://arxiv.org/pdf/2202.09778.pdf). Additionally, it supports single-step inference based on the [Consistency Model](https://openreview.net/pdf?id=FmqFfMTNnv).
55
+
56
+ ### TTA: Text to Audio
57
+
58
+ - Amphion supports the TTA with a latent diffusion model. It is designed like [AudioLDM](https://arxiv.org/abs/2301.12503), [Make-an-Audio](https://arxiv.org/abs/2301.12661), and [AUDIT](https://arxiv.org/abs/2304.00830). It is also the official implementation of the text-to-audio generation part of our NeurIPS 2023 paper. [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2304.00830) [![code](https://img.shields.io/badge/README-Code-red)](egs/tta/RECIPE.md)
59
+
60
+ ### Vocoder
61
+
62
+ - Amphion supports various widely-used neural vocoders, including:
63
+ - GAN-based vocoders: [MelGAN](https://arxiv.org/abs/1910.06711), [HiFi-GAN](https://arxiv.org/abs/2010.05646), [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts), [BigVGAN](https://arxiv.org/abs/2206.04658), [APNet](https://arxiv.org/abs/2305.07952).
64
+ - Flow-based vocoders: [WaveGlow](https://arxiv.org/abs/1811.00002).
65
+ - Diffusion-based vocoders: [Diffwave](https://arxiv.org/abs/2009.09761).
66
+ - Auto-regressive based vocoders: [WaveNet](https://arxiv.org/abs/1609.03499), [WaveRNN](https://arxiv.org/abs/1802.08435v1).
67
+ - Amphion provides the official implementation of [Multi-Scale Constant-Q Transform Discriminator](https://arxiv.org/abs/2311.14957) (our ICASSP 2024 paper). It can be used to enhance any architecture GAN-based vocoders during training, and keep the inference stage (such as memory or speed) unchanged. [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2311.14957) [![code](https://img.shields.io/badge/README-Code-red)](egs/vocoder/gan/tfr_enhanced_hifigan)
68
+
69
+ ### Evaluation
70
+
71
+ Amphion provides a comprehensive objective evaluation of the generated audio. The evaluation metrics contain:
72
+
73
+ - **F0 Modeling**: F0 Pearson Coefficients, F0 Periodicity Root Mean Square Error, F0 Root Mean Square Error, Voiced/Unvoiced F1 Score, etc.
74
+ - **Energy Modeling**: Energy Root Mean Square Error, Energy Pearson Coefficients, etc.
75
+ - **Intelligibility**: Character/Word Error Rate, which can be calculated based on [Whisper](https://github.com/openai/whisper) and more.
76
+ - **Spectrogram Distortion**: Frechet Audio Distance (FAD), Mel Cepstral Distortion (MCD), Multi-Resolution STFT Distance (MSTFT), Perceptual Evaluation of Speech Quality (PESQ), Short Time Objective Intelligibility (STOI), etc.
77
+ - **Speaker Similarity**: Cosine similarity, which can be calculated based on [RawNet3](https://github.com/Jungjee/RawNet), [Resemblyzer](https://github.com/resemble-ai/Resemblyzer), [WeSpeaker](https://github.com/wenet-e2e/wespeaker), [WavLM](https://github.com/microsoft/unilm/tree/master/wavlm) and more.
78
+
79
+ ### Datasets
80
+
81
+ Amphion unifies the data preprocess of the open-source datasets including [AudioCaps](https://audiocaps.github.io/), [LibriTTS](https://www.openslr.org/60/), [LJSpeech](https://keithito.com/LJ-Speech-Dataset/), [M4Singer](https://github.com/M4Singer/M4Singer), [Opencpop](https://wenet.org.cn/opencpop/), [OpenSinger](https://github.com/Multi-Singer/Multi-Singer.github.io), [SVCC](http://vc-challenge.org/), [VCTK](https://datashare.ed.ac.uk/handle/10283/3443), and more. The supported dataset list can be seen [here](egs/datasets/README.md) (updating).
82
+
83
+ ### Visualization
84
+
85
+ Amphion provides visualization tools to interactively illustrate the internal processing mechanism of classic models. This provides an invaluable resource for educational purposes and for facilitating understandable research.
86
+
87
+ Currently, Amphion supports [SingVisio](egs/visualization/SingVisio/README.md), a visualization tool of the diffusion model for singing voice conversion. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2402.12660) [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/SingVisio) [![Video](https://img.shields.io/badge/Video-Demo-orange)](https://github.com/open-mmlab/Amphion/assets/33707885/0a6e39e8-d5f1-4288-b0f8-32da5a2d6e96)
88
+
89
+
90
+ ## 📀 Installation
91
+
92
+ Amphion can be installed through either Setup Installer or Docker Image.
93
+
94
+ ### Setup Installer
95
+
96
+ ```bash
97
+ git clone https://github.com/open-mmlab/Amphion.git
98
+ cd Amphion
99
+
100
+ # Install Python Environment
101
+ conda create --name amphion python=3.9.15
102
+ conda activate amphion
103
+
104
+ # Install Python Packages Dependencies
105
+ sh env.sh
106
+ ```
107
+
108
+ ### Docker Image
109
+
110
+ 1. Install [Docker](https://docs.docker.com/get-docker/), [NVIDIA Driver](https://www.nvidia.com/download/index.aspx), [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html), and [CUDA](https://developer.nvidia.com/cuda-downloads).
111
+
112
+ 2. Run the following commands:
113
+ ```bash
114
+ git clone https://github.com/open-mmlab/Amphion.git
115
+ cd Amphion
116
+
117
+ docker pull realamphion/amphion
118
+ docker run --runtime=nvidia --gpus all -it -v .:/app realamphion/amphion
119
+ ```
120
+ Mount dataset by argument `-v` is necessary when using Docker. Please refer to [Mount dataset in Docker container](egs/datasets/docker.md) and [Docker Docs](https://docs.docker.com/engine/reference/commandline/container_run/#volume) for more details.
121
+
122
+
123
+ ## 🐍 Usage in Python
124
+
125
+ We detail the instructions of different tasks in the following recipes:
126
+
127
+ - [Text to Speech (TTS)](egs/tts/README.md)
128
+ - [Singing Voice Conversion (SVC)](egs/svc/README.md)
129
+ - [Text to Audio (TTA)](egs/tta/README.md)
130
+ - [Vocoder](egs/vocoder/README.md)
131
+ - [Evaluation](egs/metrics/README.md)
132
+ - [Visualization](egs/visualization/README.md)
133
+
134
+ ## 👨‍💻 Contributing
135
+ We appreciate all contributions to improve Amphion. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guideline.
136
+
137
+ ## 🙏 Acknowledgement
138
+
139
+
140
+ - [ming024's FastSpeech2](https://github.com/ming024/FastSpeech2) and [jaywalnut310's VITS](https://github.com/jaywalnut310/vits) for model architecture code.
141
+ - [lifeiteng's VALL-E](https://github.com/lifeiteng/vall-e) for training pipeline and model architecture design.
142
+ - [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), [ContentVec](https://github.com/auspicious3000/contentvec), and [RawNet3](https://github.com/Jungjee/RawNet) for pretrained models and inference code.
143
+ - [HiFi-GAN](https://github.com/jik876/hifi-gan) for GAN-based Vocoder's architecture design and training strategy.
144
+ - [Encodec](https://github.com/facebookresearch/encodec) for well-organized GAN Discriminator's architecture and basic blocks.
145
+ - [Latent Diffusion](https://github.com/CompVis/latent-diffusion) for model architecture design.
146
+ - [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS) for preparing the MFA tools.
147
+
148
+
149
+ ## ©️ License
150
+
151
+ Amphion is under the [MIT License](LICENSE). It is free for both research and commercial use cases.
152
+
153
+ ## 📚 Citations
154
+
155
+ ```bibtex
156
+ @article{zhang2023amphion,
157
+ title={Amphion: An Open-Source Audio, Music and Speech Generation Toolkit},
158
+ author={Xueyao Zhang and Liumeng Xue and Yicheng Gu and Yuancheng Wang and Haorui He and Chaoren Wang and Xi Chen and Zihao Fang and Haopeng Chen and Junan Zhang and Tze Ying Tang and Lexiao Zou and Mingxuan Wang and Jun Han and Kai Chen and Haizhou Li and Zhizheng Wu},
159
+ journal={arXiv},
160
+ year={2024},
161
+ volume={abs/2312.09911}
162
+ }
163
+ ```
utils/Amphion/__init__.py ADDED
File without changes
utils/Amphion/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (154 Bytes). View file
 
utils/Amphion/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (150 Bytes). View file
 
utils/Amphion/bins/calc_metrics.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import sys
8
+ import numpy as np
9
+ import json
10
+ import argparse
11
+ import whisper
12
+ import torch
13
+
14
+ from glob import glob
15
+ from tqdm import tqdm
16
+ from collections import defaultdict
17
+
18
+
19
+ from evaluation.metrics.energy.energy_rmse import extract_energy_rmse
20
+ from evaluation.metrics.energy.energy_pearson_coefficients import (
21
+ extract_energy_pearson_coeffcients,
22
+ )
23
+ from evaluation.metrics.f0.f0_pearson_coefficients import extract_fpc
24
+ from evaluation.metrics.f0.f0_periodicity_rmse import extract_f0_periodicity_rmse
25
+ from evaluation.metrics.f0.f0_rmse import extract_f0rmse
26
+ from evaluation.metrics.f0.v_uv_f1 import extract_f1_v_uv
27
+ from evaluation.metrics.intelligibility.character_error_rate import extract_cer
28
+ from evaluation.metrics.intelligibility.word_error_rate import extract_wer
29
+ from evaluation.metrics.similarity.speaker_similarity import extract_similarity
30
+ from evaluation.metrics.spectrogram.frechet_distance import extract_fad
31
+ from evaluation.metrics.spectrogram.mel_cepstral_distortion import extract_mcd
32
+ from evaluation.metrics.spectrogram.multi_resolution_stft_distance import extract_mstft
33
+ from evaluation.metrics.spectrogram.pesq import extract_pesq
34
+ from evaluation.metrics.spectrogram.scale_invariant_signal_to_distortion_ratio import (
35
+ extract_si_sdr,
36
+ )
37
+ from evaluation.metrics.spectrogram.scale_invariant_signal_to_noise_ratio import (
38
+ extract_si_snr,
39
+ )
40
+ from evaluation.metrics.spectrogram.short_time_objective_intelligibility import (
41
+ extract_stoi,
42
+ )
43
+
44
+ METRIC_FUNC = {
45
+ "energy_rmse": extract_energy_rmse,
46
+ "energy_pc": extract_energy_pearson_coeffcients,
47
+ "fpc": extract_fpc,
48
+ "f0_periodicity_rmse": extract_f0_periodicity_rmse,
49
+ "f0rmse": extract_f0rmse,
50
+ "v_uv_f1": extract_f1_v_uv,
51
+ "cer": extract_cer,
52
+ "wer": extract_wer,
53
+ "similarity": extract_similarity,
54
+ "fad": extract_fad,
55
+ "mcd": extract_mcd,
56
+ "mstft": extract_mstft,
57
+ "pesq": extract_pesq,
58
+ "si_sdr": extract_si_sdr,
59
+ "si_snr": extract_si_snr,
60
+ "stoi": extract_stoi,
61
+ }
62
+
63
+
64
+ def calc_metric(
65
+ ref_dir,
66
+ deg_dir,
67
+ dump_dir,
68
+ metrics,
69
+ **kwargs,
70
+ ):
71
+ result = defaultdict()
72
+
73
+ for metric in tqdm(metrics):
74
+ if metric in ["fad", "similarity"]:
75
+ result[metric] = str(METRIC_FUNC[metric](ref_dir, deg_dir, kwargs=kwargs))
76
+ continue
77
+
78
+ audios_ref = []
79
+ audios_deg = []
80
+
81
+ files = glob(deg_dir + "/*.wav")
82
+
83
+ for file in files:
84
+ audios_deg.append(file)
85
+ uid = file.split("/")[-1].split(".wav")[0]
86
+ file_gt = ref_dir + "/{}.wav".format(uid)
87
+ audios_ref.append(file_gt)
88
+
89
+ if metric in ["wer", "cer"] and kwargs["intelligibility_mode"] == "gt_content":
90
+ ltr_path = kwargs["ltr_path"]
91
+ tmpltrs = {}
92
+ with open(ltr_path, "r") as f:
93
+ for line in f:
94
+ paras = line.replace("\n", "").split("|")
95
+ paras[1] = paras[1].replace(" ", "")
96
+ paras[1] = paras[1].replace(".", "")
97
+ paras[1] = paras[1].replace("'", "")
98
+ paras[1] = paras[1].replace("-", "")
99
+ paras[1] = paras[1].replace(",", "")
100
+ paras[1] = paras[1].replace("!", "")
101
+ paras[1] = paras[1].lower()
102
+ tmpltrs[paras[0]] = paras[1]
103
+ ltrs = []
104
+ files = glob(ref_dir + "/*.wav")
105
+ for file in files:
106
+ ltrs.append(tmpltrs[os.path.basename(file)])
107
+
108
+ if metric in ["v_uv_f1"]:
109
+ tp_total = 0
110
+ fp_total = 0
111
+ fn_total = 0
112
+
113
+ for i in tqdm(range(len(audios_ref))):
114
+ audio_ref = audios_ref[i]
115
+ audio_deg = audios_deg[i]
116
+ tp, fp, fn = METRIC_FUNC[metric](audio_ref, audio_deg, kwargs=kwargs)
117
+ tp_total += tp
118
+ fp_total += fp
119
+ fn_total += fn
120
+
121
+ result[metric] = str(tp_total / (tp_total + (fp_total + fn_total) / 2))
122
+ else:
123
+ scores = []
124
+ for i in tqdm(range(len(audios_ref))):
125
+ audio_ref = audios_ref[i]
126
+ audio_deg = audios_deg[i]
127
+
128
+ if metric in ["wer", "cer"]:
129
+ model = whisper.load_model("large")
130
+ mode = kwargs["intelligibility_mode"]
131
+ if torch.cuda.is_available():
132
+ device = torch.device("cuda")
133
+ model = model.to(device)
134
+
135
+ if mode == "gt_audio":
136
+ kwargs["audio_ref"] = audio_ref
137
+ kwargs["audio_deg"] = audio_deg
138
+ score = METRIC_FUNC[metric](
139
+ model,
140
+ kwargs=kwargs,
141
+ )
142
+ elif mode == "gt_content":
143
+ kwargs["content_gt"] = ltrs[i]
144
+ kwargs["audio_deg"] = audio_deg
145
+ score = METRIC_FUNC[metric](
146
+ model,
147
+ kwargs=kwargs,
148
+ )
149
+ else:
150
+ score = METRIC_FUNC[metric](
151
+ audio_ref,
152
+ audio_deg,
153
+ kwargs=kwargs,
154
+ )
155
+ if not np.isnan(score):
156
+ scores.append(score)
157
+
158
+ scores = np.array(scores)
159
+ result["{}".format(metric)] = str(np.mean(scores))
160
+
161
+ data = json.dumps(result, indent=4)
162
+
163
+ with open(os.path.join(dump_dir, "result.json"), "w", newline="\n") as f:
164
+ f.write(data)
165
+
166
+
167
+ if __name__ == "__main__":
168
+ parser = argparse.ArgumentParser()
169
+ parser.add_argument(
170
+ "--ref_dir",
171
+ type=str,
172
+ help="Path to the reference audio folder.",
173
+ )
174
+ parser.add_argument(
175
+ "--deg_dir",
176
+ type=str,
177
+ help="Path to the test audio folder.",
178
+ )
179
+ parser.add_argument(
180
+ "--dump_dir",
181
+ type=str,
182
+ help="Path to dump the results.",
183
+ )
184
+ parser.add_argument(
185
+ "--metrics",
186
+ nargs="+",
187
+ help="Metrics used to evaluate.",
188
+ )
189
+ parser.add_argument(
190
+ "--fs",
191
+ type=str,
192
+ default="None",
193
+ help="(Optional) Sampling rate",
194
+ )
195
+ parser.add_argument(
196
+ "--align_method",
197
+ type=str,
198
+ default="dtw",
199
+ help="(Optional) Method for aligning feature length. ['cut', 'dtw']",
200
+ )
201
+
202
+ parser.add_argument(
203
+ "--db_scale",
204
+ type=str,
205
+ default="True",
206
+ help="(Optional) Wether or not computing energy related metrics in db scale.",
207
+ )
208
+ parser.add_argument(
209
+ "--f0_subtract_mean",
210
+ type=str,
211
+ default="True",
212
+ help="(Optional) Wether or not computing f0 related metrics with mean value subtracted.",
213
+ )
214
+
215
+ parser.add_argument(
216
+ "--similarity_model",
217
+ type=str,
218
+ default="wavlm",
219
+ help="(Optional)The model for computing speaker similarity. ['rawnet', 'wavlm', 'resemblyzer']",
220
+ )
221
+ parser.add_argument(
222
+ "--similarity_mode",
223
+ type=str,
224
+ default="pairwith",
225
+ help="(Optional)The method of calculating similarity, where set to overall means computing \
226
+ the speaker similarity between two folder of audios content freely, and set to pairwith means \
227
+ computing the speaker similarity between a seires of paired gt/pred audios",
228
+ )
229
+
230
+ parser.add_argument(
231
+ "--ltr_path",
232
+ type=str,
233
+ default="None",
234
+ help="(Optional)Path to the transcription file,Note that the format in the transcription \
235
+ file is 'file name|transcription'",
236
+ )
237
+ parser.add_argument(
238
+ "--intelligibility_mode",
239
+ type=str,
240
+ default="gt_audio",
241
+ help="(Optional)The method of calculating WER and CER, where set to gt_audio means selecting \
242
+ the recognition content of the reference audio as the target, and set to gt_content means \
243
+ using transcription as the target",
244
+ )
245
+ parser.add_argument(
246
+ "--language",
247
+ type=str,
248
+ default="english",
249
+ help="(Optional)['english','chinese']",
250
+ )
251
+
252
+ args = parser.parse_args()
253
+
254
+ calc_metric(
255
+ args.ref_dir,
256
+ args.deg_dir,
257
+ args.dump_dir,
258
+ args.metrics,
259
+ fs=int(args.fs) if args.fs != "None" else None,
260
+ method=args.align_method,
261
+ db_scale=True if args.db_scale == "True" else False,
262
+ need_mean=True if args.f0_subtract_mean == "True" else False,
263
+ model_name=args.similarity_model,
264
+ similarity_mode=args.similarity_mode,
265
+ ltr_path=args.ltr_path,
266
+ intelligibility_mode=args.intelligibility_mode,
267
+ language=args.language,
268
+ )
utils/Amphion/bins/svc/inference.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+ import glob
9
+ from tqdm import tqdm
10
+ import json
11
+ import torch
12
+ import time
13
+
14
+ from models.svc.diffusion.diffusion_inference import DiffusionInference
15
+ from models.svc.comosvc.comosvc_inference import ComoSVCInference
16
+ from models.svc.transformer.transformer_inference import TransformerInference
17
+ from models.svc.vits.vits_inference import VitsInference
18
+ from utils.util import load_config
19
+ from utils.audio_slicer import split_audio, merge_segments_encodec
20
+ from processors import acoustic_extractor, content_extractor
21
+
22
+
23
+ def build_inference(args, cfg, infer_type="from_dataset"):
24
+ supported_inference = {
25
+ "DiffWaveNetSVC": DiffusionInference,
26
+ "DiffComoSVC": ComoSVCInference,
27
+ "TransformerSVC": TransformerInference,
28
+ "VitsSVC": VitsInference,
29
+ }
30
+
31
+ inference_class = supported_inference[cfg.model_type]
32
+ return inference_class(args, cfg, infer_type)
33
+
34
+
35
+ def prepare_for_audio_file(args, cfg, num_workers=1):
36
+ preprocess_path = cfg.preprocess.processed_dir
37
+ audio_name = cfg.inference.source_audio_name
38
+ temp_audio_dir = os.path.join(preprocess_path, audio_name)
39
+
40
+ ### eval file
41
+ t = time.time()
42
+ eval_file = prepare_source_eval_file(cfg, temp_audio_dir, audio_name)
43
+ args.source = eval_file
44
+ with open(eval_file, "r") as f:
45
+ metadata = json.load(f)
46
+ print("Prepare for meta eval data: {:.1f}s".format(time.time() - t))
47
+
48
+ ### acoustic features
49
+ t = time.time()
50
+ acoustic_extractor.extract_utt_acoustic_features_serial(
51
+ metadata, temp_audio_dir, cfg
52
+ )
53
+ if cfg.preprocess.use_min_max_norm_mel == True:
54
+ acoustic_extractor.cal_mel_min_max(
55
+ dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
56
+ )
57
+ acoustic_extractor.cal_pitch_statistics_svc(
58
+ dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
59
+ )
60
+ print("Prepare for acoustic features: {:.1f}s".format(time.time() - t))
61
+
62
+ ### content features
63
+ t = time.time()
64
+ content_extractor.extract_utt_content_features_dataloader(
65
+ cfg, metadata, num_workers
66
+ )
67
+ print("Prepare for content features: {:.1f}s".format(time.time() - t))
68
+ return args, cfg, temp_audio_dir
69
+
70
+
71
def merge_for_audio_segments(audio_files, args, cfg):
    """Stitch converted segments back into one wav and delete the parts.

    Args:
        audio_files (list): paths of the per-segment wav files, in order.
        args: parsed CLI arguments (``output_dir``, ``target_singer``).
        cfg: configuration object (sample rate, overlap duration, source name).
    """
    audio_name = cfg.inference.source_audio_name
    target_singer_name = args.target_singer

    out_wav = os.path.join(
        args.output_dir, "{}_{}.wav".format(audio_name, target_singer_name)
    )
    merge_segments_encodec(
        wav_files=audio_files,
        fs=cfg.preprocess.sample_rate,
        output_path=out_wav,
        overlap_duration=cfg.inference.segments_overlap_duration,
    )

    # The per-segment files are intermediate artifacts; clean them up.
    for tmp_file in audio_files:
        os.remove(tmp_file)
86
+
87
+
88
def prepare_source_eval_file(cfg, temp_audio_dir, audio_name):
    """Split the source audio into overlapping chunks and dump ``eval.json``.

    Args:
        cfg: configuration object (source path, sample rate, segment limits).
        temp_audio_dir (str): cache directory for this audio.
        audio_name (str): pseudo dataset/singer name for the chunks.

    Returns:
        str: path of the written metadata (eval) file.
    """
    chunks = split_audio(
        wav_file=cfg.inference.source_audio_path,
        target_sr=cfg.preprocess.sample_rate,
        output_dir=os.path.join(temp_audio_dir, "wavs"),
        max_duration_of_segment=cfg.inference.segments_max_duration,
        overlap_duration=cfg.inference.segments_overlap_duration,
    )

    # Tag every chunk with the pseudo dataset/singer name and a running index.
    metadata = []
    for idx, chunk in enumerate(chunks):
        chunk["index"] = idx
        chunk["Dataset"] = audio_name
        chunk["Singer"] = audio_name
        chunk["Uid"] = "{}_{}".format(audio_name, chunk["Uid"])
        metadata.append(chunk)

    eval_file = os.path.join(temp_audio_dir, "eval.json")
    with open(eval_file, "w") as f:
        json.dump(metadata, f, indent=4, ensure_ascii=False, sort_keys=True)

    return eval_file
114
+
115
+
116
def cuda_relevant(deterministic=False):
    """Set global CUDA/cuDNN behavior flags for this process.

    Args:
        deterministic (bool): if True, force reproducible kernel selection at
            the cost of disabling cuDNN autotuning.
    """
    torch.cuda.empty_cache()
    # Enable TF32 math on Ampere-and-newer GPUs.
    for backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
        backend.allow_tf32 = True
    torch.backends.cudnn.enabled = True
    # Benchmark autotuning and deterministic kernels are mutually exclusive.
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
126
+
127
+
128
def infer(args, cfg, infer_type):
    """Build the inference pipeline and run it, reporting wall-clock times.

    Args:
        args: parsed CLI arguments.
        cfg: configuration object.
        infer_type (str): "from_dataset" or "from_file".

    Returns:
        The list of output audio files produced by the model's inference.
    """
    start = time.time()
    trainer = build_inference(args, cfg, infer_type)
    print("Model Init: {:.1f}s".format(time.time() - start))

    start = time.time()
    output_audio_files = trainer.inference()
    print("Model inference: {:.1f}s".format(time.time() - start))
    return output_audio_files
139
+
140
+
141
def build_parser():
    r"""Build argument parser for inference.py.
    Anything else should be put in an extra config YAML file.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="JSON/YAML file for configurations.",
    )
    parser.add_argument(
        "--acoustics_dir",
        type=str,
        help="Acoustics model checkpoint directory. If a directory is given, "
        "search for the latest checkpoint dir in the directory. If a specific "
        "checkpoint dir is given, directly load the checkpoint.",
    )
    parser.add_argument(
        "--vocoder_dir",
        type=str,
        required=True,
        help="Vocoder checkpoint directory. Searching behavior is the same as "
        "the acoustics one.",
    )
    parser.add_argument(
        "--target_singer",
        type=str,
        required=True,
        help="convert to a specific singer (e.g. --target_singers singer_id).",
    )
    # NOTE(review): no ``type=`` is given, so CLI-provided values arrive as
    # strings while the default is the int 0 — downstream must accept both.
    parser.add_argument(
        "--trans_key",
        default=0,
        help="0: no pitch shift; autoshift: pitch shift; int: key shift.",
    )
    parser.add_argument(
        "--source",
        type=str,
        default="source_audio",
        help="Source audio file or directory. If a JSON file is given, "
        "inference from dataset is applied. If a directory is given, "
        "inference from all wav/flac/mp3 audio files in the directory is applied. "
        "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="conversion_results",
        help="Output directory. Default: ./conversion_results",
    )
    parser.add_argument(
        "--log_level",
        type=str,
        default="warning",
        help="Logging level. Default: warning",
    )
    # NOTE(review): ``action="store_true"`` combined with ``default=True``
    # makes this flag a no-op — it can never become False from the CLI, so
    # caches are always kept. Confirm intent before changing the default.
    parser.add_argument(
        "--keep_cache",
        action="store_true",
        default=True,
        help="Keep cache files. Only applicable to inference from files.",
    )
    parser.add_argument(
        "--diffusion_inference_steps",
        type=int,
        default=1000,
        help="Number of inference steps. Only applicable to diffusion inference.",
    )
    return parser
212
+
213
+
214
def main():
    """Entry point: run SVC on raw audio files or on a prepared dataset.

    If ``--source`` is a directory, every wav/flac/mp3 file in it is split,
    converted, and merged back; otherwise inference runs on the dataset
    described by the given eval file.
    """
    ### Parse arguments and config
    args = build_parser().parse_args()
    cfg = load_config(args.config)

    # CUDA settings
    cuda_relevant()

    if os.path.isdir(args.source):
        ### Infer from file
        import shutil  # local import: only needed for cache cleanup

        # Get all the source audio files (.wav, .flac, .mp3)
        source_audio_dir = args.source
        audio_list = []
        for suffix in ["wav", "flac", "mp3"]:
            audio_list += glob.glob(
                os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True
            )
        print("There are {} source audios: ".format(len(audio_list)))

        # Infer for every file as dataset
        output_root_path = args.output_dir
        for audio_path in tqdm(audio_list):
            # Use os.path so this also works with Windows path separators
            # (the original split("/") broke on backslash paths).
            audio_name = os.path.splitext(os.path.basename(audio_path))[0]
            args.output_dir = os.path.join(output_root_path, audio_name)
            print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name))

            cfg.inference.source_audio_path = audio_path
            cfg.inference.source_audio_name = audio_name
            cfg.inference.segments_max_duration = 10.0
            cfg.inference.segments_overlap_duration = 1.0

            # Prepare metadata and features
            args, cfg, cache_dir = prepare_for_audio_file(args, cfg)

            # Infer from file
            output_audio_files = infer(args, cfg, infer_type="from_file")

            # Merge the split segments
            merge_for_audio_segments(output_audio_files, args, cfg)

            # Keep or remove caches
            if not args.keep_cache:
                # os.removedirs() only deletes empty directories and raised
                # OSError on the populated cache dir; rmtree removes it fully.
                shutil.rmtree(cache_dir)

    else:
        ### Infer from dataset
        infer(args, cfg, infer_type="from_dataset")
262
+
263
+
264
+ if __name__ == "__main__":
265
+ main()
utils/Amphion/bins/svc/preprocess.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ from multiprocessing import cpu_count
14
+
15
+
16
+ from utils.util import load_config
17
+ from preprocessors.processor import preprocess_dataset
18
+ from preprocessors.metadata import cal_metadata
19
+ from processors import acoustic_extractor, content_extractor, data_augment
20
+
21
+
22
def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
    """Extract acoustic features for every utterance of ``dataset``.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files
        cfg (dict): dictionary that stores configurations
        n_workers (int, optional): reserved for parallel extraction. Defaults to 1.
    """
    # Evaluation-only datasets carry just a test split.
    splits = ["test"] if "eval" in dataset else ["train", "test"]
    dataset_output = os.path.join(output_path, dataset)

    metadata = []
    for split in splits:
        with open(os.path.join(dataset_output, "{}.json".format(split)), "r") as f:
            metadata.extend(json.load(f))

    # Parallel extraction exists but is disabled upstream:
    # acoustic_extractor.extract_utt_acoustic_features_parallel(
    #     metadata, dataset_output, cfg, n_workers=n_workers
    # )
    acoustic_extractor.extract_utt_acoustic_features_serial(
        metadata, dataset_output, cfg
    )
46
+
47
+
48
def extract_content_features(dataset, output_path, cfg, num_workers=1):
    """Extract content features of utterances in the dataset.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files
        cfg (dict): dictionary that stores configurations
        num_workers (int): workers for the extraction dataloader
    """
    splits = ["test"] if "eval" in dataset else ["train", "test"]
    dataset_output = os.path.join(output_path, dataset)

    metadata = []
    for split in splits:
        with open(os.path.join(dataset_output, "{}.json".format(split)), "r") as f:
            metadata.extend(json.load(f))

    content_extractor.extract_utt_content_features_dataloader(
        cfg, metadata, num_workers
    )
67
+
68
+
69
def preprocess(cfg, args):
    """Preprocess raw data of single or multiple datasets (in cfg.dataset).

    Pipeline: split train/test, optionally augment, dump metadata, extract
    acoustic features (or link them from the source dataset for augmented
    copies), then extract content features.

    Args:
        cfg (dict): dictionary that stores configurations
        args (ArgumentParser): specify the configuration file and num_workers
    """
    # Specify the output root path to save the processed data
    output_path = cfg.preprocess.processed_dir
    os.makedirs(output_path, exist_ok=True)

    ## Split train and test sets
    for dataset in cfg.dataset:
        print("Preprocess {}...".format(dataset))
        preprocess_dataset(
            dataset,
            cfg.dataset_path[dataset],
            output_path,
            cfg.preprocess,
            cfg.task_type,
            is_custom_dataset=dataset in cfg.use_custom_dataset,
        )

    # Data augmentation: create new wav files with pitch shift, formant shift,
    # equalizer, time stretch
    try:
        assert isinstance(
            cfg.preprocess.data_augment, list
        ), "Please provide a list of datasets need to be augmented."
        if len(cfg.preprocess.data_augment) > 0:
            new_datasets_list = []
            for dataset in cfg.preprocess.data_augment:
                new_datasets = data_augment.augment_dataset(cfg, dataset)
                new_datasets_list.extend(new_datasets)
            cfg.dataset.extend(new_datasets_list)
            print("Augmentation datasets: ", cfg.dataset)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate; a missing ``data_augment`` config entry simply
        # means no augmentation is requested.
        print("No Data Augmentation.")

    # Dump metadata of datasets (singers, train/test durations, etc.)
    cal_metadata(cfg)

    ## Prepare the acoustic features
    for dataset in cfg.dataset:
        # Skip augmented datasets which do not need to extract acoustic
        # features; they are copied from the original dataset below.
        # (Fixed a chained-comparison typo: the condition previously read
        # ``"equalizer" in dataset in dataset``.)
        if (
            "pitch_shift" in dataset
            or "formant_shift" in dataset
            or "equalizer" in dataset
        ):
            continue
        print(
            "Extracting acoustic features for {} using {} workers ...".format(
                dataset, args.num_workers
            )
        )
        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
        # Calculate the statistics of acoustic features
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics_svc(dataset, output_path, cfg)

    # Copy acoustic features for augmented datasets by creating soft-links
    for dataset in cfg.dataset:
        if "pitch_shift" in dataset:
            src_dataset = dataset.replace("_pitch_shift", "")
        elif "formant_shift" in dataset:
            src_dataset = dataset.replace("_formant_shift", "")
        elif "equalizer" in dataset:
            src_dataset = dataset.replace("_equalizer", "")
        else:
            continue
        src_dataset_dir = os.path.join(output_path, src_dataset)
        dataset_dir = os.path.join(output_path, dataset)
        metadata = []
        for split in ["test"] if "eval" in dataset else ["train", "test"]:
            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
            with open(metadata_file_path, "r") as f:
                metadata.extend(json.load(f))
        print("Copying acoustic features for {}...".format(dataset))
        acoustic_extractor.copy_acoustic_features(
            metadata, dataset_dir, src_dataset_dir, cfg
        )
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)

    # Prepare the content features
    for dataset in cfg.dataset:
        print("Extracting content features for {}...".format(dataset))
        extract_content_features(dataset, output_path, cfg, args.num_workers)
166
+
167
+
168
def main():
    """CLI wrapper: parse arguments, load the config, and run preprocessing."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", default="config.json", help="json files for configurations."
    )
    parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
    # NOTE(review): argparse ``type=bool`` treats any non-empty string as
    # True (bool("False") is True) — confirm callers only rely on the default.
    parser.add_argument("--prepare_alignment", type=bool, default=False)

    args = parser.parse_args()
    preprocess(load_config(args.config), args)
180
+
181
+
182
+ if __name__ == "__main__":
183
+ main()
utils/Amphion/bins/svc/train.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+
8
+ import torch
9
+
10
+ from models.svc.diffusion.diffusion_trainer import DiffusionTrainer
11
+ from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer
12
+ from models.svc.transformer.transformer_trainer import TransformerTrainer
13
+ from models.svc.vits.vits_trainer import VitsSVCTrainer
14
+ from utils.util import load_config
15
+
16
+
17
def build_trainer(args, cfg):
    """Instantiate the SVC trainer class selected by ``cfg.model_type``.

    Raises:
        KeyError: if ``cfg.model_type`` is not a supported SVC model.
    """
    trainer_classes = {
        "DiffWaveNetSVC": DiffusionTrainer,
        "DiffComoSVC": ComoSVCTrainer,
        "TransformerSVC": TransformerTrainer,
        "VitsSVC": VitsSVCTrainer,
    }
    return trainer_classes[cfg.model_type](args, cfg)
28
+
29
+
30
def cuda_relevant(deterministic=False):
    """Set global CUDA/cuDNN behavior flags for this process.

    Args:
        deterministic (bool): if True, force reproducible kernel selection at
            the cost of disabling cuDNN autotuning.
    """
    torch.cuda.empty_cache()
    # Enable TF32 math on Ampere-and-newer GPUs.
    for backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
        backend.allow_tf32 = True
    torch.backends.cudnn.enabled = True
    # Benchmark autotuning and deterministic kernels are mutually exclusive.
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
40
+
41
+
42
def main():
    """Parse CLI args, expand augmented dataset names, and launch SVC training."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        default="config.json",
        help="json files for configurations.",
        required=True,
    )
    parser.add_argument(
        "--exp_name",
        type=str,
        default="exp_name",
        help="A specific name to note the experiment",
        required=True,
    )
    parser.add_argument(
        "--resume",
        action="store_true",
        help="If specified, to resume from the existing checkpoint.",
    )
    parser.add_argument(
        "--resume_from_ckpt_path",
        type=str,
        default="",
        help="The specific checkpoint path that you want to resume from.",
    )
    parser.add_argument(
        "--resume_type",
        type=str,
        default="",
        help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights",
    )

    parser.add_argument(
        "--log_level", default="warning", help="logging level (debug, info, warning)"
    )
    args = parser.parse_args()
    cfg = load_config(args.config)

    # Data Augmentation
    # For each source dataset listed in ``cfg.preprocess.data_augment``,
    # register the derived dataset names (e.g. "<name>_pitch_shift") that the
    # preprocessing stage produced, honoring the per-augmentation switches.
    if (
        type(cfg.preprocess.data_augment) == list
        and len(cfg.preprocess.data_augment) > 0
    ):
        new_datasets_list = []
        for dataset in cfg.preprocess.data_augment:
            new_datasets = [
                f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
                (
                    f"{dataset}_formant_shift"
                    if cfg.preprocess.use_formant_shift
                    else None
                ),
                f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
                f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
            ]
            # Drop augmentations that are disabled in the config.
            new_datasets_list.extend(filter(None, new_datasets))
        cfg.dataset.extend(new_datasets_list)

    # CUDA settings
    cuda_relevant()

    # Build trainer
    trainer = build_trainer(args, cfg)

    trainer.train_loop()
108
+
109
+
110
+ if __name__ == "__main__":
111
+ main()
utils/Amphion/bins/tta/inference.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ from argparse import ArgumentParser
8
+ import os
9
+
10
+ from models.tta.ldm.audioldm_inference import AudioLDMInference
11
+ from utils.util import save_config, load_model_config, load_config
12
+ import numpy as np
13
+ import torch
14
+
15
+
16
def build_inference(args, cfg):
    """Instantiate the TTA inference class selected by ``cfg.model_type``.

    Raises:
        KeyError: if ``cfg.model_type`` is not a supported TTA model.
    """
    inference_classes = {
        "AudioLDM": AudioLDMInference,
    }
    return inference_classes[cfg.model_type](args, cfg)
24
+
25
+
26
def build_parser():
    """Build the CLI argument parser for text-to-audio inference."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="JSON/YAML file for configurations.",
    )
    parser.add_argument(
        "--text",
        help="Text to be synthesized",
        type=str,
        default="Text to be synthesized.",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
    )
    parser.add_argument(
        "--vocoder_path", type=str, help="Checkpoint path of the vocoder"
    )
    parser.add_argument(
        "--vocoder_config_path", type=str, help="Config path of the vocoder"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="Output dir for saving generated results",
    )
    parser.add_argument(
        "--num_steps",
        type=int,
        default=200,
        help="The total number of denosing steps",
    )
    parser.add_argument(
        "--guidance_scale",
        type=float,
        default=4.0,
        help="The scale of classifer free guidance",
    )
    # NOTE(review): main() later overwrites this int with a torch.device.
    parser.add_argument("--local_rank", default=-1, type=int)
    return parser
71
+
72
+
73
def main():
    """Parse CLI arguments and run text-to-audio (AudioLDM) inference."""
    # Parse arguments
    args = build_parser().parse_args()
    # args, infer_type = formulate_parser(args)

    # Parse config
    cfg = load_config(args.config)
    # NOTE(review): ``local_rank`` is repurposed to hold a torch.device here,
    # overwriting the integer CLI value — verify downstream expects a device.
    if torch.cuda.is_available():
        args.local_rank = torch.device("cuda")
    else:
        args.local_rank = torch.device("cpu")
    print("args: ", args)

    # Build inference
    inferencer = build_inference(args, cfg)

    # Run inference
    inferencer.inference()
91
+
92
+
93
+ if __name__ == "__main__":
94
+ main()
utils/Amphion/bins/tta/preprocess.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ import pyworld as pw
14
+ from multiprocessing import cpu_count
15
+
16
+
17
+ from utils.util import load_config
18
+ from preprocessors.processor import preprocess_dataset, prepare_align
19
+ from preprocessors.metadata import cal_metadata
20
+ from processors import acoustic_extractor, content_extractor, data_augment
21
+
22
+
23
def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
    """Extract acoustic features for every utterance of ``dataset``.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files
        cfg (dict): dictionary that stores configurations
        n_workers (int, optional): reserved for parallel extraction. Defaults to 1.
    """
    # Evaluation-only datasets carry just a test split.
    splits = ["test"] if "eval" in dataset else ["train", "test"]
    dataset_output = os.path.join(output_path, dataset)

    metadata = []
    for split in splits:
        with open(os.path.join(dataset_output, "{}.json".format(split)), "r") as f:
            metadata.extend(json.load(f))

    # Parallel extraction exists but is disabled upstream:
    # acoustic_extractor.extract_utt_acoustic_features_parallel(
    #     metadata, dataset_output, cfg, n_workers=n_workers
    # )
    acoustic_extractor.extract_utt_acoustic_features_serial(
        metadata, dataset_output, cfg
    )
46
+
47
+
48
def extract_content_features(dataset, output_path, cfg, num_workers=1):
    """Extract content features of utterances in the dataset.

    Args:
        dataset (str): name of dataset, e.g. opencpop
        output_path (str): directory that stores train, test and feature files
        cfg (dict): dictionary that stores configurations
        num_workers (int): workers for the extraction dataloader
    """
    splits = ["test"] if "eval" in dataset else ["train", "test"]
    dataset_output = os.path.join(output_path, dataset)

    metadata = []
    for split in splits:
        with open(os.path.join(dataset_output, "{}.json".format(split)), "r") as f:
            metadata.extend(json.load(f))

    content_extractor.extract_utt_content_features_dataloader(
        cfg, metadata, num_workers
    )
67
+
68
+
69
def preprocess(cfg, args):
    """Preprocess raw data of single or multiple datasets (in cfg.dataset).

    Pipeline: (optionally) prepare MFA alignment, split train/test,
    optionally augment, dump metadata, extract acoustic features (or link
    them from the source dataset for augmented copies), then extract
    content features.

    Args:
        cfg (dict): dictionary that stores configurations
        args (ArgumentParser): specify the configuration file and num_workers
    """
    # Specify the output root path to save the processed data
    output_path = cfg.preprocess.processed_dir
    os.makedirs(output_path, exist_ok=True)

    ## Split train and test sets
    for dataset in cfg.dataset:
        print("Preprocess {}...".format(dataset))

        if args.prepare_alignment:
            ## Prepare alignment with MFA
            print("Prepare alignment {}...".format(dataset))
            prepare_align(
                dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
            )
        preprocess_dataset(
            dataset,
            cfg.dataset_path[dataset],
            output_path,
            cfg.preprocess,
            cfg.task_type,
            is_custom_dataset=dataset in cfg.use_custom_dataset,
        )

    # Data augmentation: create new wav files with pitch shift, formant shift,
    # equalizer, time stretch
    try:
        assert isinstance(
            cfg.preprocess.data_augment, list
        ), "Please provide a list of datasets need to be augmented."
        if len(cfg.preprocess.data_augment) > 0:
            new_datasets_list = []
            for dataset in cfg.preprocess.data_augment:
                new_datasets = data_augment.augment_dataset(cfg, dataset)
                new_datasets_list.extend(new_datasets)
            cfg.dataset.extend(new_datasets_list)
            print("Augmentation datasets: ", cfg.dataset)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate; a missing ``data_augment`` config entry simply
        # means no augmentation is requested.
        print("No Data Augmentation.")

    # Dump metadata of datasets (singers, train/test durations, etc.)
    cal_metadata(cfg)

    ## Prepare the acoustic features
    for dataset in cfg.dataset:
        # Skip augmented datasets which do not need to extract acoustic
        # features; they are copied from the original dataset below.
        # (Fixed a chained-comparison typo: the condition previously read
        # ``"equalizer" in dataset in dataset``.)
        if (
            "pitch_shift" in dataset
            or "formant_shift" in dataset
            or "equalizer" in dataset
        ):
            continue
        print(
            "Extracting acoustic features for {} using {} workers ...".format(
                dataset, args.num_workers
            )
        )
        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
        # Calculate the statistics of acoustic features
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
        if cfg.preprocess.extract_energy:
            acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)

        if cfg.preprocess.align_mel_duration:
            acoustic_extractor.align_duration_mel(dataset, output_path, cfg)

    # Copy acoustic features for augmented datasets by creating soft-links
    for dataset in cfg.dataset:
        if "pitch_shift" in dataset:
            src_dataset = dataset.replace("_pitch_shift", "")
        elif "formant_shift" in dataset:
            src_dataset = dataset.replace("_formant_shift", "")
        elif "equalizer" in dataset:
            src_dataset = dataset.replace("_equalizer", "")
        else:
            continue
        src_dataset_dir = os.path.join(output_path, src_dataset)
        dataset_dir = os.path.join(output_path, dataset)
        metadata = []
        for split in ["test"] if "eval" in dataset else ["train", "test"]:
            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
            with open(metadata_file_path, "r") as f:
                metadata.extend(json.load(f))
        print("Copying acoustic features for {}...".format(dataset))
        acoustic_extractor.copy_acoustic_features(
            metadata, dataset_dir, src_dataset_dir, cfg
        )
        if cfg.preprocess.mel_min_max_norm:
            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)

        if cfg.preprocess.extract_pitch:
            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)

    # Prepare the content features
    for dataset in cfg.dataset:
        print("Extracting content features for {}...".format(dataset))
        extract_content_features(dataset, output_path, cfg, args.num_workers)
178
+
179
+
180
def main():
    """CLI wrapper: parse arguments, load the config, and run preprocessing."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config", default="config.json", help="json files for configurations."
    )
    parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
    # NOTE(review): argparse ``type=bool`` treats any non-empty string as
    # True (bool("False") is True) — confirm callers only rely on the default.
    parser.add_argument("--prepare_alignment", type=bool, default=False)

    args = parser.parse_args()
    preprocess(load_config(args.config), args)
192
+
193
+
194
+ if __name__ == "__main__":
195
+ main()
utils/Amphion/bins/tta/train_tta.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+ import torch
9
+
10
+ from models.tta.autoencoder.autoencoder_trainer import AutoencoderKLTrainer
11
+ from models.tta.ldm.audioldm_trainer import AudioLDMTrainer
12
+ from utils.util import load_config
13
+
14
+
15
def build_trainer(args, cfg):
    """Instantiate the TTA trainer class selected by ``cfg.model_type``.

    Raises:
        KeyError: if ``cfg.model_type`` is not a supported TTA model.
    """
    trainer_classes = {
        "AutoencoderKL": AutoencoderKLTrainer,
        "AudioLDM": AudioLDMTrainer,
    }
    return trainer_classes[cfg.model_type](args, cfg)
24
+
25
+
26
def main():
    """Parse CLI args and launch text-to-audio (TTA) training."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        default="config.json",
        help="json files for configurations.",
        required=True,
    )
    parser.add_argument(
        "--num_workers", type=int, default=6, help="Number of dataloader workers."
    )
    parser.add_argument(
        "--exp_name",
        type=str,
        default="exp_name",
        help="A specific name to note the experiment",
        required=True,
    )
    parser.add_argument(
        "--resume",
        type=str,
        default=None,
        # action="store_true",
        help="The model name to restore",
    )
    parser.add_argument(
        "--log_level", default="info", help="logging level (info, debug, warning)"
    )
    parser.add_argument("--stdout_interval", default=5, type=int)
    parser.add_argument("--local_rank", default=-1, type=int)
    args = parser.parse_args()
    cfg = load_config(args.config)
    cfg.exp_name = args.exp_name

    # Model saving dir
    args.log_dir = os.path.join(cfg.log_dir, args.exp_name)
    os.makedirs(args.log_dir, exist_ok=True)

    # NOTE(review): in the non-DDP path ``local_rank`` is overwritten with a
    # torch.device — confirm the trainer expects a device object here.
    if not cfg.train.ddp:
        args.local_rank = torch.device("cuda")

    # Build trainer
    trainer = build_trainer(args, cfg)

    # Restore models
    if args.resume:
        trainer.restore()
    trainer.train()
74
+
75
+
76
+ if __name__ == "__main__":
77
+ main()
utils/Amphion/bins/tts/inference.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ from argparse import ArgumentParser
8
+ import os
9
+
10
+ from models.tts.fastspeech2.fs2_inference import FastSpeech2Inference
11
+ from models.tts.vits.vits_inference import VitsInference
12
+ from models.tts.valle.valle_inference import VALLEInference
13
+ from models.tts.naturalspeech2.ns2_inference import NS2Inference
14
+ from utils.util import load_config
15
+ import torch
16
+
17
+
18
def build_inference(args, cfg):
    """Instantiate the TTS inference class selected by ``cfg.model_type``.

    Raises:
        KeyError: if ``cfg.model_type`` is not a supported TTS model.
    """
    inference_classes = {
        "FastSpeech2": FastSpeech2Inference,
        "VITS": VitsInference,
        "VALLE": VALLEInference,
        "NaturalSpeech2": NS2Inference,
    }
    return inference_classes[cfg.model_type](args, cfg)
29
+
30
+
31
def cuda_relevant(deterministic=False):
    """Set global CUDA/cuDNN behavior flags for this process.

    Args:
        deterministic (bool): if True, force reproducible kernel selection at
            the cost of disabling cuDNN autotuning.
    """
    torch.cuda.empty_cache()
    # Enable TF32 math on Ampere-and-newer GPUs.
    for backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
        backend.allow_tf32 = True
    torch.backends.cudnn.enabled = True
    # Benchmark autotuning and deterministic kernels are mutually exclusive.
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic
    torch.use_deterministic_algorithms(deterministic)
41
+
42
+
43
def build_parser():
    """Build the CLI argument parser for TTS inference (batch or single)."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="JSON/YAML file for configurations.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        help="convert from the source data",
        default=None,
    )
    parser.add_argument(
        "--testing_set",
        type=str,
        help="train, test, golden_test",
        default="test",
    )
    parser.add_argument(
        "--test_list_file",
        type=str,
        help="convert from the test list file",
        default=None,
    )
    parser.add_argument(
        "--speaker_name",
        type=str,
        default=None,
        help="speaker name for multi-speaker synthesis, for single-sentence mode only",
    )
    parser.add_argument(
        "--text",
        help="Text to be synthesized.",
        type=str,
        default="",
    )
    parser.add_argument(
        "--vocoder_dir",
        type=str,
        default=None,
        help="Vocoder checkpoint directory. Searching behavior is the same as "
        "the acoustics one.",
    )
    parser.add_argument(
        "--acoustics_dir",
        type=str,
        default=None,
        help="Acoustic model checkpoint directory. If a directory is given, "
        "search for the latest checkpoint dir in the directory. If a specific "
        "checkpoint dir is given, directly load the checkpoint.",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        default=None,
        help="Acoustic model checkpoint directory. If a directory is given, "
        "search for the latest checkpoint dir in the directory. If a specific "
        "checkpoint dir is given, directly load the checkpoint.",
    )
    parser.add_argument(
        "--mode",
        type=str,
        choices=["batch", "single"],
        required=True,
        help="Synthesize a whole dataset or a single sentence",
    )
    parser.add_argument(
        "--log_level",
        type=str,
        default="warning",
        help="Logging level. Default: warning",
    )
    # Prosody controls below only apply to models that expose them
    # (e.g. FastSpeech2-style variance adaptors).
    parser.add_argument(
        "--pitch_control",
        type=float,
        default=1.0,
        help="control the pitch of the whole utterance, larger value for higher pitch",
    )
    parser.add_argument(
        "--energy_control",
        type=float,
        default=1.0,
        help="control the energy of the whole utterance, larger value for larger volume",
    )
    parser.add_argument(
        "--duration_control",
        type=float,
        default=1.0,
        help="control the speed of the whole utterance, larger value for slower speaking rate",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="Output dir for saving generated results",
    )
    return parser
143
+
144
+
145
def main():
    """Parse CLI arguments (incl. model-specific ones) and run TTS inference."""
    # Parse arguments
    parser = build_parser()
    # Let model-specific inference classes register their extra flags.
    VALLEInference.add_arguments(parser)
    NS2Inference.add_arguments(parser)
    args = parser.parse_args()
    print(args)

    # Parse config
    cfg = load_config(args.config)

    # CUDA settings
    cuda_relevant()

    # Build inference
    inferencer = build_inference(args, cfg)

    # Run inference
    inferencer.inference()
164
+
165
+
166
+ if __name__ == "__main__":
167
+ main()
utils/Amphion/bins/tts/preprocess.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ import pyworld as pw
14
+ from multiprocessing import cpu_count
15
+
16
+
17
+ from utils.util import load_config
18
+ from preprocessors.processor import preprocess_dataset, prepare_align
19
+ from preprocessors.metadata import cal_metadata
20
+ from processors import (
21
+ acoustic_extractor,
22
+ content_extractor,
23
+ data_augment,
24
+ phone_extractor,
25
+ )
26
+
27
+
28
+ def extract_acoustic_features(dataset, output_path, cfg, dataset_types, n_workers=1):
29
+ """Extract acoustic features of utterances in the dataset
30
+
31
+ Args:
32
+ dataset (str): name of dataset, e.g. opencpop
33
+ output_path (str): directory that stores train, test and feature files of datasets
34
+ cfg (dict): dictionary that stores configurations
35
+ n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
36
+ """
37
+
38
+ metadata = []
39
+ for dataset_type in dataset_types:
40
+ dataset_output = os.path.join(output_path, dataset)
41
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
42
+ with open(dataset_file, "r") as f:
43
+ metadata.extend(json.load(f))
44
+
45
+ # acoustic_extractor.extract_utt_acoustic_features_parallel(
46
+ # metadata, dataset_output, cfg, n_workers=n_workers
47
+ # )
48
+ acoustic_extractor.extract_utt_acoustic_features_serial(
49
+ metadata, dataset_output, cfg
50
+ )
51
+
52
+
53
+ def extract_content_features(dataset, output_path, cfg, dataset_types, num_workers=1):
54
+ """Extract content features of utterances in the dataset
55
+
56
+ Args:
57
+ dataset (str): name of dataset, e.g. opencpop
58
+ output_path (str): directory that stores train, test and feature files of datasets
59
+ cfg (dict): dictionary that stores configurations
60
+ """
61
+
62
+ metadata = []
63
+ for dataset_type in dataset_types:
64
+ dataset_output = os.path.join(output_path, dataset)
65
+ # dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
66
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
67
+ with open(dataset_file, "r") as f:
68
+ metadata.extend(json.load(f))
69
+
70
+ content_extractor.extract_utt_content_features_dataloader(
71
+ cfg, metadata, num_workers
72
+ )
73
+
74
+
75
+ def extract_phonme_sequences(dataset, output_path, cfg, dataset_types):
76
+ """Extract phoneme features of utterances in the dataset
77
+
78
+ Args:
79
+ dataset (str): name of dataset, e.g. opencpop
80
+ output_path (str): directory that stores train, test and feature files of datasets
81
+ cfg (dict): dictionary that stores configurations
82
+
83
+ """
84
+
85
+ metadata = []
86
+ for dataset_type in dataset_types:
87
+ dataset_output = os.path.join(output_path, dataset)
88
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
89
+ with open(dataset_file, "r") as f:
90
+ metadata.extend(json.load(f))
91
+ phone_extractor.extract_utt_phone_sequence(dataset, cfg, metadata)
92
+
93
+
94
+ def preprocess(cfg, args):
95
+ """Preprocess raw data of single or multiple datasets (in cfg.dataset)
96
+
97
+ Args:
98
+ cfg (dict): dictionary that stores configurations
99
+ args (ArgumentParser): specify the configuration file and num_workers
100
+ """
101
+ # Specify the output root path to save the processed data
102
+ output_path = cfg.preprocess.processed_dir
103
+ os.makedirs(output_path, exist_ok=True)
104
+
105
+ # Split train and test sets
106
+ for dataset in cfg.dataset:
107
+ print("Preprocess {}...".format(dataset))
108
+
109
+ if args.prepare_alignment:
110
+ # Prepare alignment with MFA
111
+ print("Prepare alignment {}...".format(dataset))
112
+ prepare_align(
113
+ dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
114
+ )
115
+
116
+ preprocess_dataset(
117
+ dataset,
118
+ cfg.dataset_path[dataset],
119
+ output_path,
120
+ cfg.preprocess,
121
+ cfg.task_type,
122
+ is_custom_dataset=dataset in cfg.use_custom_dataset,
123
+ )
124
+
125
+ # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
126
+ try:
127
+ assert isinstance(
128
+ cfg.preprocess.data_augment, list
129
+ ), "Please provide a list of datasets need to be augmented."
130
+ if len(cfg.preprocess.data_augment) > 0:
131
+ new_datasets_list = []
132
+ for dataset in cfg.preprocess.data_augment:
133
+ new_datasets = data_augment.augment_dataset(cfg, dataset)
134
+ new_datasets_list.extend(new_datasets)
135
+ cfg.dataset.extend(new_datasets_list)
136
+ print("Augmentation datasets: ", cfg.dataset)
137
+ except:
138
+ print("No Data Augmentation.")
139
+
140
+ # json files
141
+ dataset_types = list()
142
+ dataset_types.append((cfg.preprocess.train_file).split(".")[0])
143
+ dataset_types.append((cfg.preprocess.valid_file).split(".")[0])
144
+ if "test" not in dataset_types:
145
+ dataset_types.append("test")
146
+ if "eval" in dataset:
147
+ dataset_types = ["test"]
148
+
149
+ # Dump metadata of datasets (singers, train/test durations, etc.)
150
+ cal_metadata(cfg, dataset_types)
151
+
152
+ # Prepare the acoustic features
153
+ for dataset in cfg.dataset:
154
+ # Skip augmented datasets which do not need to extract acoustic features
155
+ # We will copy acoustic features from the original dataset later
156
+ if (
157
+ "pitch_shift" in dataset
158
+ or "formant_shift" in dataset
159
+ or "equalizer" in dataset in dataset
160
+ ):
161
+ continue
162
+ print(
163
+ "Extracting acoustic features for {} using {} workers ...".format(
164
+ dataset, args.num_workers
165
+ )
166
+ )
167
+ extract_acoustic_features(
168
+ dataset, output_path, cfg, dataset_types, args.num_workers
169
+ )
170
+ # Calculate the statistics of acoustic features
171
+ if cfg.preprocess.mel_min_max_norm:
172
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
173
+
174
+ if cfg.preprocess.extract_pitch:
175
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
176
+
177
+ if cfg.preprocess.extract_energy:
178
+ acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)
179
+
180
+ if cfg.preprocess.pitch_norm:
181
+ acoustic_extractor.normalize(dataset, cfg.preprocess.pitch_dir, cfg)
182
+
183
+ if cfg.preprocess.energy_norm:
184
+ acoustic_extractor.normalize(dataset, cfg.preprocess.energy_dir, cfg)
185
+
186
+ # Copy acoustic features for augmented datasets by creating soft-links
187
+ for dataset in cfg.dataset:
188
+ if "pitch_shift" in dataset:
189
+ src_dataset = dataset.replace("_pitch_shift", "")
190
+ src_dataset_dir = os.path.join(output_path, src_dataset)
191
+ elif "formant_shift" in dataset:
192
+ src_dataset = dataset.replace("_formant_shift", "")
193
+ src_dataset_dir = os.path.join(output_path, src_dataset)
194
+ elif "equalizer" in dataset:
195
+ src_dataset = dataset.replace("_equalizer", "")
196
+ src_dataset_dir = os.path.join(output_path, src_dataset)
197
+ else:
198
+ continue
199
+ dataset_dir = os.path.join(output_path, dataset)
200
+ metadata = []
201
+ for split in ["train", "test"] if not "eval" in dataset else ["test"]:
202
+ metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
203
+ with open(metadata_file_path, "r") as f:
204
+ metadata.extend(json.load(f))
205
+ print("Copying acoustic features for {}...".format(dataset))
206
+ acoustic_extractor.copy_acoustic_features(
207
+ metadata, dataset_dir, src_dataset_dir, cfg
208
+ )
209
+ if cfg.preprocess.mel_min_max_norm:
210
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
211
+
212
+ if cfg.preprocess.extract_pitch:
213
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
214
+
215
+ # Prepare the content features
216
+ for dataset in cfg.dataset:
217
+ print("Extracting content features for {}...".format(dataset))
218
+ extract_content_features(
219
+ dataset, output_path, cfg, dataset_types, args.num_workers
220
+ )
221
+
222
+ # Prepare the phenome squences
223
+ if cfg.preprocess.extract_phone:
224
+ for dataset in cfg.dataset:
225
+ print("Extracting phoneme sequence for {}...".format(dataset))
226
+ extract_phonme_sequences(dataset, output_path, cfg, dataset_types)
227
+
228
+
229
+ def main():
230
+ parser = argparse.ArgumentParser()
231
+ parser.add_argument(
232
+ "--config", default="config.json", help="json files for configurations."
233
+ )
234
+ parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
235
+ parser.add_argument("--prepare_alignment", type=bool, default=False)
236
+
237
+ args = parser.parse_args()
238
+ cfg = load_config(args.config)
239
+
240
+ preprocess(cfg, args)
241
+
242
+
243
+ if __name__ == "__main__":
244
+ main()
utils/Amphion/bins/tts/train.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+
8
+ import torch
9
+
10
+ from models.tts.fastspeech2.fs2_trainer import FastSpeech2Trainer
11
+ from models.tts.vits.vits_trainer import VITSTrainer
12
+ from models.tts.valle.valle_trainer import VALLETrainer
13
+ from models.tts.naturalspeech2.ns2_trainer import NS2Trainer
14
+ from utils.util import load_config
15
+
16
+
17
+ def build_trainer(args, cfg):
18
+ supported_trainer = {
19
+ "FastSpeech2": FastSpeech2Trainer,
20
+ "VITS": VITSTrainer,
21
+ "VALLE": VALLETrainer,
22
+ "NaturalSpeech2": NS2Trainer,
23
+ }
24
+
25
+ trainer_class = supported_trainer[cfg.model_type]
26
+ trainer = trainer_class(args, cfg)
27
+ return trainer
28
+
29
+
30
+ def cuda_relevant(deterministic=False):
31
+ torch.cuda.empty_cache()
32
+ # TF32 on Ampere and above
33
+ torch.backends.cuda.matmul.allow_tf32 = True
34
+ torch.backends.cudnn.enabled = True
35
+ torch.backends.cudnn.allow_tf32 = True
36
+ # Deterministic
37
+ torch.backends.cudnn.deterministic = deterministic
38
+ torch.backends.cudnn.benchmark = not deterministic
39
+ torch.use_deterministic_algorithms(deterministic)
40
+
41
+
42
+ def main():
43
+ parser = argparse.ArgumentParser()
44
+ parser.add_argument(
45
+ "--config",
46
+ default="config.json",
47
+ help="json files for configurations.",
48
+ required=True,
49
+ )
50
+ parser.add_argument(
51
+ "--exp_name",
52
+ type=str,
53
+ default="exp_name",
54
+ help="A specific name to note the experiment",
55
+ required=True,
56
+ )
57
+ parser.add_argument(
58
+ "--resume", action="store_true", help="The model name to restore"
59
+ )
60
+ parser.add_argument(
61
+ "--log_level", default="warning", help="logging level (debug, info, warning)"
62
+ )
63
+ parser.add_argument(
64
+ "--resume_type",
65
+ type=str,
66
+ default="resume",
67
+ help="Resume training or finetuning.",
68
+ )
69
+ parser.add_argument(
70
+ "--checkpoint_path",
71
+ type=str,
72
+ default=None,
73
+ help="Checkpoint for resume training or finetuning.",
74
+ )
75
+
76
+ VALLETrainer.add_arguments(parser)
77
+ args = parser.parse_args()
78
+ cfg = load_config(args.config)
79
+
80
+ # Data Augmentation
81
+ if (
82
+ type(cfg.preprocess.data_augment) == list
83
+ and len(cfg.preprocess.data_augment) > 0
84
+ ):
85
+ new_datasets_list = []
86
+ for dataset in cfg.preprocess.data_augment:
87
+ new_datasets = [
88
+ f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
89
+ (
90
+ f"{dataset}_formant_shift"
91
+ if cfg.preprocess.use_formant_shift
92
+ else None
93
+ ),
94
+ f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
95
+ f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
96
+ ]
97
+ new_datasets_list.extend(filter(None, new_datasets))
98
+ cfg.dataset.extend(new_datasets_list)
99
+
100
+ # # CUDA settings
101
+ cuda_relevant()
102
+
103
+ # Build trainer
104
+ trainer = build_trainer(args, cfg)
105
+ torch.set_num_threads(1)
106
+ torch.set_num_interop_threads(1)
107
+ trainer.train_loop()
108
+
109
+
110
+ if __name__ == "__main__":
111
+ main()
utils/Amphion/bins/vocoder/inference.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+
9
+ import torch
10
+
11
+ from models.vocoders.vocoder_inference import VocoderInference
12
+ from utils.util import load_config
13
+
14
+
15
+ def build_inference(args, cfg, infer_type="infer_from_dataset"):
16
+ supported_inference = {
17
+ "GANVocoder": VocoderInference,
18
+ "DiffusionVocoder": VocoderInference,
19
+ }
20
+
21
+ inference_class = supported_inference[cfg.model_type]
22
+ return inference_class(args, cfg, infer_type)
23
+
24
+
25
+ def cuda_relevant(deterministic=False):
26
+ torch.cuda.empty_cache()
27
+ # TF32 on Ampere and above
28
+ torch.backends.cuda.matmul.allow_tf32 = True
29
+ torch.backends.cudnn.enabled = True
30
+ torch.backends.cudnn.allow_tf32 = True
31
+ # Deterministic
32
+ torch.backends.cudnn.deterministic = deterministic
33
+ torch.backends.cudnn.benchmark = not deterministic
34
+ torch.use_deterministic_algorithms(deterministic)
35
+
36
+
37
+ def build_parser():
38
+ r"""Build argument parser for inference.py.
39
+ Anything else should be put in an extra config YAML file.
40
+ """
41
+
42
+ parser = argparse.ArgumentParser()
43
+ parser.add_argument(
44
+ "--config",
45
+ type=str,
46
+ required=True,
47
+ help="JSON/YAML file for configurations.",
48
+ )
49
+ parser.add_argument(
50
+ "--infer_mode",
51
+ type=str,
52
+ required=None,
53
+ )
54
+ parser.add_argument(
55
+ "--infer_datasets",
56
+ nargs="+",
57
+ default=None,
58
+ )
59
+ parser.add_argument(
60
+ "--feature_folder",
61
+ type=str,
62
+ default=None,
63
+ )
64
+ parser.add_argument(
65
+ "--audio_folder",
66
+ type=str,
67
+ default=None,
68
+ )
69
+ parser.add_argument(
70
+ "--vocoder_dir",
71
+ type=str,
72
+ required=True,
73
+ help="Vocoder checkpoint directory. Searching behavior is the same as "
74
+ "the acoustics one.",
75
+ )
76
+ parser.add_argument(
77
+ "--output_dir",
78
+ type=str,
79
+ default="result",
80
+ help="Output directory. Default: ./result",
81
+ )
82
+ parser.add_argument(
83
+ "--log_level",
84
+ type=str,
85
+ default="warning",
86
+ help="Logging level. Default: warning",
87
+ )
88
+ parser.add_argument(
89
+ "--keep_cache",
90
+ action="store_true",
91
+ default=False,
92
+ help="Keep cache files. Only applicable to inference from files.",
93
+ )
94
+ return parser
95
+
96
+
97
+ def main():
98
+ # Parse arguments
99
+ args = build_parser().parse_args()
100
+
101
+ # Parse config
102
+ cfg = load_config(args.config)
103
+
104
+ # CUDA settings
105
+ cuda_relevant()
106
+
107
+ # Build inference
108
+ trainer = build_inference(args, cfg, args.infer_mode)
109
+
110
+ # Run inference
111
+ trainer.inference()
112
+
113
+
114
+ if __name__ == "__main__":
115
+ main()
utils/Amphion/bins/vocoder/preprocess.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ import pyworld as pw
14
+ from multiprocessing import cpu_count
15
+
16
+
17
+ from utils.util import load_config
18
+ from preprocessors.processor import preprocess_dataset, prepare_align
19
+ from preprocessors.metadata import cal_metadata
20
+ from processors import acoustic_extractor, content_extractor, data_augment
21
+
22
+
23
+ def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
24
+ """Extract acoustic features of utterances in the dataset
25
+
26
+ Args:
27
+ dataset (str): name of dataset, e.g. opencpop
28
+ output_path (str): directory that stores train, test and feature files of datasets
29
+ cfg (dict): dictionary that stores configurations
30
+ n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
31
+ """
32
+ types = ["train", "test"] if "eval" not in dataset else ["test"]
33
+ metadata = []
34
+ for dataset_type in types:
35
+ dataset_output = os.path.join(output_path, dataset)
36
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
37
+ with open(dataset_file, "r") as f:
38
+ metadata.extend(json.load(f))
39
+
40
+ acoustic_extractor.extract_utt_acoustic_features_serial(
41
+ metadata, dataset_output, cfg
42
+ )
43
+
44
+
45
+ def preprocess(cfg, args):
46
+ """Proprocess raw data of single or multiple datasets (in cfg.dataset)
47
+
48
+ Args:
49
+ cfg (dict): dictionary that stores configurations
50
+ args (ArgumentParser): specify the configuration file and num_workers
51
+ """
52
+ # Specify the output root path to save the processed data
53
+ output_path = cfg.preprocess.processed_dir
54
+ os.makedirs(output_path, exist_ok=True)
55
+
56
+ ## Split train and test sets
57
+ for dataset in cfg.dataset:
58
+ print("Preprocess {}...".format(dataset))
59
+
60
+ preprocess_dataset(
61
+ dataset,
62
+ cfg.dataset_path[dataset],
63
+ output_path,
64
+ cfg.preprocess,
65
+ cfg.task_type,
66
+ is_custom_dataset=dataset in cfg.use_custom_dataset,
67
+ )
68
+
69
+ # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
70
+ try:
71
+ assert isinstance(
72
+ cfg.preprocess.data_augment, list
73
+ ), "Please provide a list of datasets need to be augmented."
74
+ if len(cfg.preprocess.data_augment) > 0:
75
+ new_datasets_list = []
76
+ for dataset in cfg.preprocess.data_augment:
77
+ new_datasets = data_augment.augment_dataset(cfg, dataset)
78
+ new_datasets_list.extend(new_datasets)
79
+ cfg.dataset.extend(new_datasets_list)
80
+ print("Augmentation datasets: ", cfg.dataset)
81
+ except:
82
+ print("No Data Augmentation.")
83
+
84
+ # Dump metadata of datasets (singers, train/test durations, etc.)
85
+ cal_metadata(cfg)
86
+
87
+ ## Prepare the acoustic features
88
+ for dataset in cfg.dataset:
89
+ # Skip augmented datasets which do not need to extract acoustic features
90
+ # We will copy acoustic features from the original dataset later
91
+ if (
92
+ "pitch_shift" in dataset
93
+ or "formant_shift" in dataset
94
+ or "equalizer" in dataset in dataset
95
+ ):
96
+ continue
97
+ print(
98
+ "Extracting acoustic features for {} using {} workers ...".format(
99
+ dataset, args.num_workers
100
+ )
101
+ )
102
+ extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
103
+ # Calculate the statistics of acoustic features
104
+ if cfg.preprocess.mel_min_max_norm:
105
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
106
+
107
+ # Copy acoustic features for augmented datasets by creating soft-links
108
+ for dataset in cfg.dataset:
109
+ if "pitch_shift" in dataset:
110
+ src_dataset = dataset.replace("_pitch_shift", "")
111
+ src_dataset_dir = os.path.join(output_path, src_dataset)
112
+ elif "formant_shift" in dataset:
113
+ src_dataset = dataset.replace("_formant_shift", "")
114
+ src_dataset_dir = os.path.join(output_path, src_dataset)
115
+ elif "equalizer" in dataset:
116
+ src_dataset = dataset.replace("_equalizer", "")
117
+ src_dataset_dir = os.path.join(output_path, src_dataset)
118
+ else:
119
+ continue
120
+ dataset_dir = os.path.join(output_path, dataset)
121
+ metadata = []
122
+ for split in ["train", "test"] if not "eval" in dataset else ["test"]:
123
+ metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
124
+ with open(metadata_file_path, "r") as f:
125
+ metadata.extend(json.load(f))
126
+ print("Copying acoustic features for {}...".format(dataset))
127
+ acoustic_extractor.copy_acoustic_features(
128
+ metadata, dataset_dir, src_dataset_dir, cfg
129
+ )
130
+ if cfg.preprocess.mel_min_max_norm:
131
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
132
+
133
+ if cfg.preprocess.extract_pitch:
134
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
135
+
136
+
137
+ def main():
138
+ parser = argparse.ArgumentParser()
139
+ parser.add_argument(
140
+ "--config", default="config.json", help="json files for configurations."
141
+ )
142
+ parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
143
+
144
+ args = parser.parse_args()
145
+ cfg = load_config(args.config)
146
+
147
+ preprocess(cfg, args)
148
+
149
+
150
+ if __name__ == "__main__":
151
+ main()
utils/Amphion/bins/vocoder/train.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+
8
+ import torch
9
+
10
+ from models.vocoders.gan.gan_vocoder_trainer import GANVocoderTrainer
11
+ from models.vocoders.diffusion.diffusion_vocoder_trainer import DiffusionVocoderTrainer
12
+
13
+ from utils.util import load_config
14
+
15
+
16
+ def build_trainer(args, cfg):
17
+ supported_trainer = {
18
+ "GANVocoder": GANVocoderTrainer,
19
+ "DiffusionVocoder": DiffusionVocoderTrainer,
20
+ }
21
+
22
+ trainer_class = supported_trainer[cfg.model_type]
23
+ trainer = trainer_class(args, cfg)
24
+ return trainer
25
+
26
+
27
+ def cuda_relevant(deterministic=False):
28
+ torch.cuda.empty_cache()
29
+ # TF32 on Ampere and above
30
+ torch.backends.cuda.matmul.allow_tf32 = True
31
+ torch.backends.cudnn.enabled = True
32
+ torch.backends.cudnn.allow_tf32 = True
33
+ # Deterministic
34
+ torch.backends.cudnn.deterministic = deterministic
35
+ torch.backends.cudnn.benchmark = not deterministic
36
+ torch.use_deterministic_algorithms(deterministic)
37
+
38
+
39
+ def main():
40
+ parser = argparse.ArgumentParser()
41
+ parser.add_argument(
42
+ "--config",
43
+ default="config.json",
44
+ help="json files for configurations.",
45
+ required=True,
46
+ )
47
+ parser.add_argument(
48
+ "--exp_name",
49
+ type=str,
50
+ default="exp_name",
51
+ help="A specific name to note the experiment",
52
+ required=True,
53
+ )
54
+ parser.add_argument(
55
+ "--resume_type",
56
+ type=str,
57
+ help="resume for continue to train, finetune for finetuning",
58
+ )
59
+ parser.add_argument(
60
+ "--checkpoint",
61
+ type=str,
62
+ help="checkpoint to resume",
63
+ )
64
+ parser.add_argument(
65
+ "--log_level", default="warning", help="logging level (debug, info, warning)"
66
+ )
67
+ args = parser.parse_args()
68
+ cfg = load_config(args.config)
69
+
70
+ # Data Augmentation
71
+ if cfg.preprocess.data_augment:
72
+ new_datasets_list = []
73
+ for dataset in cfg.preprocess.data_augment:
74
+ new_datasets = [
75
+ # f"{dataset}_pitch_shift",
76
+ # f"{dataset}_formant_shift",
77
+ f"{dataset}_equalizer",
78
+ f"{dataset}_time_stretch",
79
+ ]
80
+ new_datasets_list.extend(new_datasets)
81
+ cfg.dataset.extend(new_datasets_list)
82
+
83
+ # CUDA settings
84
+ cuda_relevant()
85
+
86
+ # Build trainer
87
+ trainer = build_trainer(args, cfg)
88
+
89
+ trainer.train_loop()
90
+
91
+
92
+ if __name__ == "__main__":
93
+ main()
utils/Amphion/config/audioldm.json ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AudioLDM",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false,
20
+ "cond_mask_prob": 0.1
21
+ },
22
+ // model
23
+ "model": {
24
+ "audioldm": {
25
+ "image_size": 32,
26
+ "in_channels": 4,
27
+ "out_channels": 4,
28
+ "model_channels": 256,
29
+ "attention_resolutions": [
30
+ 4,
31
+ 2,
32
+ 1
33
+ ],
34
+ "num_res_blocks": 2,
35
+ "channel_mult": [
36
+ 1,
37
+ 2,
38
+ 4
39
+ ],
40
+ "num_heads": 8,
41
+ "use_spatial_transformer": true,
42
+ "transformer_depth": 1,
43
+ "context_dim": 768,
44
+ "use_checkpoint": true,
45
+ "legacy": false
46
+ },
47
+ "autoencoderkl": {
48
+ "ch": 128,
49
+ "ch_mult": [
50
+ 1,
51
+ 1,
52
+ 2,
53
+ 2,
54
+ 4
55
+ ],
56
+ "num_res_blocks": 2,
57
+ "in_channels": 1,
58
+ "z_channels": 4,
59
+ "out_ch": 1,
60
+ "double_z": true
61
+ },
62
+ "noise_scheduler": {
63
+ "num_train_timesteps": 1000,
64
+ "beta_start": 0.00085,
65
+ "beta_end": 0.012,
66
+ "beta_schedule": "scaled_linear",
67
+ "clip_sample": false,
68
+ "steps_offset": 1,
69
+ "set_alpha_to_one": false,
70
+ "skip_prk_steps": true,
71
+ "prediction_type": "epsilon"
72
+ }
73
+ },
74
+ // train
75
+ "train": {
76
+ "lronPlateau": {
77
+ "factor": 0.9,
78
+ "patience": 100,
79
+ "min_lr": 4.0e-5,
80
+ "verbose": true
81
+ },
82
+ "adam": {
83
+ "lr": 5.0e-5,
84
+ "betas": [
85
+ 0.9,
86
+ 0.999
87
+ ],
88
+ "weight_decay": 1.0e-2,
89
+ "eps": 1.0e-8
90
+ }
91
+ }
92
+ }
utils/Amphion/config/autoencoderkl.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AutoencoderKL",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false
20
+ },
21
+ // model
22
+ "model": {
23
+ "autoencoderkl": {
24
+ "ch": 128,
25
+ "ch_mult": [
26
+ 1,
27
+ 1,
28
+ 2,
29
+ 2,
30
+ 4
31
+ ],
32
+ "num_res_blocks": 2,
33
+ "in_channels": 1,
34
+ "z_channels": 4,
35
+ "out_ch": 1,
36
+ "double_z": true
37
+ },
38
+ "loss": {
39
+ "kl_weight": 1e-8,
40
+ "disc_weight": 0.5,
41
+ "disc_factor": 1.0,
42
+ "logvar_init": 0.0,
43
+ "min_adapt_d_weight": 0.0,
44
+ "max_adapt_d_weight": 10.0,
45
+ "disc_start": 50001,
46
+ "disc_in_channels": 1,
47
+ "disc_num_layers": 3,
48
+ "use_actnorm": false
49
+ }
50
+ },
51
+ // train
52
+ "train": {
53
+ "lronPlateau": {
54
+ "factor": 0.9,
55
+ "patience": 100,
56
+ "min_lr": 4.0e-5,
57
+ "verbose": true
58
+ },
59
+ "adam": {
60
+ "lr": 4.0e-4,
61
+ "betas": [
62
+ 0.9,
63
+ 0.999
64
+ ],
65
+ "weight_decay": 1.0e-2,
66
+ "eps": 1.0e-8
67
+ }
68
+ }
69
+ }
utils/Amphion/config/base.json ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "supported_model_type": [
3
+ "GANVocoder",
4
+ "Fastspeech2",
5
+ "DiffSVC",
6
+ "Transformer",
7
+ "EDM",
8
+ "CD"
9
+ ],
10
+ "task_type": "",
11
+ "dataset": [],
12
+ "use_custom_dataset": [],
13
+ "preprocess": {
14
+ "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
15
+ // trim audio silence
16
+ "data_augment": false,
17
+ "trim_silence": false,
18
+ "num_silent_frames": 8,
19
+ "trim_fft_size": 512, // fft size used in trimming
20
+ "trim_hop_size": 128, // hop size used in trimming
21
+ "trim_top_db": 30, // top db used in trimming sensitive to each dataset
22
+ // acoustic features
23
+ "extract_mel": false,
24
+ "mel_extract_mode": "",
25
+ "extract_linear_spec": false,
26
+ "extract_mcep": false,
27
+ "extract_pitch": false,
28
+ "extract_acoustic_token": false,
29
+ "pitch_remove_outlier": false,
30
+ "extract_uv": false,
31
+ "pitch_norm": false,
32
+ "extract_audio": false,
33
+ "extract_label": false,
34
+ "pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
35
+ "extract_energy": false,
36
+ "energy_remove_outlier": false,
37
+ "energy_norm": false,
38
+ "energy_extract_mode": "from_mel",
39
+ "extract_duration": false,
40
+ "extract_amplitude_phase": false,
41
+ "mel_min_max_norm": false,
42
+ // lingusitic features
43
+ "extract_phone": false,
44
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
45
+ // content features
46
+ "extract_whisper_feature": false,
47
+ "extract_contentvec_feature": false,
48
+ "extract_mert_feature": false,
49
+ "extract_wenet_feature": false,
50
+ // Settings for data preprocessing
51
+ "n_mel": 80,
52
+ "win_size": 480,
53
+ "hop_size": 120,
54
+ "sample_rate": 24000,
55
+ "n_fft": 1024,
56
+ "fmin": 0,
57
+ "fmax": 12000,
58
+ "min_level_db": -115,
59
+ "ref_level_db": 20,
60
+ "bits": 8,
61
+ // Directory names of processed data or extracted features
62
+ "processed_dir": "processed_data",
63
+ "trimmed_wav_dir": "trimmed_wavs", // directory name of silence trimed wav
64
+ "raw_data": "raw_data",
65
+ "phone_dir": "phones",
66
+ "wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
67
+ "audio_dir": "audios",
68
+ "log_amplitude_dir": "log_amplitudes",
69
+ "phase_dir": "phases",
70
+ "real_dir": "reals",
71
+ "imaginary_dir": "imaginarys",
72
+ "label_dir": "labels",
73
+ "linear_dir": "linears",
74
+ "mel_dir": "mels", // directory name of extraced mel features
75
+ "mcep_dir": "mcep", // directory name of extraced mcep features
76
+ "dur_dir": "durs",
77
+ "symbols_dict": "symbols.dict",
78
+ "lab_dir": "labs", // directory name of extraced label features
79
+ "wenet_dir": "wenet", // directory name of extraced wenet features
80
+ "contentvec_dir": "contentvec", // directory name of extraced wenet features
81
+ "pitch_dir": "pitches", // directory name of extraced pitch features
82
+ "energy_dir": "energys", // directory name of extracted energy features
83
+ "phone_pitch_dir": "phone_pitches", // directory name of extraced pitch features
84
+ "phone_energy_dir": "phone_energys", // directory name of extracted energy features
85
+ "uv_dir": "uvs", // directory name of extracted unvoiced features
86
+ "duration_dir": "duration", // ground-truth duration file
87
+ "phone_seq_file": "phone_seq_file", // phoneme sequence file
88
+ "file_lst": "file.lst",
89
+ "train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
90
+ "valid_file": "valid.json", // validattion set
91
+ "spk2id": "spk2id.json", // used for multi-speaker dataset
92
+ "utt2spk": "utt2spk", // used for multi-speaker dataset
93
+ "emo2id": "emo2id.json", // used for multi-emotion dataset
94
+ "utt2emo": "utt2emo", // used for multi-emotion dataset
95
+ // Features used for model training
96
+ "use_text": false,
97
+ "use_phone": false,
98
+ "use_phn_seq": false,
99
+ "use_lab": false,
100
+ "use_linear": false,
101
+ "use_mel": false,
102
+ "use_min_max_norm_mel": false,
103
+ "use_wav": false,
104
+ "use_phone_pitch": false,
105
+ "use_log_scale_pitch": false,
106
+ "use_phone_energy": false,
107
+ "use_phone_duration": false,
108
+ "use_log_scale_energy": false,
109
+ "use_wenet": false,
110
+ "use_dur": false,
111
+ "use_spkid": false, // True: use speaker id for multi-speaker dataset
112
+ "use_emoid": false, // True: use emotion id for multi-emotion dataset
113
+ "use_frame_pitch": false,
114
+ "use_uv": false,
115
+ "use_frame_energy": false,
116
+ "use_frame_duration": false,
117
+ "use_audio": false,
118
+ "use_label": false,
119
+ "use_one_hot": false,
120
+ "use_amplitude_phase": false,
121
+ "align_mel_duration": false
122
+ },
123
+ "train": {
124
+ "ddp": true,
125
+ "batch_size": 16,
126
+ "max_steps": 1000000,
127
+ // Trackers
128
+ "tracker": [
129
+ "tensorboard"
130
+ // "wandb",
131
+ // "cometml",
132
+ // "mlflow",
133
+ ],
134
+ "max_epoch": -1,
135
+ // -1 means no limit
136
+ "save_checkpoint_stride": [
137
+ 5,
138
+ 20
139
+ ],
140
+ // unit is epoch
141
+ "keep_last": [
142
+ 3,
143
+ -1
144
+ ],
145
+ // -1 means infinite, if one number will broadcast
146
+ "run_eval": [
147
+ false,
148
+ true
149
+ ],
150
+ // if one number will broadcast
151
+ // Fix the random seed
152
+ "random_seed": 10086,
153
+ // Optimizer
154
+ "optimizer": "AdamW",
155
+ "adamw": {
156
+ "lr": 4.0e-4
157
+ // nn model lr
158
+ },
159
+ // LR Scheduler
160
+ "scheduler": "ReduceLROnPlateau",
161
+ "reducelronplateau": {
162
+ "factor": 0.8,
163
+ "patience": 10,
164
+ // unit is epoch
165
+ "min_lr": 1.0e-4
166
+ },
167
+ // Batchsampler
168
+ "sampler": {
169
+ "holistic_shuffle": true,
170
+ "drop_last": true
171
+ },
172
+ // Dataloader
173
+ "dataloader": {
174
+ "num_worker": 32,
175
+ "pin_memory": true
176
+ },
177
+ "gradient_accumulation_step": 1,
178
+ "total_training_steps": 50000,
179
+ "save_summary_steps": 500,
180
+ "save_checkpoints_steps": 10000,
181
+ "valid_interval": 10000,
182
+ "keep_checkpoint_max": 5,
183
+ "multi_speaker_training": false // True: train multi-speaker model; False: training single-speaker model;
184
+ }
185
+ }