haoxiangsnr committed
Commit 9c38bf7 · verified · 1 Parent(s): 7926343

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50)
  1. Amphion/egs/tts/VITS/README.md +221 -0
  2. Amphion/egs/vocoder/diffusion/exp_config_base.json +71 -0
  3. Amphion/egs/vocoder/gan/bigvgan_large/run.sh +141 -0
  4. Amphion/evaluation/metrics/similarity/__init__.py +0 -0
  5. Amphion/models/base/base_dataset.py +464 -0
  6. Amphion/models/base/base_inference.py +220 -0
  7. Amphion/models/base/new_inference.py +253 -0
  8. Amphion/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
  9. Amphion/models/codec/ns3_codec/alias_free_torch/__pycache__/act.cpython-310.pyc +0 -0
  10. Amphion/models/codec/ns3_codec/alias_free_torch/__pycache__/filter.cpython-310.pyc +0 -0
  11. Amphion/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
  12. Amphion/models/codec/ns3_codec/facodec.py +1163 -0
  13. Amphion/models/codec/ns3_codec/gradient_reversal.py +35 -0
  14. Amphion/models/codec/ns3_codec/melspec.py +102 -0
  15. Amphion/models/codec/ns3_codec/quantize/__pycache__/__init__.cpython-310.pyc +0 -0
  16. Amphion/models/codec/ns3_codec/quantize/fvq.py +116 -0
  17. Amphion/models/svc/transformer/transformer_inference.py +45 -0
  18. Amphion/models/svc/vits/vits_trainer.py +704 -0
  19. Amphion/models/tta/autoencoder/autoencoder_dataset.py +112 -0
  20. Amphion/models/tta/ldm/__init__.py +0 -0
  21. Amphion/models/tta/ldm/audioldm_dataset.py +151 -0
  22. Amphion/models/tta/ldm/audioldm_trainer.py +251 -0
  23. Amphion/models/tta/ldm/inference_utils/vocoder.py +408 -0
  24. Amphion/models/tts/base/__init__.py +7 -0
  25. Amphion/models/tts/base/tts_trainer.py +721 -0
  26. Amphion/models/tts/fastspeech2/fs2_trainer.py +155 -0
  27. Amphion/models/tts/naturalspeech2/ns2.py +259 -0
  28. Amphion/models/tts/naturalspeech2/ns2_dataset.py +524 -0
  29. Amphion/models/tts/naturalspeech2/ns2_inference.py +128 -0
  30. Amphion/models/tts/naturalspeech2/ns2_trainer.py +798 -0
  31. Amphion/models/tts/valle/__init__.py +0 -0
  32. Amphion/models/vocoders/autoregressive/autoregressive_vocoder_inference.py +0 -0
  33. Amphion/models/vocoders/autoregressive/wavenet/conv.py +66 -0
  34. Amphion/models/vocoders/autoregressive/wavenet/wavenet.py +170 -0
  35. Amphion/models/vocoders/diffusion/diffusion_vocoder_inference.py +131 -0
  36. Amphion/models/vocoders/flow/flow_vocoder_dataset.py +0 -0
  37. Amphion/models/vocoders/flow/flow_vocoder_inference.py +0 -0
  38. Amphion/models/vocoders/gan/discriminator/msd.py +88 -0
  39. Amphion/models/vocoders/gan/gan_vocoder_inference.py +96 -0
  40. Amphion/models/vocoders/vocoder_dataset.py +264 -0
  41. Amphion/models/vocoders/vocoder_sampler.py +126 -0
  42. Amphion/modules/activation_functions/snake.py +122 -0
  43. Amphion/modules/diffusion/bidilconv/bidilated_conv.py +102 -0
  44. Amphion/modules/diffusion/karras/sample.py +185 -0
  45. Amphion/modules/diffusion/unet/attention.py +241 -0
  46. Amphion/modules/diffusion/unet/resblock.py +178 -0
  47. Amphion/modules/diffusion/unet/unet.py +310 -0
  48. Amphion/modules/encoder/__init__.py +1 -0
  49. Amphion/modules/general/utils.py +100 -0
  50. Amphion/modules/norms/__init__.py +1 -0
Amphion/egs/tts/VITS/README.md ADDED
@@ -0,0 +1,221 @@
1
+ # VITS Recipe
2
+
3
+ [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/Text-to-Speech)
4
+ [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/Text-to-Speech)
5
+
6
+ In this recipe, we will show how to train VITS using Amphion's infrastructure. [VITS](https://arxiv.org/abs/2106.06103) is an end-to-end TTS architecture that utilizes a conditional variational autoencoder with adversarial learning.
7
+
8
+ There are four stages in total:
9
+
10
+ 1. Data preparation
11
+ 2. Features extraction
12
+ 3. Training
13
+ 4. Inference
14
+
15
+ > **NOTE:** You need to run every command in this recipe from the `Amphion` root directory:
16
+ > ```bash
17
+ > cd Amphion
18
+ > ```
19
+
20
+ ## 1. Data Preparation
21
+
22
+ ### Dataset Download
23
+ You can use any of the commonly used TTS datasets to train the TTS model, e.g., LJSpeech, VCTK, Hi-Fi TTS, LibriTTS, etc. We strongly recommend using LJSpeech when training a single-speaker TTS model for the first time, and Hi-Fi TTS when training a multi-speaker TTS model for the first time. The process of downloading these datasets is detailed [here](../../datasets/README.md).
24
+
25
+ ### Configuration
26
+
27
+ After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
28
+
29
+ ```json
30
+ "dataset": [
31
+ "LJSpeech",
32
+ //"hifitts"
33
+ ],
34
+ "dataset_path": {
35
+ // TODO: Fill in your dataset path
36
+ "LJSpeech": "[LJSpeech dataset path]",
37
+ //"hifitts": "[Hi-Fi TTS dataset path]"
38
+ },
39
+ ```
40
+
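+ Before running preprocessing, you can quickly check that the paths you filled in actually exist. The snippet below is a minimal sketch, not part of the official recipe; it assumes the config lives at `egs/tts/VITS/exp_config.json` and uses the `json5` package because Amphion's config files contain `//` comments.
+ 
+ ```python
+ # Sanity-check the dataset paths in exp_config.json (illustrative sketch only).
+ import os
+ import json5  # exp_config.json uses // comments, so plain json.load would fail
+ 
+ with open("egs/tts/VITS/exp_config.json") as f:
+     cfg = json5.load(f)
+ 
+ for name, path in cfg.get("dataset_path", {}).items():
+     status = "ok" if os.path.isdir(path) else "MISSING"
+     print(f"{name}: {path} [{status}]")
+ ```
+ 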
41
+ ## 2. Features Extraction
42
+
43
+ ### Configuration
44
+
45
+ In `exp_config.json`, specify the `log_dir` for saving the checkpoints and logs, and specify the `processed_dir` for saving processed data. For preprocessing the multi-speaker TTS dataset, set `extract_audio` and `use_spkid` to `true`:
46
+
47
+ ```json
48
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
49
+ "log_dir": "ckpts/tts",
50
+ "preprocess": {
51
+ //"extract_audio": true,
52
+ "use_phone": true,
53
+ // linguistic features
54
+ "extract_phone": true,
55
+ "phone_extractor": "espeak", // Options: "espeak", "pypinyin", "pypinyin_initials_finals", "lexicon" (the last is only for language=en-us right now)
56
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
57
+ "processed_dir": "data",
58
+ "sample_rate": 22050, //target sampling rate
59
+ "valid_file": "valid.json", //validation set
60
+ //"use_spkid": true, //use speaker ID to train multi-speaker TTS model
61
+ },
62
+ ```
63
+
64
+ ### Run
65
+
66
+ Run the `run.sh` as the preprocess stage (set `--stage 1`):
67
+
68
+ ```bash
69
+ sh egs/tts/VITS/run.sh --stage 1
70
+ ```
71
+
72
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
73
+
74
+ ## 3. Training
75
+
76
+ ### Configuration
77
+
78
+ We provide the default hyperparameters in `exp_config.json`. They work on a single 24 GB NVIDIA GPU. You can adjust them to fit your GPU machines.
79
+ For training the multi-speaker TTS model, set `n_speakers` to a value greater than or equal to the number of speakers in your dataset(s) (a larger value leaves room for fine-tuning on new speakers), and set `multi_speaker_training` to `true`.
80
+
81
+ ```json
82
+ "model": {
83
+ //"n_speakers": 10 //Number of speakers in the dataset(s) used. The default value is 0 if not specified.
84
+ },
85
+ "train": {
86
+ "batch_size": 16,
87
+ //"multi_speaker_training": true,
88
+ }
89
+ ```
90
+
91
+ ### Train From Scratch
92
+
93
+ Run `run.sh` as the training stage (set `--stage 2`) and specify an experiment name. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
94
+
95
+ ```bash
96
+ sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName]
97
+ ```
98
+
99
+ ### Train From Existing Source
100
+
101
+ We support training from existing sources for various purposes. You can resume training the model from a checkpoint or fine-tune a model from another checkpoint.
102
+
103
+ By setting `--resume true`, training will resume from the **latest checkpoint** of the current `[YourExptName]` by default. For example, if you want to resume training from the latest checkpoint in `Amphion/ckpts/tts/[YourExptName]/checkpoint`, run:
104
+
105
+ ```bash
106
+ sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \
107
+ --resume true
108
+ ```
109
+
110
+ You can also choose a **specific checkpoint** for retraining with the `--resume_from_ckpt_path` argument. For example, if you want to resume training from the checkpoint `Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]`, run:
111
+
112
+ ```bash
113
+ sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \
114
+ --resume true \
115
+ --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]"
116
+ ```
117
+
118
+ If you want to **fine-tune from another checkpoint**, just use `--resume_type` and set it to `"finetune"`. For example, if you want to fine-tune the model from the checkpoint `Amphion/ckpts/tts/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]`, run:
119
+
120
+
121
+ ```bash
122
+ sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \
123
+ --resume true \
124
+ --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]" \
125
+ --resume_type "finetune"
126
+ ```
127
+
128
+ > **NOTE:** `--resume_type` is set to `"resume"` by default, so it is not necessary to specify it when resuming training.
129
+ >
130
+ > The difference between `"resume"` and `"finetune"` is that `"finetune"` **only** loads the pretrained model weights from the checkpoint, while `"resume"` loads all the training states (including the optimizer, scheduler, etc.) from the checkpoint, as the sketch below illustrates.
131
+
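+ The following is a conceptual sketch of that difference, written with plain PyTorch objects for illustration; it is not Amphion's actual trainer code (which manages checkpoints through `accelerate`), but it shows which states each mode restores.
+ 
+ ```python
+ # "resume" vs. "finetune", illustrated with toy stand-ins (not Amphion's trainer code).
+ import torch
+ 
+ model = torch.nn.Linear(4, 4)
+ optimizer = torch.optim.AdamW(model.parameters())
+ scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.999)
+ 
+ # Pretend this dict is a checkpoint written by a previous run.
+ ckpt = {
+     "model": model.state_dict(),
+     "optimizer": optimizer.state_dict(),
+     "scheduler": scheduler.state_dict(),
+     "step": 10000,
+ }
+ 
+ # --resume_type "finetune": only the pretrained model weights are restored.
+ model.load_state_dict(ckpt["model"])
+ 
+ # --resume_type "resume": the optimizer, scheduler, and step counter are restored too,
+ # so training continues exactly where it stopped.
+ model.load_state_dict(ckpt["model"])
+ optimizer.load_state_dict(ckpt["optimizer"])
+ scheduler.load_state_dict(ckpt["scheduler"])
+ start_step = ckpt["step"]
+ ```
+ 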
132
+ Here are some example scenarios to better understand how to use these arguments:
133
+ | Scenario | `--resume` | `--resume_from_ckpt_path` | `--resume_type` |
134
+ | ------ | -------- | ----------------------- | ------------- |
135
+ | You want to train from scratch | no | no | no |
136
+ | The machine breaks down during training and you want to resume training from the latest checkpoint | `true` | no | no |
137
+ | You find the latest model is overfitting and you want to re-train from the checkpoint before | `true` | `SpecificCheckpoint Path` | no |
138
+ | You want to fine-tune a model from another checkpoint | `true` | `SpecificCheckpoint Path` | `"finetune"` |
139
+
140
+
141
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
142
+
143
+
144
+ ## 4. Inference
145
+
146
+ ### Pre-trained Model Download
147
+
148
+ We have released a pre-trained Amphion VITS model trained on LJSpeech, so you can download it [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the inference instructions below.
149
+
150
+
151
+ ### Configuration
152
+
153
+ For inference, you need to specify the following configurations when running `run.sh`:
154
+
155
+
156
+ | Parameters | Description | Example |
157
+ | --------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
158
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` |
159
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` |
160
+ | `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. |
161
+ | `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`.<br> For Hi-Fi TTS dataset, the inference dataset would be `hifitts`. |
162
+ | `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For the LJSpeech dataset, the testing set would be "`test`", split from LJSpeech during feature extraction, or "`golden_test`", cherry-picked from the test set as a template testing set.<br>For the Hi-Fi TTS dataset, the testing set would be "`test`", split from Hi-Fi TTS during the feature extraction process. |
163
+ | `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" |
164
+ | `--infer_speaker_name` | The target speaker whose voice is to be synthesized.<br> (***Note: only applicable to multi-speaker TTS model***) | For the Hi-Fi TTS dataset, the list of available speakers includes: "`hifitts_11614`", "`hifitts_11697`", "`hifitts_12787`", "`hifitts_6097`", "`hifitts_6670`", "`hifitts_6671`", "`hifitts_8051`", "`hifitts_9017`", "`hifitts_9136`", "`hifitts_92`". <br> You may find the list of available speakers in the `spk2id.json` file generated in ```log_dir/[YourExptName]``` that you have specified in `exp_config.json`. |
165
+
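+ If you are not sure which speaker names are available, the short sketch below prints them from `spk2id.json`. The path is an assumption based on the default `log_dir` described above; adjust it to wherever your experiment writes `spk2id.json`.
+ 
+ ```python
+ # List the speaker names accepted by --infer_speaker_name (illustrative sketch).
+ import json
+ 
+ with open("ckpts/tts/[YourExptName]/spk2id.json") as f:
+     spk2id = json.load(f)  # e.g. {"hifitts_92": 0, "hifitts_6097": 1, ...}
+ 
+ for name in sorted(spk2id):
+     print(name)
+ ```
+ 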
166
+ ### Run
167
+ #### Single text inference:
168
+ For the single-speaker TTS model, if you want to generate a single clip of speech from a given text, just run:
169
+
170
+ ```bash
171
+ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
172
+ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
173
+ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
174
+ --infer_mode "single" \
175
+ --infer_text "This is a clip of generated speech with the given text from a TTS model."
176
+ ```
177
+
178
+ For the multi-speaker TTS model, in addition to the above-mentioned arguments, you need to add the `--infer_speaker_name` argument, and run:
179
+ ```bash
180
+ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
181
+ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
182
+ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
183
+ --infer_mode "single" \
184
+ --infer_text "This is a clip of generated speech with the given text from a TTS model." \
185
+ --infer_speaker_name "hifitts_92"
186
+ ```
187
+
188
+ #### Batch inference:
189
+ For the single-speaker TTS model, if you want to generate speech for the whole testing set split from LJSpeech, just run:
190
+
191
+ ```bash
192
+ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
193
+ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
194
+ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
195
+ --infer_mode "batch" \
196
+ --infer_dataset "LJSpeech" \
197
+ --infer_testing_set "test"
198
+ ```
199
+ For the multi-speaker TTS model, if you want to generate speech for the whole testing set split from Hi-Fi TTS, follow the same procedure as above, with ```LJSpeech``` replaced by ```hifitts```:
200
+ ```bash
201
+ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
202
+ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
203
+ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
204
+ --infer_mode "batch" \
205
+ --infer_dataset "hifitts" \
206
+ --infer_testing_set "test"
207
+ ```
208
+
209
+
210
+ We have released a pre-trained Amphion VITS model trained on LJSpeech, so you can download it [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instructions. A pre-trained multi-speaker VITS model trained on Hi-Fi TTS will be released soon. Stay tuned.
211
+
212
+
213
+ ```bibtex
214
+ @inproceedings{kim2021conditional,
215
+ title={Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech},
216
+ author={Kim, Jaehyeon and Kong, Jungil and Son, Juhee},
217
+ booktitle={International Conference on Machine Learning},
218
+ pages={5530--5540},
219
+ year={2021},
220
+ }
221
+ ```
Amphion/egs/vocoder/diffusion/exp_config_base.json ADDED
@@ -0,0 +1,71 @@
1
+ {
2
+ "base_config": "config/vocoder.json",
3
+ "model_type": "DiffusionVocoder",
4
+ // TODO: Choose your needed datasets
5
+ "dataset": [
6
+ "csd",
7
+ "kising",
8
+ "m4singer",
9
+ "nus48e",
10
+ "opencpop",
11
+ "opensinger",
12
+ "opera",
13
+ "pjs",
14
+ "popbutfy",
15
+ "popcs",
16
+ "ljspeech",
17
+ "vctk",
18
+ "libritts",
19
+ ],
20
+ "dataset_path": {
21
+ // TODO: Fill in your dataset path
22
+ "csd": "[dataset path]",
23
+ "kising": "[dataset path]",
24
+ "m4singer": "[dataset path]",
25
+ "nus48e": "[dataset path]",
26
+ "opencpop": "[dataset path]",
27
+ "opensinger": "[dataset path]",
28
+ "opera": "[dataset path]",
29
+ "pjs": "[dataset path]",
30
+ "popbutfy": "[dataset path]",
31
+ "popcs": "[dataset path]",
32
+ "ljspeech": "[dataset path]",
33
+ "vctk": "[dataset path]",
34
+ "libritts": "[dataset path]",
35
+ },
36
+ // TODO: Fill in the output log path
37
+ "log_dir": "ckpts/vocoder",
38
+ "preprocess": {
39
+ // Acoustic features
40
+ "extract_mel": true,
41
+ "extract_audio": true,
42
+ "extract_pitch": false,
43
+ "extract_uv": false,
44
+ "pitch_extractor": "parselmouth",
45
+
46
+ // Features used for model training
47
+ "use_mel": true,
48
+ "use_frame_pitch": false,
49
+ "use_uv": false,
50
+ "use_audio": true,
51
+
52
+ // TODO: Fill in the output data path
53
+ "processed_dir": "data/",
54
+ "n_mel": 100,
55
+ "sample_rate": 24000
56
+ },
57
+ "train": {
58
+ // TODO: Choose a suitable batch size, training epoch, and save stride
59
+ "batch_size": 32,
60
+ "max_epoch": 1000000,
61
+ "save_checkpoint_stride": [20],
62
+ "adamw": {
63
+ "lr": 2.0e-4,
64
+ "adam_b1": 0.8,
65
+ "adam_b2": 0.99
66
+ },
67
+ "exponential_lr": {
68
+ "lr_decay": 0.999
69
+ },
70
+ }
71
+ }
Amphion/egs/vocoder/gan/bigvgan_large/run.sh ADDED
@@ -0,0 +1,141 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] The specific checkpoint path that you want to resume from.
30
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
31
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
32
+ --resume_type) shift; resume_type=$1 ; shift ;;
33
+ # [Only for Training] `main_process_port` for multi-GPU training
34
+ --main_process_port) shift; main_process_port=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The datasets to run inference on
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Experimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ if [ -z "$main_process_port" ]; then
71
+ main_process_port=29500
72
+ fi
73
+ echo "Main Process Port: $main_process_port"
74
+
75
+ ######## Features Extraction ###########
76
+ if [ $running_stage -eq 1 ]; then
77
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
78
+ --config $exp_config \
79
+ --num_workers 8
80
+ fi
81
+
82
+ ######## Training ###########
83
+ if [ $running_stage -eq 2 ]; then
84
+ if [ -z "$exp_name" ]; then
85
+ echo "[Error] Please specify the experiment name"
86
+ exit 1
87
+ fi
88
+ echo "Experimental Name: $exp_name"
89
+
90
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch \
91
+ --main_process_port "$main_process_port" \
92
+ "${work_dir}"/bins/vocoder/train.py \
93
+ --config "$exp_config" \
94
+ --exp_name "$exp_name" \
95
+ --log_level info \
96
+ --checkpoint "$checkpoint" \
97
+ --resume_type "$resume_type"
98
+ fi
99
+
100
+ ######## Inference/Conversion ###########
101
+ if [ $running_stage -eq 3 ]; then
102
+ if [ -z "$infer_expt_dir" ]; then
103
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
104
+ exit 1
105
+ fi
106
+
107
+ if [ -z "$infer_output_dir" ]; then
108
+ infer_output_dir="$infer_expt_dir/result"
109
+ fi
110
+
111
+ if [ $infer_mode = "infer_from_dataset" ]; then
112
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
113
+ --config $exp_config \
114
+ --infer_mode $infer_mode \
115
+ --infer_datasets $infer_datasets \
116
+ --vocoder_dir $infer_expt_dir \
117
+ --output_dir $infer_output_dir \
118
+ --log_level debug
119
+ fi
120
+
121
+ if [ $infer_mode = "infer_from_feature" ]; then
122
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
123
+ --config $exp_config \
124
+ --infer_mode $infer_mode \
125
+ --feature_folder $infer_feature_dir \
126
+ --vocoder_dir $infer_expt_dir \
127
+ --output_dir $infer_output_dir \
128
+ --log_level debug
129
+ fi
130
+
131
+ if [ $infer_mode = "infer_from_audio" ]; then
132
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
133
+ --config $exp_config \
134
+ --infer_mode $infer_mode \
135
+ --audio_folder $infer_audio_dir \
136
+ --vocoder_dir $infer_expt_dir \
137
+ --output_dir $infer_output_dir \
138
+ --log_level debug
139
+ fi
140
+
141
+ fi
Amphion/evaluation/metrics/similarity/__init__.py ADDED
File without changes
Amphion/models/base/base_dataset.py ADDED
@@ -0,0 +1,464 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import numpy as np
8
+ import torch.utils.data
9
+ from torch.nn.utils.rnn import pad_sequence
10
+ import librosa
11
+
12
+ from utils.data_utils import *
13
+ from processors.acoustic_extractor import cal_normalized_mel
14
+ from text import text_to_sequence
15
+ from text.text_token_collation import phoneIDCollation
16
+
17
+
18
+ class BaseOfflineDataset(torch.utils.data.Dataset):
19
+ def __init__(self, cfg, dataset, is_valid=False):
20
+ """
21
+ Args:
22
+ cfg: config
23
+ dataset: dataset name
24
+ is_valid: whether to use train or valid dataset
25
+ """
26
+
27
+ assert isinstance(dataset, str)
28
+
29
+ # self.data_root = processed_data_dir
30
+ self.cfg = cfg
31
+
32
+ processed_data_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
33
+ meta_file = cfg.preprocess.valid_file if is_valid else cfg.preprocess.train_file
34
+ self.metafile_path = os.path.join(processed_data_dir, meta_file)
35
+ self.metadata = self.get_metadata()
36
+
37
+ """
38
+ load spk2id and utt2spk from json file
39
+ spk2id: {spk1: 0, spk2: 1, ...}
40
+ utt2spk: {dataset_uid: spk1, ...}
41
+ """
42
+ if cfg.preprocess.use_spkid:
43
+ spk2id_path = os.path.join(processed_data_dir, cfg.preprocess.spk2id)
44
+ with open(spk2id_path, "r") as f:
45
+ self.spk2id = json.load(f)
46
+
47
+ utt2spk_path = os.path.join(processed_data_dir, cfg.preprocess.utt2spk)
48
+ self.utt2spk = dict()
49
+ with open(utt2spk_path, "r") as f:
50
+ for line in f.readlines():
51
+ utt, spk = line.strip().split("\t")
52
+ self.utt2spk[utt] = spk
53
+
54
+ if cfg.preprocess.use_uv:
55
+ self.utt2uv_path = {}
56
+ for utt_info in self.metadata:
57
+ dataset = utt_info["Dataset"]
58
+ uid = utt_info["Uid"]
59
+ utt = "{}_{}".format(dataset, uid)
60
+ self.utt2uv_path[utt] = os.path.join(
61
+ cfg.preprocess.processed_dir,
62
+ dataset,
63
+ cfg.preprocess.uv_dir,
64
+ uid + ".npy",
65
+ )
66
+
67
+ if cfg.preprocess.use_frame_pitch:
68
+ self.utt2frame_pitch_path = {}
69
+ for utt_info in self.metadata:
70
+ dataset = utt_info["Dataset"]
71
+ uid = utt_info["Uid"]
72
+ utt = "{}_{}".format(dataset, uid)
73
+
74
+ self.utt2frame_pitch_path[utt] = os.path.join(
75
+ cfg.preprocess.processed_dir,
76
+ dataset,
77
+ cfg.preprocess.pitch_dir,
78
+ uid + ".npy",
79
+ )
80
+
81
+ if cfg.preprocess.use_frame_energy:
82
+ self.utt2frame_energy_path = {}
83
+ for utt_info in self.metadata:
84
+ dataset = utt_info["Dataset"]
85
+ uid = utt_info["Uid"]
86
+ utt = "{}_{}".format(dataset, uid)
87
+
88
+ self.utt2frame_energy_path[utt] = os.path.join(
89
+ cfg.preprocess.processed_dir,
90
+ dataset,
91
+ cfg.preprocess.energy_dir,
92
+ uid + ".npy",
93
+ )
94
+
95
+ if cfg.preprocess.use_mel:
96
+ self.utt2mel_path = {}
97
+ for utt_info in self.metadata:
98
+ dataset = utt_info["Dataset"]
99
+ uid = utt_info["Uid"]
100
+ utt = "{}_{}".format(dataset, uid)
101
+
102
+ self.utt2mel_path[utt] = os.path.join(
103
+ cfg.preprocess.processed_dir,
104
+ dataset,
105
+ cfg.preprocess.mel_dir,
106
+ uid + ".npy",
107
+ )
108
+
109
+ if cfg.preprocess.use_linear:
110
+ self.utt2linear_path = {}
111
+ for utt_info in self.metadata:
112
+ dataset = utt_info["Dataset"]
113
+ uid = utt_info["Uid"]
114
+ utt = "{}_{}".format(dataset, uid)
115
+
116
+ self.utt2linear_path[utt] = os.path.join(
117
+ cfg.preprocess.processed_dir,
118
+ dataset,
119
+ cfg.preprocess.linear_dir,
120
+ uid + ".npy",
121
+ )
122
+
123
+ if cfg.preprocess.use_audio:
124
+ self.utt2audio_path = {}
125
+ for utt_info in self.metadata:
126
+ dataset = utt_info["Dataset"]
127
+ uid = utt_info["Uid"]
128
+ utt = "{}_{}".format(dataset, uid)
129
+
130
+ self.utt2audio_path[utt] = os.path.join(
131
+ cfg.preprocess.processed_dir,
132
+ dataset,
133
+ cfg.preprocess.audio_dir,
134
+ uid + ".npy",
135
+ )
136
+ elif cfg.preprocess.use_label:
137
+ self.utt2label_path = {}
138
+ for utt_info in self.metadata:
139
+ dataset = utt_info["Dataset"]
140
+ uid = utt_info["Uid"]
141
+ utt = "{}_{}".format(dataset, uid)
142
+
143
+ self.utt2label_path[utt] = os.path.join(
144
+ cfg.preprocess.processed_dir,
145
+ dataset,
146
+ cfg.preprocess.label_dir,
147
+ uid + ".npy",
148
+ )
149
+ elif cfg.preprocess.use_one_hot:
150
+ self.utt2one_hot_path = {}
151
+ for utt_info in self.metadata:
152
+ dataset = utt_info["Dataset"]
153
+ uid = utt_info["Uid"]
154
+ utt = "{}_{}".format(dataset, uid)
155
+
156
+ self.utt2one_hot_path[utt] = os.path.join(
157
+ cfg.preprocess.processed_dir,
158
+ dataset,
159
+ cfg.preprocess.one_hot_dir,
160
+ uid + ".npy",
161
+ )
162
+
163
+ if cfg.preprocess.use_text or cfg.preprocess.use_phone:
164
+ self.utt2seq = {}
165
+ for utt_info in self.metadata:
166
+ dataset = utt_info["Dataset"]
167
+ uid = utt_info["Uid"]
168
+ utt = "{}_{}".format(dataset, uid)
169
+
170
+ if cfg.preprocess.use_text:
171
+ text = utt_info["Text"]
172
+ sequence = text_to_sequence(text, cfg.preprocess.text_cleaners)
173
+ elif cfg.preprocess.use_phone:
174
+ # load phoneme sequence from phone file
175
+ phone_path = os.path.join(
176
+ processed_data_dir, cfg.preprocess.phone_dir, uid + ".phone"
177
+ )
178
+ with open(phone_path, "r") as fin:
179
+ phones = fin.readlines()
180
+ assert len(phones) == 1
181
+ phones = phones[0].strip()
182
+ phones_seq = phones.split(" ")
183
+
184
+ phon_id_collator = phoneIDCollation(cfg, dataset=dataset)
185
+ sequence = phon_id_collator.get_phone_id_sequence(cfg, phones_seq)
186
+
187
+ self.utt2seq[utt] = sequence
188
+
189
+ def get_metadata(self):
190
+ with open(self.metafile_path, "r", encoding="utf-8") as f:
191
+ metadata = json.load(f)
192
+
193
+ return metadata
194
+
195
+ def get_dataset_name(self):
196
+ return self.metadata[0]["Dataset"]
197
+
198
+ def __getitem__(self, index):
199
+ utt_info = self.metadata[index]
200
+
201
+ dataset = utt_info["Dataset"]
202
+ uid = utt_info["Uid"]
203
+ utt = "{}_{}".format(dataset, uid)
204
+
205
+ single_feature = dict()
206
+
207
+ if self.cfg.preprocess.use_spkid:
208
+ single_feature["spk_id"] = np.array(
209
+ [self.spk2id[self.utt2spk[utt]]], dtype=np.int32
210
+ )
211
+
212
+ if self.cfg.preprocess.use_mel:
213
+ mel = np.load(self.utt2mel_path[utt])
214
+ assert mel.shape[0] == self.cfg.preprocess.n_mel # [n_mels, T]
215
+ if self.cfg.preprocess.use_min_max_norm_mel:
216
+ # do mel norm
217
+ mel = cal_normalized_mel(mel, utt_info["Dataset"], self.cfg.preprocess)
218
+
219
+ if "target_len" not in single_feature.keys():
220
+ single_feature["target_len"] = mel.shape[1]
221
+ single_feature["mel"] = mel.T # [T, n_mels]
222
+
223
+ if self.cfg.preprocess.use_linear:
224
+ linear = np.load(self.utt2linear_path[utt])
225
+ if "target_len" not in single_feature.keys():
226
+ single_feature["target_len"] = linear.shape[1]
227
+ single_feature["linear"] = linear.T # [T, n_linear]
228
+
229
+ if self.cfg.preprocess.use_frame_pitch:
230
+ frame_pitch_path = self.utt2frame_pitch_path[utt]
231
+ frame_pitch = np.load(frame_pitch_path)
232
+ if "target_len" not in single_feature.keys():
233
+ single_feature["target_len"] = len(frame_pitch)
234
+ aligned_frame_pitch = align_length(
235
+ frame_pitch, single_feature["target_len"]
236
+ )
237
+ single_feature["frame_pitch"] = aligned_frame_pitch
238
+
239
+ if self.cfg.preprocess.use_uv:
240
+ frame_uv_path = self.utt2uv_path[utt]
241
+ frame_uv = np.load(frame_uv_path)
242
+ aligned_frame_uv = align_length(frame_uv, single_feature["target_len"])
243
+ aligned_frame_uv = [
244
+ 0 if frame_uv else 1 for frame_uv in aligned_frame_uv
245
+ ]
246
+ aligned_frame_uv = np.array(aligned_frame_uv)
247
+ single_feature["frame_uv"] = aligned_frame_uv
248
+
249
+ if self.cfg.preprocess.use_frame_energy:
250
+ frame_energy_path = self.utt2frame_energy_path[utt]
251
+ frame_energy = np.load(frame_energy_path)
252
+ if "target_len" not in single_feature.keys():
253
+ single_feature["target_len"] = len(frame_energy)
254
+ aligned_frame_energy = align_length(
255
+ frame_energy, single_feature["target_len"]
256
+ )
257
+ single_feature["frame_energy"] = aligned_frame_energy
258
+
259
+ if self.cfg.preprocess.use_audio:
260
+ audio = np.load(self.utt2audio_path[utt])
261
+ single_feature["audio"] = audio
262
+ single_feature["audio_len"] = audio.shape[0]
263
+
264
+ if self.cfg.preprocess.use_phone or self.cfg.preprocess.use_text:
265
+ single_feature["phone_seq"] = np.array(self.utt2seq[utt])
266
+ single_feature["phone_len"] = len(self.utt2seq[utt])
267
+
268
+ return single_feature
269
+
270
+ def __len__(self):
271
+ return len(self.metadata)
272
+
273
+
274
+ class BaseOfflineCollator(object):
275
+ """Zero-pads model inputs and targets based on number of frames per step"""
276
+
277
+ def __init__(self, cfg):
278
+ self.cfg = cfg
279
+
280
+ def __call__(self, batch):
281
+ packed_batch_features = dict()
282
+
283
+ # mel: [b, T, n_mels]
284
+ # frame_pitch, frame_energy: [1, T]
285
+ # target_len: [b]
286
+ # spk_id: [b, 1]
287
+ # mask: [b, T, 1]
288
+
289
+ for key in batch[0].keys():
290
+ if key == "target_len":
291
+ packed_batch_features["target_len"] = torch.LongTensor(
292
+ [b["target_len"] for b in batch]
293
+ )
294
+ masks = [
295
+ torch.ones((b["target_len"], 1), dtype=torch.long) for b in batch
296
+ ]
297
+ packed_batch_features["mask"] = pad_sequence(
298
+ masks, batch_first=True, padding_value=0
299
+ )
300
+ elif key == "phone_len":
301
+ packed_batch_features["phone_len"] = torch.LongTensor(
302
+ [b["phone_len"] for b in batch]
303
+ )
304
+ masks = [
305
+ torch.ones((b["phone_len"], 1), dtype=torch.long) for b in batch
306
+ ]
307
+ packed_batch_features["phn_mask"] = pad_sequence(
308
+ masks, batch_first=True, padding_value=0
309
+ )
310
+ elif key == "audio_len":
311
+ packed_batch_features["audio_len"] = torch.LongTensor(
312
+ [b["audio_len"] for b in batch]
313
+ )
314
+ masks = [
315
+ torch.ones((b["audio_len"], 1), dtype=torch.long) for b in batch
316
+ ]
317
+ else:
318
+ values = [torch.from_numpy(b[key]) for b in batch]
319
+ packed_batch_features[key] = pad_sequence(
320
+ values, batch_first=True, padding_value=0
321
+ )
322
+ return packed_batch_features
323
+
324
+
325
+ class BaseOnlineDataset(torch.utils.data.Dataset):
326
+ def __init__(self, cfg, dataset, is_valid=False):
327
+ """
328
+ Args:
329
+ cfg: config
330
+ dataset: dataset name
331
+ is_valid: whether to use train or valid dataset
332
+ """
333
+ assert isinstance(dataset, str)
334
+
335
+ self.cfg = cfg
336
+ self.sample_rate = cfg.preprocess.sample_rate
337
+ self.hop_size = self.cfg.preprocess.hop_size
338
+
339
+ processed_data_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
340
+ meta_file = cfg.preprocess.valid_file if is_valid else cfg.preprocess.train_file
341
+ self.metafile_path = os.path.join(processed_data_dir, meta_file)
342
+ self.metadata = self.get_metadata()
343
+
344
+ """
345
+ load spk2id and utt2spk from json file
346
+ spk2id: {spk1: 0, spk2: 1, ...}
347
+ utt2spk: {dataset_uid: spk1, ...}
348
+ """
349
+ if cfg.preprocess.use_spkid:
350
+ spk2id_path = os.path.join(processed_data_dir, cfg.preprocess.spk2id)
351
+ with open(spk2id_path, "r") as f:
352
+ self.spk2id = json.load(f)
353
+
354
+ utt2spk_path = os.path.join(processed_data_dir, cfg.preprocess.utt2spk)
355
+ self.utt2spk = dict()
356
+ with open(utt2spk_path, "r") as f:
357
+ for line in f.readlines():
358
+ utt, spk = line.strip().split("\t")
359
+ self.utt2spk[utt] = spk
360
+
361
+ def get_metadata(self):
362
+ with open(self.metafile_path, "r", encoding="utf-8") as f:
363
+ metadata = json.load(f)
364
+
365
+ return metadata
366
+
367
+ def get_dataset_name(self):
368
+ return self.metadata[0]["Dataset"]
369
+
370
+ def __getitem__(self, index):
371
+ """
372
+ single_feature:
373
+ wav: (T)
374
+ wav_len: int
375
+ target_len: int
376
+ mask: (n_frames, 1)
377
+ spk_id: (1)
378
+ """
379
+ utt_item = self.metadata[index]
380
+
381
+ wav_path = utt_item["Path"]
382
+ wav, _ = librosa.load(wav_path, sr=self.sample_rate)
383
+ # wav: (T)
384
+ wav = torch.as_tensor(wav, dtype=torch.float32)
385
+ wav_len = len(wav)
386
+ # mask: (n_frames, 1)
387
+ frame_len = wav_len // self.hop_size
388
+ mask = torch.ones(frame_len, 1, dtype=torch.long)
389
+
390
+ single_feature = {
391
+ "wav": wav,
392
+ "wav_len": wav_len,
393
+ "target_len": frame_len,
394
+ "mask": mask,
395
+ }
396
+
397
+ if self.cfg.preprocess.use_spkid:
398
+ utt = "{}_{}".format(utt_item["Dataset"], utt_item["Uid"])
399
+ single_feature["spk_id"] = torch.tensor(
400
+ [self.spk2id[self.utt2spk[utt]]], dtype=torch.int32
401
+ )
402
+
403
+ return single_feature
404
+
405
+ def __len__(self):
406
+ return len(self.metadata)
407
+
408
+
409
+ class BaseOnlineCollator(object):
410
+ """Zero-pads model inputs and targets based on number of frames per step (For on-the-fly features extraction, whose iterative item contains only wavs)"""
411
+
412
+ def __init__(self, cfg):
413
+ self.cfg = cfg
414
+
415
+ def __call__(self, batch):
416
+ """
417
+ BaseOnlineDataset.__getitem__:
418
+ wav: (T,)
419
+ wav_len: int
420
+ target_len: int
421
+ mask: (n_frames, 1)
422
+ spk_id: (1)
423
+
424
+ Returns:
425
+ wav: (B, T), torch.float32
426
+ wav_len: (B), torch.long
427
+ target_len: (B), torch.long
428
+ mask: (B, n_frames, 1), torch.long
429
+ spk_id: (B, 1), torch.int32
430
+ """
431
+ packed_batch_features = dict()
432
+
433
+ for key in batch[0].keys():
434
+ if key in ["wav_len", "target_len"]:
435
+ packed_batch_features[key] = torch.LongTensor([b[key] for b in batch])
436
+ else:
437
+ packed_batch_features[key] = pad_sequence(
438
+ [b[key] for b in batch], batch_first=True, padding_value=0
439
+ )
440
+ return packed_batch_features
441
+
442
+
443
+ class BaseTestDataset(torch.utils.data.Dataset):
444
+ def __init__(self, cfg, args):
445
+ raise NotImplementedError
446
+
447
+ def get_metadata(self):
448
+ raise NotImplementedError
449
+
450
+ def __getitem__(self, index):
451
+ raise NotImplementedError
452
+
453
+ def __len__(self):
454
+ return len(self.metadata)
455
+
456
+
457
+ class BaseTestCollator(object):
458
+ """Zero-pads model inputs and targets based on number of frames per step"""
459
+
460
+ def __init__(self, cfg):
461
+ raise NotImplementedError
462
+
463
+ def __call__(self, batch):
464
+ raise NotImplementedError
Amphion/models/base/base_inference.py ADDED
@@ -0,0 +1,220 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+ import re
9
+ import time
10
+ from pathlib import Path
11
+
12
+ import torch
13
+ from torch.utils.data import DataLoader
14
+ from tqdm import tqdm
15
+
16
+ from models.vocoders.vocoder_inference import synthesis
17
+ from torch.utils.data import DataLoader
18
+ from utils.util import set_all_random_seed
19
+ from utils.util import load_config
20
+
21
+
22
+ def parse_vocoder(vocoder_dir):
23
+ r"""Parse vocoder config"""
24
+ vocoder_dir = os.path.abspath(vocoder_dir)
25
+ ckpt_list = [ckpt for ckpt in Path(vocoder_dir).glob("*.pt")]
26
+ ckpt_list.sort(key=lambda x: int(x.stem), reverse=True)
27
+ ckpt_path = str(ckpt_list[0])
28
+ vocoder_cfg = load_config(os.path.join(vocoder_dir, "args.json"), lowercase=True)
29
+ vocoder_cfg.model.bigvgan = vocoder_cfg.vocoder
30
+ return vocoder_cfg, ckpt_path
31
+
32
+
33
+ class BaseInference(object):
34
+ def __init__(self, cfg, args):
35
+ self.cfg = cfg
36
+ self.args = args
37
+ self.model_type = cfg.model_type
38
+ self.avg_rtf = list()
39
+ set_all_random_seed(10086)
40
+ os.makedirs(args.output_dir, exist_ok=True)
41
+
42
+ if torch.cuda.is_available():
43
+ self.device = torch.device("cuda")
44
+ else:
45
+ self.device = torch.device("cpu")
46
+ torch.set_num_threads(10) # limit the number of CPU threads used when running inference on CPU
47
+
48
+ # Load acoustic model
49
+ self.model = self.create_model().to(self.device)
50
+ state_dict = self.load_state_dict()
51
+ self.load_model(state_dict)
52
+ self.model.eval()
53
+
54
+ # Load vocoder model if necessary
55
+ if self.args.checkpoint_dir_vocoder is not None:
56
+ self.get_vocoder_info()
57
+
58
+ def create_model(self):
59
+ raise NotImplementedError
60
+
61
+ def load_state_dict(self):
62
+ self.checkpoint_file = self.args.checkpoint_file
63
+ if self.checkpoint_file is None:
64
+ assert self.args.checkpoint_dir is not None
65
+ checkpoint_path = os.path.join(self.args.checkpoint_dir, "checkpoint")
66
+ checkpoint_filename = open(checkpoint_path).readlines()[-1].strip()
67
+ self.checkpoint_file = os.path.join(
68
+ self.args.checkpoint_dir, checkpoint_filename
69
+ )
70
+
71
+ self.checkpoint_dir = os.path.split(self.checkpoint_file)[0]
72
+
73
+ print("Restore acoustic model from {}".format(self.checkpoint_file))
74
+ raw_state_dict = torch.load(self.checkpoint_file, map_location=self.device)
75
+ self.am_restore_step = re.findall(r"step-(.+?)_loss", self.checkpoint_file)[0]
76
+
77
+ return raw_state_dict
78
+
79
+ def load_model(self, model):
80
+ raise NotImplementedError
81
+
82
+ def get_vocoder_info(self):
83
+ self.checkpoint_dir_vocoder = self.args.checkpoint_dir_vocoder
84
+ self.vocoder_cfg = os.path.join(
85
+ os.path.dirname(self.checkpoint_dir_vocoder), "args.json"
86
+ )
87
+ self.cfg.vocoder = load_config(self.vocoder_cfg, lowercase=True)
88
+ self.vocoder_tag = self.checkpoint_dir_vocoder.split("/")[-2].split(":")[-1]
89
+ self.vocoder_steps = self.checkpoint_dir_vocoder.split("/")[-1].split(".")[0]
90
+
91
+ def build_test_utt_data(self):
92
+ raise NotImplementedError
93
+
94
+ def build_testdata_loader(self, args, target_speaker=None):
95
+ datasets, collate = self.build_test_dataset()
96
+ self.test_dataset = datasets(self.cfg, args, target_speaker)
97
+ self.test_collate = collate(self.cfg)
98
+ self.test_batch_size = min(
99
+ self.cfg.train.batch_size, len(self.test_dataset.metadata)
100
+ )
101
+ test_loader = DataLoader(
102
+ self.test_dataset,
103
+ collate_fn=self.test_collate,
104
+ num_workers=self.args.num_workers,
105
+ batch_size=self.test_batch_size,
106
+ shuffle=False,
107
+ )
108
+ return test_loader
109
+
110
+ def inference_each_batch(self, batch_data):
111
+ raise NotImplementedError
112
+
113
+ def inference_for_batches(self, args, target_speaker=None):
114
+ ###### Construct test_batch ######
115
+ loader = self.build_testdata_loader(args, target_speaker)
116
+
117
+ n_batch = len(loader)
118
+ now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
119
+ print(
120
+ "Model eval time: {}, batch_size = {}, n_batch = {}".format(
121
+ now, self.test_batch_size, n_batch
122
+ )
123
+ )
124
+ self.model.eval()
125
+
126
+ ###### Inference for each batch ######
127
+ pred_res = []
128
+ with torch.no_grad():
129
+ for i, batch_data in enumerate(loader if n_batch == 1 else tqdm(loader)):
130
+ # Put the data to device
131
+ for k, v in batch_data.items():
132
+ batch_data[k] = batch_data[k].to(self.device)
133
+
134
+ y_pred, stats = self.inference_each_batch(batch_data)
135
+
136
+ pred_res += y_pred
137
+
138
+ return pred_res
139
+
140
+ def inference(self, feature):
141
+ raise NotImplementedError
142
+
143
+ def synthesis_by_vocoder(self, pred):
144
+ audios_pred = synthesis(
145
+ self.vocoder_cfg,
146
+ self.checkpoint_dir_vocoder,
147
+ len(pred),
148
+ pred,
149
+ )
150
+ return audios_pred
151
+
152
+ def __call__(self, utt):
153
+ feature = self.build_test_utt_data(utt)
154
+ start_time = time.time()
155
+ with torch.no_grad():
156
+ outputs = self.inference(feature)[0]
157
+ time_used = time.time() - start_time
158
+ rtf = time_used / (
159
+ outputs.shape[1]
160
+ * self.cfg.preprocess.hop_size
161
+ / self.cfg.preprocess.sample_rate
162
+ )
163
+ print("Time used: {:.3f}, RTF: {:.4f}".format(time_used, rtf))
164
+ self.avg_rtf.append(rtf)
165
+ audios = outputs.cpu().squeeze().numpy().reshape(-1, 1)
166
+ return audios
167
+
168
+
169
+ def base_parser():
170
+ parser = argparse.ArgumentParser()
171
+ parser.add_argument(
172
+ "--config", default="config.json", help="json files for configurations."
173
+ )
174
+ parser.add_argument("--use_ddp_inference", default=False)
175
+ parser.add_argument("--n_workers", default=1, type=int)
176
+ parser.add_argument("--local_rank", default=-1, type=int)
177
+ parser.add_argument(
178
+ "--batch_size", default=1, type=int, help="Batch size for inference"
179
+ )
180
+ parser.add_argument(
181
+ "--num_workers",
182
+ default=1,
183
+ type=int,
184
+ help="Worker number for inference dataloader",
185
+ )
186
+ parser.add_argument(
187
+ "--checkpoint_dir",
188
+ type=str,
189
+ default=None,
190
+ help="Checkpoint dir including model file and configuration",
191
+ )
192
+ parser.add_argument(
193
+ "--checkpoint_file", help="checkpoint file", type=str, default=None
194
+ )
195
+ parser.add_argument(
196
+ "--test_list", help="test utterance list for testing", type=str, default=None
197
+ )
198
+ parser.add_argument(
199
+ "--checkpoint_dir_vocoder",
200
+ help="Vocoder's checkpoint dir including model file and configuration",
201
+ type=str,
202
+ default=None,
203
+ )
204
+ parser.add_argument(
205
+ "--output_dir",
206
+ type=str,
207
+ default=None,
208
+ help="Output dir for saving generated results",
209
+ )
210
+ return parser
211
+
212
+
213
+ if __name__ == "__main__":
214
+ parser = base_parser()
215
+ args = parser.parse_args()
216
+ cfg = load_config(args.config)
217
+
218
+ # Build inference
219
+ inference = BaseInference(cfg, args)
220
+ inference()
Amphion/models/base/new_inference.py ADDED
@@ -0,0 +1,253 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import random
8
+ import re
9
+ import time
10
+ from abc import abstractmethod
11
+ from pathlib import Path
12
+
13
+ import accelerate
14
+ import json5
15
+ import numpy as np
16
+ import torch
17
+ from accelerate.logging import get_logger
18
+ from torch.utils.data import DataLoader
19
+
20
+ from models.vocoders.vocoder_inference import synthesis
21
+ from utils.io import save_audio
22
+ from utils.util import load_config
23
+ from utils.audio_slicer import is_silence
24
+
25
+ EPS = 1.0e-12
26
+
27
+
28
+ class BaseInference(object):
29
+ def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
30
+ super().__init__()
31
+
32
+ start = time.monotonic_ns()
33
+ self.args = args
34
+ self.cfg = cfg
35
+
36
+ assert infer_type in ["from_dataset", "from_file"]
37
+ self.infer_type = infer_type
38
+
39
+ # init with accelerate
40
+ self.accelerator = accelerate.Accelerator()
41
+ self.accelerator.wait_for_everyone()
42
+
43
+ # Use accelerate logger for distributed inference
44
+ with self.accelerator.main_process_first():
45
+ self.logger = get_logger("inference", log_level=args.log_level)
46
+
47
+ # Log some info
48
+ self.logger.info("=" * 56)
49
+ self.logger.info("||\t\t" + "New inference process started." + "\t\t||")
50
+ self.logger.info("=" * 56)
51
+ self.logger.info("\n")
52
+ self.logger.debug(f"Using {args.log_level.upper()} logging level.")
53
+
54
+ self.acoustics_dir = args.acoustics_dir
55
+ self.logger.debug(f"Acoustic dir: {args.acoustics_dir}")
56
+ self.vocoder_dir = args.vocoder_dir
57
+ self.logger.debug(f"Vocoder dir: {args.vocoder_dir}")
58
+ # should be in svc inferencer
59
+ # self.target_singer = args.target_singer
60
+ # self.logger.info(f"Target singers: {args.target_singer}")
61
+ # self.trans_key = args.trans_key
62
+ # self.logger.info(f"Trans key: {args.trans_key}")
63
+
64
+ os.makedirs(args.output_dir, exist_ok=True)
65
+
66
+ # set random seed
67
+ with self.accelerator.main_process_first():
68
+ start = time.monotonic_ns()
69
+ self._set_random_seed(self.cfg.train.random_seed)
70
+ end = time.monotonic_ns()
71
+ self.logger.debug(
72
+ f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
73
+ )
74
+ self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")
75
+
76
+ # setup data_loader
77
+ with self.accelerator.main_process_first():
78
+ self.logger.info("Building dataset...")
79
+ start = time.monotonic_ns()
80
+ self.test_dataloader = self._build_dataloader()
81
+ end = time.monotonic_ns()
82
+ self.logger.info(f"Building dataset done in {(end - start) / 1e6:.2f}ms")
83
+
84
+ # setup model
85
+ with self.accelerator.main_process_first():
86
+ self.logger.info("Building model...")
87
+ start = time.monotonic_ns()
88
+ self.model = self._build_model()
89
+ end = time.monotonic_ns()
90
+ # self.logger.debug(self.model)
91
+ self.logger.info(f"Building model done in {(end - start) / 1e6:.3f}ms")
92
+
93
+ # init with accelerate
94
+ self.logger.info("Initializing accelerate...")
95
+ start = time.monotonic_ns()
96
+ self.accelerator = accelerate.Accelerator()
97
+ self.model = self.accelerator.prepare(self.model)
98
+ end = time.monotonic_ns()
99
+ self.accelerator.wait_for_everyone()
100
+ self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.3f}ms")
101
+
102
+ with self.accelerator.main_process_first():
103
+ self.logger.info("Loading checkpoint...")
104
+ start = time.monotonic_ns()
105
+ # TODO: Also, suppose only use latest one yet
106
+ self.__load_model(os.path.join(args.acoustics_dir, "checkpoint"))
107
+ end = time.monotonic_ns()
108
+ self.logger.info(f"Loading checkpoint done in {(end - start) / 1e6:.3f}ms")
109
+
110
+ self.model.eval()
111
+ self.accelerator.wait_for_everyone()
112
+
113
+ ### Abstract methods ###
114
+ @abstractmethod
115
+ def _build_test_dataset(self):
116
+ pass
117
+
118
+ @abstractmethod
119
+ def _build_model(self):
120
+ pass
121
+
122
+ @abstractmethod
123
+ @torch.inference_mode()
124
+ def _inference_each_batch(self, batch_data):
125
+ pass
126
+
127
+ ### Abstract methods end ###
128
+
129
+ @torch.inference_mode()
130
+ def inference(self):
131
+ for i, batch in enumerate(self.test_dataloader):
132
+ y_pred = self._inference_each_batch(batch).cpu()
133
+
134
+ # Check whether min-max normalization was used
135
+ if self.cfg.preprocess.use_min_max_norm_mel:
136
+ mel_min, mel_max = self.test_dataset.target_mel_extrema
137
+ y_pred = (y_pred + 1.0) / 2.0 * (mel_max - mel_min + EPS) + mel_min
138
+
139
+ y_ls = y_pred.chunk(self.test_batch_size)
140
+ tgt_ls = batch["target_len"].cpu().chunk(self.test_batch_size)
141
+ j = 0
142
+ for it, l in zip(y_ls, tgt_ls):
143
+ l = l.item()
144
+ it = it.squeeze(0)[:l]
145
+ uid = self.test_dataset.metadata[i * self.test_batch_size + j]["Uid"]
146
+ torch.save(it, os.path.join(self.args.output_dir, f"{uid}.pt"))
147
+ j += 1
148
+
149
+ vocoder_cfg, vocoder_ckpt = self._parse_vocoder(self.args.vocoder_dir)
150
+
151
+ res = synthesis(
152
+ cfg=vocoder_cfg,
153
+ vocoder_weight_file=vocoder_ckpt,
154
+ n_samples=None,
155
+ pred=[
156
+ torch.load(
157
+ os.path.join(self.args.output_dir, "{}.pt".format(i["Uid"]))
158
+ ).numpy(force=True)
159
+ for i in self.test_dataset.metadata
160
+ ],
161
+ )
162
+
163
+ output_audio_files = []
164
+ for it, wav in zip(self.test_dataset.metadata, res):
165
+ uid = it["Uid"]
166
+ file = os.path.join(self.args.output_dir, f"{uid}.wav")
167
+ output_audio_files.append(file)
168
+
169
+ wav = wav.numpy(force=True)
170
+ save_audio(
171
+ file,
172
+ wav,
173
+ self.cfg.preprocess.sample_rate,
174
+ add_silence=False,
175
+ turn_up=not is_silence(wav, self.cfg.preprocess.sample_rate),
176
+ )
177
+ os.remove(os.path.join(self.args.output_dir, f"{uid}.pt"))
178
+
179
+ return sorted(output_audio_files)
180
+
181
+ # TODO: LEGACY CODE
182
+ def _build_dataloader(self):
183
+ datasets, collate = self._build_test_dataset()
184
+ self.test_dataset = datasets(self.args, self.cfg, self.infer_type)
185
+ self.test_collate = collate(self.cfg)
186
+ self.test_batch_size = min(
187
+ self.cfg.train.batch_size, len(self.test_dataset.metadata)
188
+ )
189
+ test_dataloader = DataLoader(
190
+ self.test_dataset,
191
+ collate_fn=self.test_collate,
192
+ num_workers=1,
193
+ batch_size=self.test_batch_size,
194
+ shuffle=False,
195
+ )
196
+ return test_dataloader
197
+
198
+ def __load_model(self, checkpoint_dir: str = None, checkpoint_path: str = None):
199
+ r"""Load model from checkpoint. If checkpoint_path is None, it will
200
+ load the latest checkpoint in checkpoint_dir. If checkpoint_path is not
201
+ None, it will load the checkpoint specified by checkpoint_path. **Only use this
202
+ method after** ``accelerator.prepare()``.
203
+ """
204
+ if checkpoint_path is None:
205
+ ls = []
206
+ for i in Path(checkpoint_dir).iterdir():
207
+ if re.match(r"epoch-\d+_step-\d+_loss-[\d.]+", str(i.stem)):
208
+ ls.append(i)
209
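+ # stems look like "epoch-X_step-Y_loss-Z"; sort by epoch (descending) and pick the newest checkpoint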
+ ls.sort(
210
+ key=lambda x: int(x.stem.split("_")[-3].split("-")[-1]), reverse=True
211
+ )
212
+ checkpoint_path = ls[0]
213
+ else:
214
+ checkpoint_path = Path(checkpoint_path)
215
+ self.accelerator.load_state(str(checkpoint_path))
216
+ # set epoch and step
217
+ self.epoch = int(checkpoint_path.stem.split("_")[-3].split("-")[-1])
218
+ self.step = int(checkpoint_path.stem.split("_")[-2].split("-")[-1])
219
+ return str(checkpoint_path)
220
+
221
+ @staticmethod
222
+ def _set_random_seed(seed):
223
+ r"""Set random seed for all possible random modules."""
224
+ random.seed(seed)
225
+ np.random.seed(seed)
226
+ torch.random.manual_seed(seed)
227
+
228
+ @staticmethod
229
+ def _parse_vocoder(vocoder_dir):
230
+ r"""Parse vocoder config"""
231
+ vocoder_dir = os.path.abspath(vocoder_dir)
232
+ ckpt_list = [ckpt for ckpt in Path(vocoder_dir).glob("*.pt")]
233
+ ckpt_list.sort(key=lambda x: int(x.stem), reverse=True)
234
+ ckpt_path = str(ckpt_list[0])
235
+ vocoder_cfg = load_config(
236
+ os.path.join(vocoder_dir, "args.json"), lowercase=True
237
+ )
238
+ return vocoder_cfg, ckpt_path
239
+
240
+ @staticmethod
241
+ def __count_parameters(model):
242
+ return sum(p.numel() for p in model.parameters())
243
+
244
+ def __dump_cfg(self, path):
245
+ os.makedirs(os.path.dirname(path), exist_ok=True)
246
+ json5.dump(
247
+ self.cfg,
248
+ open(path, "w"),
249
+ indent=4,
250
+ sort_keys=True,
251
+ ensure_ascii=False,
252
+ quote_keys=True,
253
+ )
Amphion/models/codec/ns3_codec/alias_free_torch/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
2
+
3
+ from .filter import *
4
+ from .resample import *
5
+ from .act import *
Amphion/models/codec/ns3_codec/alias_free_torch/__pycache__/act.cpython-310.pyc ADDED
Binary file (1.11 kB).
 
Amphion/models/codec/ns3_codec/alias_free_torch/__pycache__/filter.cpython-310.pyc ADDED
Binary file (2.69 kB).
 
Amphion/models/codec/ns3_codec/alias_free_torch/act.py ADDED
@@ -0,0 +1,29 @@
1
+ # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
2
+
3
+ import torch.nn as nn
4
+ from .resample import UpSample1d, DownSample1d
5
+
6
+
7
+ class Activation1d(nn.Module):
8
+ def __init__(
9
+ self,
10
+ activation,
11
+ up_ratio: int = 2,
12
+ down_ratio: int = 2,
13
+ up_kernel_size: int = 12,
14
+ down_kernel_size: int = 12,
15
+ ):
16
+ super().__init__()
17
+ self.up_ratio = up_ratio
18
+ self.down_ratio = down_ratio
19
+ self.act = activation
20
+ self.upsample = UpSample1d(up_ratio, up_kernel_size)
21
+ self.downsample = DownSample1d(down_ratio, down_kernel_size)
22
+
23
+ # x: [B,C,T]
24
+ def forward(self, x):
25
+ x = self.upsample(x)
26
+ x = self.act(x)
27
+ x = self.downsample(x)
28
+
29
+ return x
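
For reference, a minimal usage sketch of the `Activation1d` wrapper defined above. It applies the activation at twice the sample rate and then downsamples back, which suppresses aliasing introduced by the nonlinearity. The choice of `nn.SiLU` below is illustrative only (the codec itself wraps `SnakeBeta`), and it assumes `UpSample1d`/`DownSample1d` from `resample.py` preserve the overall length for matching up/down ratios:

```python
import torch
import torch.nn as nn

# Upsample by 2 -> activation -> downsample by 2.
act = Activation1d(activation=nn.SiLU(), up_ratio=2, down_ratio=2)

x = torch.randn(2, 16, 100)  # [B, C, T]
y = act(x)                   # [2, 16, 100], same temporal length as the input
```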
Amphion/models/codec/ns3_codec/facodec.py ADDED
@@ -0,0 +1,1163 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch
8
+ from einops import rearrange
9
+ from einops.layers.torch import Rearrange
10
+ from torch import nn, pow, sin
11
+ from torch.nn import Parameter
12
+ from torch.nn.utils import weight_norm
13
+
14
+ from .alias_free_torch import *
15
+ from .gradient_reversal import GradientReversal
16
+ from .melspec import MelSpectrogram
17
+ from .quantize import *
18
+ from .transformer import TransformerEncoder
19
+
20
+
21
+ def init_weights(m):
22
+ if isinstance(m, nn.Conv1d):
23
+ nn.init.trunc_normal_(m.weight, std=0.02)
24
+ nn.init.constant_(m.bias, 0)
25
+
26
+
27
+ def WNConv1d(*args, **kwargs):
28
+ return weight_norm(nn.Conv1d(*args, **kwargs))
29
+
30
+
31
+ def WNConvTranspose1d(*args, **kwargs):
32
+ return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
33
+
34
+
35
+ class CNNLSTM(nn.Module):
36
+ def __init__(self, indim, outdim, head, global_pred=False):
37
+ super().__init__()
38
+ self.global_pred = global_pred
39
+ self.model = nn.Sequential(
40
+ ResidualUnit(indim, dilation=1),
41
+ ResidualUnit(indim, dilation=2),
42
+ ResidualUnit(indim, dilation=3),
43
+ Activation1d(activation=SnakeBeta(indim, alpha_logscale=True)),
44
+ Rearrange("b c t -> b t c"),
45
+ )
46
+ self.heads = nn.ModuleList([nn.Linear(indim, outdim) for i in range(head)])
47
+
48
+ def forward(self, x):
49
+ # x: [B, C, T]
50
+ x = self.model(x)
51
+ if self.global_pred:
52
+ x = torch.mean(x, dim=1, keepdim=False)
53
+ outs = [head(x) for head in self.heads]
54
+ return outs
55
+
56
+
57
+ class SnakeBeta(nn.Module):
58
+ """
59
+ A modified Snake function which uses separate parameters for the magnitude of the periodic components
60
+ Shape:
61
+ - Input: (B, C, T)
62
+ - Output: (B, C, T), same shape as the input
63
+ Parameters:
64
+ - alpha - trainable parameter that controls frequency
65
+ - beta - trainable parameter that controls magnitude
66
+ References:
67
+ - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
68
+ https://arxiv.org/abs/2006.08195
69
+ Examples:
70
+ >>> a1 = snakebeta(256)
71
+ >>> x = torch.randn(256)
72
+ >>> x = a1(x)
73
+ """
74
+
75
+ def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
76
+ """
77
+ Initialization.
78
+ INPUT:
79
+ - in_features: shape of the input
80
+ - alpha - trainable parameter that controls frequency
81
+ - beta - trainable parameter that controls magnitude
82
+ alpha is initialized to 1 by default, higher values = higher-frequency.
83
+ beta is initialized to 1 by default, higher values = higher-magnitude.
84
+ alpha will be trained along with the rest of your model.
85
+ """
86
+ super(SnakeBeta, self).__init__()
87
+ self.in_features = in_features
88
+
89
+ # initialize alpha
90
+ self.alpha_logscale = alpha_logscale
91
+ if self.alpha_logscale: # log scale alphas initialized to zeros
92
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
93
+ self.beta = Parameter(torch.zeros(in_features) * alpha)
94
+ else: # linear scale alphas initialized to ones
95
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
96
+ self.beta = Parameter(torch.ones(in_features) * alpha)
97
+
98
+ self.alpha.requires_grad = alpha_trainable
99
+ self.beta.requires_grad = alpha_trainable
100
+
101
+ self.no_div_by_zero = 0.000000001
102
+
103
+ def forward(self, x):
104
+ """
105
+ Forward pass of the function.
106
+ Applies the function to the input elementwise.
107
+ SnakeBeta := x + 1/b * sin^2 (xa)
108
+ """
109
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
110
+ beta = self.beta.unsqueeze(0).unsqueeze(-1)
111
+ if self.alpha_logscale:
112
+ alpha = torch.exp(alpha)
113
+ beta = torch.exp(beta)
114
+ x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
115
+
116
+ return x
117
+
118
+
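
The forward pass above implements the SnakeBeta nonlinearity `x + (1/beta) * sin^2(alpha * x)`. A quick numeric sanity check under the default linear-scale initialization (`alpha = beta = 1`), assuming the class defined above is importable:

```python
import torch

snake = SnakeBeta(in_features=4)  # linear scale: alpha = beta = 1 at init
x = torch.randn(2, 4, 10)         # [B, C, T]
y = snake(x)

# With alpha = beta = 1 this reduces to x + sin(x)^2 (up to the 1e-9 stabilizer on beta).
assert torch.allclose(y, x + torch.sin(x) ** 2, atol=1e-6)
```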
119
+ class ResidualUnit(nn.Module):
120
+ def __init__(self, dim: int = 16, dilation: int = 1):
121
+ super().__init__()
122
+ pad = ((7 - 1) * dilation) // 2
123
+ self.block = nn.Sequential(
124
+ Activation1d(activation=SnakeBeta(dim, alpha_logscale=True)),
125
+ WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
126
+ Activation1d(activation=SnakeBeta(dim, alpha_logscale=True)),
127
+ WNConv1d(dim, dim, kernel_size=1),
128
+ )
129
+
130
+ def forward(self, x):
131
+ return x + self.block(x)
132
+
133
+
134
+ class EncoderBlock(nn.Module):
135
+ def __init__(self, dim: int = 16, stride: int = 1):
136
+ super().__init__()
137
+ self.block = nn.Sequential(
138
+ ResidualUnit(dim // 2, dilation=1),
139
+ ResidualUnit(dim // 2, dilation=3),
140
+ ResidualUnit(dim // 2, dilation=9),
141
+ Activation1d(activation=SnakeBeta(dim // 2, alpha_logscale=True)),
142
+ WNConv1d(
143
+ dim // 2,
144
+ dim,
145
+ kernel_size=2 * stride,
146
+ stride=stride,
147
+ padding=stride // 2 + stride % 2,
148
+ ),
149
+ )
150
+
151
+ def forward(self, x):
152
+ return self.block(x)
153
+
154
+
155
+ class FACodecEncoder(nn.Module):
156
+ def __init__(
157
+ self,
158
+ ngf=32,
159
+ up_ratios=(2, 4, 5, 5),
160
+ out_channels=1024,
161
+ ):
162
+ super().__init__()
163
+ self.hop_length = np.prod(up_ratios)
164
+ self.up_ratios = up_ratios
165
+
166
+ # Create first convolution
167
+ d_model = ngf
168
+ self.block = [WNConv1d(1, d_model, kernel_size=7, padding=3)]
169
+
170
+ # Create EncoderBlocks that double channels as they downsample by `stride`
171
+ for stride in up_ratios:
172
+ d_model *= 2
173
+ self.block += [EncoderBlock(d_model, stride=stride)]
174
+
175
+ # Create last convolution
176
+ self.block += [
177
+ Activation1d(activation=SnakeBeta(d_model, alpha_logscale=True)),
178
+ WNConv1d(d_model, out_channels, kernel_size=3, padding=1),
179
+ ]
180
+
181
+ # Wrap block into nn.Sequential
182
+ self.block = nn.Sequential(*self.block)
183
+ self.enc_dim = d_model
184
+
185
+ self.reset_parameters()
186
+
187
+ def forward(self, x):
188
+ out = self.block(x)
189
+ return out
190
+
191
+ def inference(self, x):
192
+ return self.block(x)
193
+
194
+ def remove_weight_norm(self):
195
+ """Remove weight normalization module from all of the layers."""
196
+
197
+ def _remove_weight_norm(m):
198
+ try:
199
+ torch.nn.utils.remove_weight_norm(m)
200
+ except ValueError: # this module didn't have weight norm
201
+ return
202
+
203
+ self.apply(_remove_weight_norm)
204
+
205
+ def apply_weight_norm(self):
206
+ """Apply weight normalization module from all of the layers."""
207
+
208
+ def _apply_weight_norm(m):
209
+ if isinstance(m, nn.Conv1d):
210
+ torch.nn.utils.weight_norm(m)
211
+
212
+ self.apply(_apply_weight_norm)
213
+
214
+ def reset_parameters(self):
215
+ self.apply(init_weights)
216
+
217
+
218
+ class DecoderBlock(nn.Module):
219
+ def __init__(self, input_dim: int = 16, output_dim: int = 8, stride: int = 1):
220
+ super().__init__()
221
+ self.block = nn.Sequential(
222
+ Activation1d(activation=SnakeBeta(input_dim, alpha_logscale=True)),
223
+ WNConvTranspose1d(
224
+ input_dim,
225
+ output_dim,
226
+ kernel_size=2 * stride,
227
+ stride=stride,
228
+ padding=stride // 2 + stride % 2,
229
+ output_padding=stride % 2,
230
+ ),
231
+ ResidualUnit(output_dim, dilation=1),
232
+ ResidualUnit(output_dim, dilation=3),
233
+ ResidualUnit(output_dim, dilation=9),
234
+ )
235
+
236
+ def forward(self, x):
237
+ return self.block(x)
238
+
239
+
240
+ class FACodecDecoder(nn.Module):
241
+ def __init__(
242
+ self,
243
+ in_channels=256,
244
+ upsample_initial_channel=1536,
245
+ ngf=32,
246
+ up_ratios=(5, 5, 4, 2),
247
+ vq_num_q_c=2,
248
+ vq_num_q_p=1,
249
+ vq_num_q_r=3,
250
+ vq_dim=1024,
251
+ vq_commit_weight=0.005,
252
+ vq_weight_init=False,
253
+ vq_full_commit_loss=False,
254
+ codebook_dim=8,
255
+ codebook_size_prosody=10, # true codebook size is equal to 2^codebook_size
256
+ codebook_size_content=10,
257
+ codebook_size_residual=10,
258
+ quantizer_dropout=0.0,
259
+ dropout_type="linear",
260
+ use_gr_content_f0=False,
261
+ use_gr_prosody_phone=False,
262
+ use_gr_residual_f0=False,
263
+ use_gr_residual_phone=False,
264
+ use_gr_x_timbre=False,
265
+ use_random_mask_residual=True,
266
+ prob_random_mask_residual=0.75,
267
+ ):
268
+ super().__init__()
269
+ self.hop_length = np.prod(up_ratios)
270
+ self.ngf = ngf
271
+ self.up_ratios = up_ratios
272
+
273
+ self.use_random_mask_residual = use_random_mask_residual
274
+ self.prob_random_mask_residual = prob_random_mask_residual
275
+
276
+ self.vq_num_q_p = vq_num_q_p
277
+ self.vq_num_q_c = vq_num_q_c
278
+ self.vq_num_q_r = vq_num_q_r
279
+
280
+ self.codebook_size_prosody = codebook_size_prosody
281
+ self.codebook_size_content = codebook_size_content
282
+ self.codebook_size_residual = codebook_size_residual
283
+
284
+ quantizer_class = ResidualVQ
285
+
286
+ self.quantizer = nn.ModuleList()
287
+
288
+ # prosody
289
+ quantizer = quantizer_class(
290
+ num_quantizers=vq_num_q_p,
291
+ dim=vq_dim,
292
+ codebook_size=codebook_size_prosody,
293
+ codebook_dim=codebook_dim,
294
+ threshold_ema_dead_code=2,
295
+ commitment=vq_commit_weight,
296
+ weight_init=vq_weight_init,
297
+ full_commit_loss=vq_full_commit_loss,
298
+ quantizer_dropout=quantizer_dropout,
299
+ dropout_type=dropout_type,
300
+ )
301
+ self.quantizer.append(quantizer)
302
+
303
+ # phone
304
+ quantizer = quantizer_class(
305
+ num_quantizers=vq_num_q_c,
306
+ dim=vq_dim,
307
+ codebook_size=codebook_size_content,
308
+ codebook_dim=codebook_dim,
309
+ threshold_ema_dead_code=2,
310
+ commitment=vq_commit_weight,
311
+ weight_init=vq_weight_init,
312
+ full_commit_loss=vq_full_commit_loss,
313
+ quantizer_dropout=quantizer_dropout,
314
+ dropout_type=dropout_type,
315
+ )
316
+ self.quantizer.append(quantizer)
317
+
318
+ # residual
319
+ if self.vq_num_q_r > 0:
320
+ quantizer = quantizer_class(
321
+ num_quantizers=vq_num_q_r,
322
+ dim=vq_dim,
323
+ codebook_size=codebook_size_residual,
324
+ codebook_dim=codebook_dim,
325
+ threshold_ema_dead_code=2,
326
+ commitment=vq_commit_weight,
327
+ weight_init=vq_weight_init,
328
+ full_commit_loss=vq_full_commit_loss,
329
+ quantizer_dropout=quantizer_dropout,
330
+ dropout_type=dropout_type,
331
+ )
332
+ self.quantizer.append(quantizer)
333
+
334
+ # Add first conv layer
335
+ channels = upsample_initial_channel
336
+ layers = [WNConv1d(in_channels, channels, kernel_size=7, padding=3)]
337
+
338
+ # Add upsampling + MRF blocks
339
+ for i, stride in enumerate(up_ratios):
340
+ input_dim = channels // 2**i
341
+ output_dim = channels // 2 ** (i + 1)
342
+ layers += [DecoderBlock(input_dim, output_dim, stride)]
343
+
344
+ # Add final conv layer
345
+ layers += [
346
+ Activation1d(activation=SnakeBeta(output_dim, alpha_logscale=True)),
347
+ WNConv1d(output_dim, 1, kernel_size=7, padding=3),
348
+ nn.Tanh(),
349
+ ]
350
+
351
+ self.model = nn.Sequential(*layers)
352
+
353
+ self.timbre_encoder = TransformerEncoder(
354
+ enc_emb_tokens=None,
355
+ encoder_layer=4,
356
+ encoder_hidden=256,
357
+ encoder_head=4,
358
+ conv_filter_size=1024,
359
+ conv_kernel_size=5,
360
+ encoder_dropout=0.1,
361
+ use_cln=False,
362
+ )
363
+
364
+ self.timbre_linear = nn.Linear(in_channels, in_channels * 2)
365
+ self.timbre_linear.bias.data[:in_channels] = 1
366
+ self.timbre_linear.bias.data[in_channels:] = 0
367
+ self.timbre_norm = nn.LayerNorm(in_channels, elementwise_affine=False)
368
+
369
+ self.f0_predictor = CNNLSTM(in_channels, 1, 2)
370
+ self.phone_predictor = CNNLSTM(in_channels, 5003, 1)
371
+
372
+ self.use_gr_content_f0 = use_gr_content_f0
373
+ self.use_gr_prosody_phone = use_gr_prosody_phone
374
+ self.use_gr_residual_f0 = use_gr_residual_f0
375
+ self.use_gr_residual_phone = use_gr_residual_phone
376
+ self.use_gr_x_timbre = use_gr_x_timbre
377
+
378
+ if self.vq_num_q_r > 0 and self.use_gr_residual_f0:
379
+ self.res_f0_predictor = nn.Sequential(GradientReversal(alpha=1.0), CNNLSTM(in_channels, 1, 2))
380
+
381
+ if self.vq_num_q_r > 0 and self.use_gr_residual_phone > 0:
382
+ self.res_phone_predictor = nn.Sequential(GradientReversal(alpha=1.0), CNNLSTM(in_channels, 5003, 1))
383
+
384
+ if self.use_gr_content_f0:
385
+ self.content_f0_predictor = nn.Sequential(GradientReversal(alpha=1.0), CNNLSTM(in_channels, 1, 2))
386
+
387
+ if self.use_gr_prosody_phone:
388
+ self.prosody_phone_predictor = nn.Sequential(GradientReversal(alpha=1.0), CNNLSTM(in_channels, 5003, 1))
389
+
390
+ if self.use_gr_x_timbre:
391
+ self.x_timbre_predictor = nn.Sequential(
392
+ GradientReversal(alpha=1),
393
+ CNNLSTM(in_channels, 245200, 1, global_pred=True),
394
+ )
395
+
396
+ self.reset_parameters()
397
+
398
+ def quantize(self, x, n_quantizers=None):
399
+ outs, qs, commit_loss, quantized_buf = 0, [], [], []
400
+
401
+ # prosody
402
+ f0_input = x # (B, d, T)
403
+ f0_quantizer = self.quantizer[0]
404
+ out, q, commit, quantized = f0_quantizer(f0_input, n_quantizers=n_quantizers)
405
+ outs += out
406
+ qs.append(q)
407
+ quantized_buf.append(quantized.sum(0))
408
+ commit_loss.append(commit)
409
+
410
+ # phone
411
+ phone_input = x
412
+ phone_quantizer = self.quantizer[1]
413
+ out, q, commit, quantized = phone_quantizer(phone_input, n_quantizers=n_quantizers)
414
+ outs += out
415
+ qs.append(q)
416
+ quantized_buf.append(quantized.sum(0))
417
+ commit_loss.append(commit)
418
+
419
+ # residual
420
+ if self.vq_num_q_r > 0:
421
+ residual_quantizer = self.quantizer[2]
422
+ residual_input = x - (quantized_buf[0] + quantized_buf[1]).detach()
423
+ out, q, commit, quantized = residual_quantizer(residual_input, n_quantizers=n_quantizers)
424
+ outs += out
425
+ qs.append(q)
426
+ quantized_buf.append(quantized.sum(0)) # [L, B, C, T] -> [B, C, T]
427
+ commit_loss.append(commit)
428
+
429
+ qs = torch.cat(qs, dim=0)
430
+ commit_loss = torch.cat(commit_loss, dim=0)
431
+ return outs, qs, commit_loss, quantized_buf
432
+
433
+ def forward(
434
+ self,
435
+ x,
436
+ vq=True,
437
+ get_vq=False,
438
+ eval_vq=True,
439
+ speaker_embedding=None,
440
+ n_quantizers=None,
441
+ quantized=None,
442
+ ):
443
+ if get_vq:
444
+ return self.quantizer.get_emb()
445
+ if vq is True:
446
+ if eval_vq:
447
+ self.quantizer.eval()
448
+ x_timbre = x
449
+ outs, qs, commit_loss, quantized_buf = self.quantize(x, n_quantizers=n_quantizers)
450
+
451
+ x_timbre = x_timbre.transpose(1, 2)
452
+ x_timbre = self.timbre_encoder(x_timbre, None, None)
453
+ x_timbre = x_timbre.transpose(1, 2)
454
+ spk_embs = torch.mean(x_timbre, dim=2)
455
+ return outs, qs, commit_loss, quantized_buf, spk_embs
456
+
457
+ out = {}
458
+
459
+ layer_0 = quantized[0]
460
+ f0, uv = self.f0_predictor(layer_0)
461
+ f0 = rearrange(f0, "... 1 -> ...")
462
+ uv = rearrange(uv, "... 1 -> ...")
463
+
464
+ layer_1 = quantized[1]
465
+ (phone,) = self.phone_predictor(layer_1)
466
+
467
+ out = {"f0": f0, "uv": uv, "phone": phone}
468
+
469
+ if self.use_gr_prosody_phone:
470
+ (prosody_phone,) = self.prosody_phone_predictor(layer_0)
471
+ out["prosody_phone"] = prosody_phone
472
+
473
+ if self.use_gr_content_f0:
474
+ content_f0, content_uv = self.content_f0_predictor(layer_1)
475
+ content_f0 = rearrange(content_f0, "... 1 -> ...")
476
+ content_uv = rearrange(content_uv, "... 1 -> ...")
477
+ out["content_f0"] = content_f0
478
+ out["content_uv"] = content_uv
479
+
480
+ if self.vq_num_q_r > 0:
481
+ layer_2 = quantized[2]
482
+
483
+ if self.use_gr_residual_f0:
484
+ res_f0, res_uv = self.res_f0_predictor(layer_2)
485
+ res_f0 = rearrange(res_f0, "... 1 -> ...")
486
+ res_uv = rearrange(res_uv, "... 1 -> ...")
487
+ out["res_f0"] = res_f0
488
+ out["res_uv"] = res_uv
489
+
490
+ if self.use_gr_residual_phone:
491
+ (res_phone,) = self.res_phone_predictor(layer_2)
492
+ out["res_phone"] = res_phone
493
+
494
+ style = self.timbre_linear(speaker_embedding).unsqueeze(2) # (B, 2d, 1)
495
+ gamma, beta = style.chunk(2, 1) # (B, d, 1)
496
+ if self.vq_num_q_r > 0:
497
+ if self.use_random_mask_residual:
498
+ bsz = quantized[2].shape[0]
499
+ res_mask = np.random.choice(
500
+ [0, 1],
501
+ size=bsz,
502
+ p=[
503
+ self.prob_random_mask_residual,
504
+ 1 - self.prob_random_mask_residual,
505
+ ],
506
+ )
507
+ res_mask = torch.from_numpy(res_mask).unsqueeze(1).unsqueeze(1) # (B, 1, 1)
508
+ res_mask = res_mask.to(device=quantized[2].device, dtype=quantized[2].dtype)
509
+ x = quantized[0].detach() + quantized[1].detach() + quantized[2] * res_mask
510
+ # x = quantized_perturbe[0].detach() + quantized[1].detach() + quantized[2] * res_mask
511
+ else:
512
+ x = quantized[0].detach() + quantized[1].detach() + quantized[2]
513
+ # x = quantized_perturbe[0].detach() + quantized[1].detach() + quantized[2]
514
+ else:
515
+ x = quantized[0].detach() + quantized[1].detach()
516
+ # x = quantized_perturbe[0].detach() + quantized[1].detach()
517
+
518
+ if self.use_gr_x_timbre:
519
+ (x_timbre,) = self.x_timbre_predictor(x)
520
+ out["x_timbre"] = x_timbre
521
+
522
+ x = x.transpose(1, 2)
523
+ x = self.timbre_norm(x)
524
+ x = x.transpose(1, 2)
525
+ x = x * gamma + beta
526
+
527
+ x = self.model(x)
528
+ out["audio"] = x
529
+
530
+ return out
531
+
532
+ def vq2emb(self, vq, use_residual_code=True):
533
+ # vq: [num_quantizer, B, T]
534
+ self.quantizer = self.quantizer.eval()
535
+ out = 0
536
+ out += self.quantizer[0].vq2emb(vq[0 : self.vq_num_q_p])
537
+ out += self.quantizer[1].vq2emb(vq[self.vq_num_q_p : self.vq_num_q_p + self.vq_num_q_c])
538
+ if self.vq_num_q_r > 0 and use_residual_code:
539
+ out += self.quantizer[2].vq2emb(vq[self.vq_num_q_p + self.vq_num_q_c :])
540
+ return out
541
+
542
+ def inference(self, x, speaker_embedding):
543
+ style = self.timbre_linear(speaker_embedding).unsqueeze(2) # (B, 2d, 1)
544
+ gamma, beta = style.chunk(2, 1) # (B, d, 1)
545
+ x = x.transpose(1, 2)
546
+ x = self.timbre_norm(x)
547
+ x = x.transpose(1, 2)
548
+ x = x * gamma + beta
549
+ x = self.model(x)
550
+ return x
551
+
552
+ def remove_weight_norm(self):
553
+ """Remove weight normalization module from all of the layers."""
554
+
555
+ def _remove_weight_norm(m):
556
+ try:
557
+ torch.nn.utils.remove_weight_norm(m)
558
+ except ValueError: # this module didn't have weight norm
559
+ return
560
+
561
+ self.apply(_remove_weight_norm)
562
+
563
+ def apply_weight_norm(self):
564
+ """Apply weight normalization module from all of the layers."""
565
+
566
+ def _apply_weight_norm(m):
567
+ if isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d):
568
+ torch.nn.utils.weight_norm(m)
569
+
570
+ self.apply(_apply_weight_norm)
571
+
572
+ def reset_parameters(self):
573
+ self.apply(init_weights)
574
+
575
+
576
+ class FACodecRedecoder(nn.Module):
577
+ def __init__(
578
+ self,
579
+ in_channels=256,
580
+ upsample_initial_channel=1280,
581
+ up_ratios=(5, 5, 4, 2),
582
+ vq_num_q_c=2,
583
+ vq_num_q_p=1,
584
+ vq_num_q_r=3,
585
+ vq_dim=256,
586
+ codebook_size_prosody=10,
587
+ codebook_size_content=10,
588
+ codebook_size_residual=10,
589
+ ):
590
+ super().__init__()
591
+ self.hop_length = np.prod(up_ratios)
592
+ self.up_ratios = up_ratios
593
+
594
+ self.vq_num_q_p = vq_num_q_p
595
+ self.vq_num_q_c = vq_num_q_c
596
+ self.vq_num_q_r = vq_num_q_r
597
+
598
+ self.vq_dim = vq_dim
599
+
600
+ self.codebook_size_prosody = codebook_size_prosody
601
+ self.codebook_size_content = codebook_size_content
602
+ self.codebook_size_residual = codebook_size_residual
603
+
604
+ self.prosody_embs = nn.ModuleList()
605
+ for i in range(self.vq_num_q_p):
606
+ emb_tokens = nn.Embedding(
607
+ num_embeddings=2**self.codebook_size_prosody,
608
+ embedding_dim=self.vq_dim,
609
+ )
610
+ emb_tokens.weight.data.normal_(mean=0.0, std=1e-5)
611
+ self.prosody_embs.append(emb_tokens)
612
+ self.content_embs = nn.ModuleList()
613
+ for i in range(self.vq_num_q_c):
614
+ emb_tokens = nn.Embedding(
615
+ num_embeddings=2**self.codebook_size_content,
616
+ embedding_dim=self.vq_dim,
617
+ )
618
+ emb_tokens.weight.data.normal_(mean=0.0, std=1e-5)
619
+ self.content_embs.append(emb_tokens)
620
+ self.residual_embs = nn.ModuleList()
621
+ for i in range(self.vq_num_q_r):
622
+ emb_tokens = nn.Embedding(
623
+ num_embeddings=2**self.codebook_size_residual,
624
+ embedding_dim=self.vq_dim,
625
+ )
626
+ emb_tokens.weight.data.normal_(mean=0.0, std=1e-5)
627
+ self.residual_embs.append(emb_tokens)
628
+
629
+ # Add first conv layer
630
+ channels = upsample_initial_channel
631
+ layers = [WNConv1d(in_channels, channels, kernel_size=7, padding=3)]
632
+
633
+ # Add upsampling + MRF blocks
634
+ for i, stride in enumerate(up_ratios):
635
+ input_dim = channels // 2**i
636
+ output_dim = channels // 2 ** (i + 1)
637
+ layers += [DecoderBlock(input_dim, output_dim, stride)]
638
+
639
+ # Add final conv layer
640
+ layers += [
641
+ Activation1d(activation=SnakeBeta(output_dim, alpha_logscale=True)),
642
+ WNConv1d(output_dim, 1, kernel_size=7, padding=3),
643
+ nn.Tanh(),
644
+ ]
645
+
646
+ self.model = nn.Sequential(*layers)
647
+
648
+ self.timbre_linear = nn.Linear(in_channels, in_channels * 2)
649
+ self.timbre_linear.bias.data[:in_channels] = 1
650
+ self.timbre_linear.bias.data[in_channels:] = 0
651
+ self.timbre_norm = nn.LayerNorm(in_channels, elementwise_affine=False)
652
+
653
+ self.timbre_cond_prosody_enc = TransformerEncoder(
654
+ enc_emb_tokens=None,
655
+ encoder_layer=4,
656
+ encoder_hidden=256,
657
+ encoder_head=4,
658
+ conv_filter_size=1024,
659
+ conv_kernel_size=5,
660
+ encoder_dropout=0.1,
661
+ use_cln=True,
662
+ cfg=None,
663
+ )
664
+
665
+ def forward(
666
+ self,
667
+ vq,
668
+ speaker_embedding,
669
+ use_residual_code=False,
670
+ ):
671
+ x = 0
672
+
673
+ x_p = 0
674
+ for i in range(self.vq_num_q_p):
675
+ x_p = x_p + self.prosody_embs[i](vq[i]) # (B, T, d)
676
+ spk_cond = speaker_embedding.unsqueeze(1).expand(-1, x_p.shape[1], -1)
677
+ x_p = self.timbre_cond_prosody_enc(x_p, key_padding_mask=None, condition=spk_cond)
678
+ x = x + x_p
679
+
680
+ x_c = 0
681
+ for i in range(self.vq_num_q_c):
682
+ x_c = x_c + self.content_embs[i](vq[self.vq_num_q_p + i])
683
+
684
+ x = x + x_c
685
+
686
+ if use_residual_code:
687
+ x_r = 0
688
+ for i in range(self.vq_num_q_r):
689
+ x_r = x_r + self.residual_embs[i](vq[self.vq_num_q_p + self.vq_num_q_c + i])
690
+ x = x + x_r
691
+
692
+ style = self.timbre_linear(speaker_embedding).unsqueeze(2) # (B, 2d, 1)
693
+ gamma, beta = style.chunk(2, 1) # (B, d, 1)
694
+ x = x.transpose(1, 2)
695
+ x = self.timbre_norm(x)
696
+ x = x.transpose(1, 2)
697
+ x = x * gamma + beta
698
+ x = self.model(x)
699
+
700
+ return x
701
+
702
+ def vq2emb(self, vq, speaker_embedding, use_residual=True):
703
+ out = 0
704
+
705
+ x_t = 0
706
+ for i in range(self.vq_num_q_p):
707
+ x_t += self.prosody_embs[i](vq[i]) # (B, T, d)
708
+ spk_cond = speaker_embedding.unsqueeze(1).expand(-1, x_t.shape[1], -1)
709
+ x_t = self.timbre_cond_prosody_enc(x_t, key_padding_mask=None, condition=spk_cond)
710
+
711
+ # prosody
712
+ out += x_t
713
+
714
+ # content
715
+ for i in range(self.vq_num_q_c):
716
+ out += self.content_embs[i](vq[self.vq_num_q_p + i])
717
+
718
+ # residual
719
+ if use_residual:
720
+ for i in range(self.vq_num_q_r):
721
+ out += self.residual_embs[i](vq[self.vq_num_q_p + self.vq_num_q_c + i])
722
+
723
+ out = out.transpose(1, 2) # (B, T, d) -> (B, d, T)
724
+ return out
725
+
726
+ def inference(self, x, speaker_embedding):
727
+ style = self.timbre_linear(speaker_embedding).unsqueeze(2) # (B, 2d, 1)
728
+ gamma, beta = style.chunk(2, 1) # (B, d, 1)
729
+ x = x.transpose(1, 2)
730
+ x = self.timbre_norm(x)
731
+ x = x.transpose(1, 2)
732
+ x = x * gamma + beta
733
+ x = self.model(x)
734
+ return x
735
+
736
+
737
+ class FACodecEncoderV2(nn.Module):
738
+ def __init__(
739
+ self,
740
+ ngf=32,
741
+ up_ratios=(2, 4, 5, 5),
742
+ out_channels=1024,
743
+ ):
744
+ super().__init__()
745
+ self.hop_length = np.prod(up_ratios)
746
+ self.up_ratios = up_ratios
747
+
748
+ # Create first convolution
749
+ d_model = ngf
750
+ self.block = [WNConv1d(1, d_model, kernel_size=7, padding=3)]
751
+
752
+ # Create EncoderBlocks that double channels as they downsample by `stride`
753
+ for stride in up_ratios:
754
+ d_model *= 2
755
+ self.block += [EncoderBlock(d_model, stride=stride)]
756
+
757
+ # Create last convolution
758
+ self.block += [
759
+ Activation1d(activation=SnakeBeta(d_model, alpha_logscale=True)),
760
+ WNConv1d(d_model, out_channels, kernel_size=3, padding=1),
761
+ ]
762
+
763
+ # Wrap block into nn.Sequential
764
+ self.block = nn.Sequential(*self.block)
765
+ self.enc_dim = d_model
766
+
767
+ self.mel_transform = MelSpectrogram(
768
+ n_fft=1024,
769
+ num_mels=80,
770
+ sampling_rate=16000,
771
+ hop_size=200,
772
+ win_size=800,
773
+ fmin=0,
774
+ fmax=8000,
775
+ )
776
+
777
+ self.reset_parameters()
778
+
779
+ def forward(self, x):
780
+ out = self.block(x)
781
+ return out
782
+
783
+ def inference(self, x):
784
+ return self.block(x)
785
+
786
+ def get_prosody_feature(self, x):
787
+ return self.mel_transform(x.squeeze(1))[:, :20, :]
788
+
789
+ def remove_weight_norm(self):
790
+ """Remove weight normalization module from all of the layers."""
791
+
792
+ def _remove_weight_norm(m):
793
+ try:
794
+ torch.nn.utils.remove_weight_norm(m)
795
+ except ValueError: # this module didn't have weight norm
796
+ return
797
+
798
+ self.apply(_remove_weight_norm)
799
+
800
+ def apply_weight_norm(self):
801
+ """Apply weight normalization module from all of the layers."""
802
+
803
+ def _apply_weight_norm(m):
804
+ if isinstance(m, nn.Conv1d):
805
+ torch.nn.utils.weight_norm(m)
806
+
807
+ self.apply(_apply_weight_norm)
808
+
809
+ def reset_parameters(self):
810
+ self.apply(init_weights)
811
+
812
+
813
+ class FACodecDecoderV2(nn.Module):
814
+ def __init__(
815
+ self,
816
+ in_channels=256,
817
+ upsample_initial_channel=1536,
818
+ ngf=32,
819
+ up_ratios=(5, 5, 4, 2),
820
+ vq_num_q_c=2,
821
+ vq_num_q_p=1,
822
+ vq_num_q_r=3,
823
+ vq_dim=1024,
824
+ vq_commit_weight=0.005,
825
+ vq_weight_init=False,
826
+ vq_full_commit_loss=False,
827
+ codebook_dim=8,
828
+ codebook_size_prosody=10, # true codebook size is equal to 2^codebook_size
829
+ codebook_size_content=10,
830
+ codebook_size_residual=10,
831
+ quantizer_dropout=0.0,
832
+ dropout_type="linear",
833
+ use_gr_content_f0=False,
834
+ use_gr_prosody_phone=False,
835
+ use_gr_residual_f0=False,
836
+ use_gr_residual_phone=False,
837
+ use_gr_x_timbre=False,
838
+ use_random_mask_residual=True,
839
+ prob_random_mask_residual=0.75,
840
+ ):
841
+ super().__init__()
842
+ self.hop_length = np.prod(up_ratios)
843
+ self.ngf = ngf
844
+ self.up_ratios = up_ratios
845
+
846
+ self.use_random_mask_residual = use_random_mask_residual
847
+ self.prob_random_mask_residual = prob_random_mask_residual
848
+
849
+ self.vq_num_q_p = vq_num_q_p
850
+ self.vq_num_q_c = vq_num_q_c
851
+ self.vq_num_q_r = vq_num_q_r
852
+
853
+ self.codebook_size_prosody = codebook_size_prosody
854
+ self.codebook_size_content = codebook_size_content
855
+ self.codebook_size_residual = codebook_size_residual
856
+
857
+ quantizer_class = ResidualVQ
858
+
859
+ self.quantizer = nn.ModuleList()
860
+
861
+ # prosody
862
+ quantizer = quantizer_class(
863
+ num_quantizers=vq_num_q_p,
864
+ dim=vq_dim,
865
+ codebook_size=codebook_size_prosody,
866
+ codebook_dim=codebook_dim,
867
+ threshold_ema_dead_code=2,
868
+ commitment=vq_commit_weight,
869
+ weight_init=vq_weight_init,
870
+ full_commit_loss=vq_full_commit_loss,
871
+ quantizer_dropout=quantizer_dropout,
872
+ dropout_type=dropout_type,
873
+ )
874
+ self.quantizer.append(quantizer)
875
+
876
+ # phone
877
+ quantizer = quantizer_class(
878
+ num_quantizers=vq_num_q_c,
879
+ dim=vq_dim,
880
+ codebook_size=codebook_size_content,
881
+ codebook_dim=codebook_dim,
882
+ threshold_ema_dead_code=2,
883
+ commitment=vq_commit_weight,
884
+ weight_init=vq_weight_init,
885
+ full_commit_loss=vq_full_commit_loss,
886
+ quantizer_dropout=quantizer_dropout,
887
+ dropout_type=dropout_type,
888
+ )
889
+ self.quantizer.append(quantizer)
890
+
891
+ # residual
892
+ if self.vq_num_q_r > 0:
893
+ quantizer = quantizer_class(
894
+ num_quantizers=vq_num_q_r,
895
+ dim=vq_dim,
896
+ codebook_size=codebook_size_residual,
897
+ codebook_dim=codebook_dim,
898
+ threshold_ema_dead_code=2,
899
+ commitment=vq_commit_weight,
900
+ weight_init=vq_weight_init,
901
+ full_commit_loss=vq_full_commit_loss,
902
+ quantizer_dropout=quantizer_dropout,
903
+ dropout_type=dropout_type,
904
+ )
905
+ self.quantizer.append(quantizer)
906
+
907
+ # Add first conv layer
908
+ channels = upsample_initial_channel
909
+ layers = [WNConv1d(in_channels, channels, kernel_size=7, padding=3)]
910
+
911
+ # Add upsampling + MRF blocks
912
+ for i, stride in enumerate(up_ratios):
913
+ input_dim = channels // 2**i
914
+ output_dim = channels // 2 ** (i + 1)
915
+ layers += [DecoderBlock(input_dim, output_dim, stride)]
916
+
917
+ # Add final conv layer
918
+ layers += [
919
+ Activation1d(activation=SnakeBeta(output_dim, alpha_logscale=True)),
920
+ WNConv1d(output_dim, 1, kernel_size=7, padding=3),
921
+ nn.Tanh(),
922
+ ]
923
+
924
+ self.model = nn.Sequential(*layers)
925
+
926
+ self.timbre_encoder = TransformerEncoder(
927
+ enc_emb_tokens=None,
928
+ encoder_layer=4,
929
+ encoder_hidden=256,
930
+ encoder_head=4,
931
+ conv_filter_size=1024,
932
+ conv_kernel_size=5,
933
+ encoder_dropout=0.1,
934
+ use_cln=False,
935
+ )
936
+
937
+ self.timbre_linear = nn.Linear(in_channels, in_channels * 2)
938
+ self.timbre_linear.bias.data[:in_channels] = 1
939
+ self.timbre_linear.bias.data[in_channels:] = 0
940
+ self.timbre_norm = nn.LayerNorm(in_channels, elementwise_affine=False)
941
+
942
+ self.f0_predictor = CNNLSTM(in_channels, 1, 2)
943
+ self.phone_predictor = CNNLSTM(in_channels, 5003, 1)
944
+
945
+ self.use_gr_content_f0 = use_gr_content_f0
946
+ self.use_gr_prosody_phone = use_gr_prosody_phone
947
+ self.use_gr_residual_f0 = use_gr_residual_f0
948
+ self.use_gr_residual_phone = use_gr_residual_phone
949
+ self.use_gr_x_timbre = use_gr_x_timbre
950
+
951
+ if self.vq_num_q_r > 0 and self.use_gr_residual_f0:
952
+ self.res_f0_predictor = nn.Sequential(GradientReversal(alpha=1.0), CNNLSTM(in_channels, 1, 2))
953
+
954
+ if self.vq_num_q_r > 0 and self.use_gr_residual_phone > 0:
955
+ self.res_phone_predictor = nn.Sequential(GradientReversal(alpha=1.0), CNNLSTM(in_channels, 5003, 1))
956
+
957
+ if self.use_gr_content_f0:
958
+ self.content_f0_predictor = nn.Sequential(GradientReversal(alpha=1.0), CNNLSTM(in_channels, 1, 2))
959
+
960
+ if self.use_gr_prosody_phone:
961
+ self.prosody_phone_predictor = nn.Sequential(GradientReversal(alpha=1.0), CNNLSTM(in_channels, 5003, 1))
962
+
963
+ if self.use_gr_x_timbre:
964
+ self.x_timbre_predictor = nn.Sequential(
965
+ GradientReversal(alpha=1),
966
+ CNNLSTM(in_channels, 245200, 1, global_pred=True),
967
+ )
968
+
969
+ self.melspec_linear = nn.Linear(20, 256)
970
+ self.melspec_encoder = TransformerEncoder(
971
+ enc_emb_tokens=None,
972
+ encoder_layer=4,
973
+ encoder_hidden=256,
974
+ encoder_head=4,
975
+ conv_filter_size=1024,
976
+ conv_kernel_size=5,
977
+ encoder_dropout=0.1,
978
+ use_cln=False,
979
+ cfg=None,
980
+ )
981
+
982
+ self.reset_parameters()
983
+
984
+ def quantize(self, x, prosody_feature, n_quantizers=None):
985
+ outs, qs, commit_loss, quantized_buf = 0, [], [], []
986
+
987
+ # prosody
988
+ f0_input = prosody_feature.transpose(1, 2) # (B, T, 20)
989
+ f0_input = self.melspec_linear(f0_input)
990
+ f0_input = self.melspec_encoder(f0_input, None, None)
991
+ f0_input = f0_input.transpose(1, 2)
992
+ f0_quantizer = self.quantizer[0]
993
+ out, q, commit, quantized = f0_quantizer(f0_input, n_quantizers=n_quantizers)
994
+ outs += out
995
+ qs.append(q)
996
+ quantized_buf.append(quantized.sum(0))
997
+ commit_loss.append(commit)
998
+
999
+ # phone
1000
+ phone_input = x
1001
+ phone_quantizer = self.quantizer[1]
1002
+ out, q, commit, quantized = phone_quantizer(phone_input, n_quantizers=n_quantizers)
1003
+ outs += out
1004
+ qs.append(q)
1005
+ quantized_buf.append(quantized.sum(0))
1006
+ commit_loss.append(commit)
1007
+
1008
+ # residual
1009
+ if self.vq_num_q_r > 0:
1010
+ residual_quantizer = self.quantizer[2]
1011
+ residual_input = x - (quantized_buf[0] + quantized_buf[1]).detach()
1012
+ out, q, commit, quantized = residual_quantizer(residual_input, n_quantizers=n_quantizers)
1013
+ outs += out
1014
+ qs.append(q)
1015
+ quantized_buf.append(quantized.sum(0)) # [L, B, C, T] -> [B, C, T]
1016
+ commit_loss.append(commit)
1017
+
1018
+ qs = torch.cat(qs, dim=0)
1019
+ commit_loss = torch.cat(commit_loss, dim=0)
1020
+ return outs, qs, commit_loss, quantized_buf
1021
+
1022
+ def forward(
1023
+ self,
1024
+ x,
1025
+ prosody_feature,
1026
+ vq=True,
1027
+ get_vq=False,
1028
+ eval_vq=True,
1029
+ speaker_embedding=None,
1030
+ n_quantizers=None,
1031
+ quantized=None,
1032
+ ):
1033
+ if get_vq:
1034
+ return self.quantizer.get_emb()
1035
+ if vq is True:
1036
+ if eval_vq:
1037
+ self.quantizer.eval()
1038
+ x_timbre = x
1039
+ outs, qs, commit_loss, quantized_buf = self.quantize(x, prosody_feature, n_quantizers=n_quantizers)
1040
+
1041
+ x_timbre = x_timbre.transpose(1, 2)
1042
+ x_timbre = self.timbre_encoder(x_timbre, None, None)
1043
+ x_timbre = x_timbre.transpose(1, 2)
1044
+ spk_embs = torch.mean(x_timbre, dim=2)
1045
+ return outs, qs, commit_loss, quantized_buf, spk_embs
1046
+
1047
+ out = {}
1048
+
1049
+ layer_0 = quantized[0]
1050
+ f0, uv = self.f0_predictor(layer_0)
1051
+ f0 = rearrange(f0, "... 1 -> ...")
1052
+ uv = rearrange(uv, "... 1 -> ...")
1053
+
1054
+ layer_1 = quantized[1]
1055
+ (phone,) = self.phone_predictor(layer_1)
1056
+
1057
+ out = {"f0": f0, "uv": uv, "phone": phone}
1058
+
1059
+ if self.use_gr_prosody_phone:
1060
+ (prosody_phone,) = self.prosody_phone_predictor(layer_0)
1061
+ out["prosody_phone"] = prosody_phone
1062
+
1063
+ if self.use_gr_content_f0:
1064
+ content_f0, content_uv = self.content_f0_predictor(layer_1)
1065
+ content_f0 = rearrange(content_f0, "... 1 -> ...")
1066
+ content_uv = rearrange(content_uv, "... 1 -> ...")
1067
+ out["content_f0"] = content_f0
1068
+ out["content_uv"] = content_uv
1069
+
1070
+ if self.vq_num_q_r > 0:
1071
+ layer_2 = quantized[2]
1072
+
1073
+ if self.use_gr_residual_f0:
1074
+ res_f0, res_uv = self.res_f0_predictor(layer_2)
1075
+ res_f0 = rearrange(res_f0, "... 1 -> ...")
1076
+ res_uv = rearrange(res_uv, "... 1 -> ...")
1077
+ out["res_f0"] = res_f0
1078
+ out["res_uv"] = res_uv
1079
+
1080
+ if self.use_gr_residual_phone:
1081
+ (res_phone,) = self.res_phone_predictor(layer_2)
1082
+ out["res_phone"] = res_phone
1083
+
1084
+ style = self.timbre_linear(speaker_embedding).unsqueeze(2) # (B, 2d, 1)
1085
+ gamma, beta = style.chunk(2, 1) # (B, d, 1)
1086
+ if self.vq_num_q_r > 0:
1087
+ if self.use_random_mask_residual:
1088
+ bsz = quantized[2].shape[0]
1089
+ res_mask = np.random.choice(
1090
+ [0, 1],
1091
+ size=bsz,
1092
+ p=[
1093
+ self.prob_random_mask_residual,
1094
+ 1 - self.prob_random_mask_residual,
1095
+ ],
1096
+ )
1097
+ res_mask = torch.from_numpy(res_mask).unsqueeze(1).unsqueeze(1) # (B, 1, 1)
1098
+ res_mask = res_mask.to(device=quantized[2].device, dtype=quantized[2].dtype)
1099
+ x = quantized[0].detach() + quantized[1].detach() + quantized[2] * res_mask
1100
+ # x = quantized_perturbe[0].detach() + quantized[1].detach() + quantized[2] * res_mask
1101
+ else:
1102
+ x = quantized[0].detach() + quantized[1].detach() + quantized[2]
1103
+ # x = quantized_perturbe[0].detach() + quantized[1].detach() + quantized[2]
1104
+ else:
1105
+ x = quantized[0].detach() + quantized[1].detach()
1106
+ # x = quantized_perturbe[0].detach() + quantized[1].detach()
1107
+
1108
+ if self.use_gr_x_timbre:
1109
+ (x_timbre,) = self.x_timbre_predictor(x)
1110
+ out["x_timbre"] = x_timbre
1111
+
1112
+ x = x.transpose(1, 2)
1113
+ x = self.timbre_norm(x)
1114
+ x = x.transpose(1, 2)
1115
+ x = x * gamma + beta
1116
+
1117
+ x = self.model(x)
1118
+ out["audio"] = x
1119
+
1120
+ return out
1121
+
1122
+ def vq2emb(self, vq, use_residual=True):
1123
+ # vq: [num_quantizer, B, T]
1124
+ self.quantizer = self.quantizer.eval()
1125
+ out = 0
1126
+ out += self.quantizer[0].vq2emb(vq[0 : self.vq_num_q_p])
1127
+ out += self.quantizer[1].vq2emb(vq[self.vq_num_q_p : self.vq_num_q_p + self.vq_num_q_c])
1128
+ if self.vq_num_q_r > 0 and use_residual:
1129
+ out += self.quantizer[2].vq2emb(vq[self.vq_num_q_p + self.vq_num_q_c :])
1130
+ return out
1131
+
1132
+ def inference(self, x, speaker_embedding):
1133
+ style = self.timbre_linear(speaker_embedding).unsqueeze(2) # (B, 2d, 1)
1134
+ gamma, beta = style.chunk(2, 1) # (B, d, 1)
1135
+ x = x.transpose(1, 2)
1136
+ x = self.timbre_norm(x)
1137
+ x = x.transpose(1, 2)
1138
+ x = x * gamma + beta
1139
+ x = self.model(x)
1140
+ return x
1141
+
1142
+ def remove_weight_norm(self):
1143
+ """Remove weight normalization module from all of the layers."""
1144
+
1145
+ def _remove_weight_norm(m):
1146
+ try:
1147
+ torch.nn.utils.remove_weight_norm(m)
1148
+ except ValueError: # this module didn't have weight norm
1149
+ return
1150
+
1151
+ self.apply(_remove_weight_norm)
1152
+
1153
+ def apply_weight_norm(self):
1154
+ """Apply weight normalization module from all of the layers."""
1155
+
1156
+ def _apply_weight_norm(m):
1157
+ if isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d):
1158
+ torch.nn.utils.weight_norm(m)
1159
+
1160
+ self.apply(_apply_weight_norm)
1161
+
1162
+ def reset_parameters(self):
1163
+ self.apply(init_weights)
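
To make the temporal bookkeeping concrete: with the default `up_ratios=(2, 4, 5, 5)` the encoder hop length is `prod(up_ratios) = 200`, so one second of 16 kHz audio becomes 80 latent frames. A minimal shape sketch, assuming the classes above are importable:

```python
import torch

enc = FACodecEncoder()          # defaults: ngf=32, up_ratios=(2, 4, 5, 5), out_channels=1024
wav = torch.randn(1, 1, 16000)  # [B, 1, T], 1 s at 16 kHz

z = enc(wav)                    # downsampled by a factor of 200
print(z.shape)                  # torch.Size([1, 1024, 80])
```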
Amphion/models/codec/ns3_codec/gradient_reversal.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from torch.autograd import Function
7
+ import torch
8
+ from torch import nn
9
+
10
+
11
+ class GradientReversal(Function):
12
+ @staticmethod
13
+ def forward(ctx, x, alpha):
14
+ ctx.save_for_backward(x, alpha)
15
+ return x
16
+
17
+ @staticmethod
18
+ def backward(ctx, grad_output):
19
+ grad_input = None
20
+ _, alpha = ctx.saved_tensors
21
+ if ctx.needs_input_grad[0]:
22
+ grad_input = -alpha * grad_output
23
+ return grad_input, None
24
+
25
+
26
+ revgrad = GradientReversal.apply
27
+
28
+
29
+ class GradientReversal(nn.Module):
30
+ def __init__(self, alpha):
31
+ super().__init__()
32
+ self.alpha = torch.tensor(alpha, requires_grad=False)
33
+
34
+ def forward(self, x):
35
+ return revgrad(x, self.alpha)
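
The module is an identity in the forward pass and multiplies gradients by `-alpha` in the backward pass, which is what lets the adversarial predictors in `facodec.py` push information out of the representations they are attached to. A small sanity check using the class defined above:

```python
import torch

grl = GradientReversal(alpha=0.5)
x = torch.ones(3, requires_grad=True)

grl(x).sum().backward()
print(x.grad)  # tensor([-0.5000, -0.5000, -0.5000]): output unchanged, gradient negated and scaled
```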
Amphion/models/codec/ns3_codec/melspec.py ADDED
@@ -0,0 +1,102 @@
1
+ import torch
2
+ import pyworld as pw
3
+ import numpy as np
4
+ import soundfile as sf
5
+ import os
6
+ from torchaudio.functional import pitch_shift
7
+ import librosa
8
+ from librosa.filters import mel as librosa_mel_fn
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+
13
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
14
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
15
+
16
+
17
+ def dynamic_range_decompression(x, C=1):
18
+ return np.exp(x) / C
19
+
20
+
21
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
22
+ return torch.log(torch.clamp(x, min=clip_val) * C)
23
+
24
+
25
+ def dynamic_range_decompression_torch(x, C=1):
26
+ return torch.exp(x) / C
27
+
28
+
29
+ def spectral_normalize_torch(magnitudes):
30
+ output = dynamic_range_compression_torch(magnitudes)
31
+ return output
32
+
33
+
34
+ def spectral_de_normalize_torch(magnitudes):
35
+ output = dynamic_range_decompression_torch(magnitudes)
36
+ return output
37
+
38
+
39
+ class MelSpectrogram(nn.Module):
40
+ def __init__(
41
+ self,
42
+ n_fft,
43
+ num_mels,
44
+ sampling_rate,
45
+ hop_size,
46
+ win_size,
47
+ fmin,
48
+ fmax,
49
+ center=False,
50
+ ):
51
+ super(MelSpectrogram, self).__init__()
52
+ self.n_fft = n_fft
53
+ self.hop_size = hop_size
54
+ self.win_size = win_size
55
+ self.sampling_rate = sampling_rate
56
+ self.num_mels = num_mels
57
+ self.fmin = fmin
58
+ self.fmax = fmax
59
+ self.center = center
60
+
61
+ mel_basis = {}
62
+ hann_window = {}
63
+
64
+ mel = librosa_mel_fn(
65
+ sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
66
+ )
67
+ mel_basis = torch.from_numpy(mel).float()
68
+ hann_window = torch.hann_window(win_size)
69
+
70
+ self.register_buffer("mel_basis", mel_basis)
71
+ self.register_buffer("hann_window", hann_window)
72
+
73
+ def forward(self, y):
74
+ y = torch.nn.functional.pad(
75
+ y.unsqueeze(1),
76
+ (
77
+ int((self.n_fft - self.hop_size) / 2),
78
+ int((self.n_fft - self.hop_size) / 2),
79
+ ),
80
+ mode="reflect",
81
+ )
82
+ y = y.squeeze(1)
83
+ spec = torch.stft(
84
+ y,
85
+ self.n_fft,
86
+ hop_length=self.hop_size,
87
+ win_length=self.win_size,
88
+ window=self.hann_window,
89
+ center=self.center,
90
+ pad_mode="reflect",
91
+ normalized=False,
92
+ onesided=True,
93
+ return_complex=True,
94
+ )
95
+ spec = torch.view_as_real(spec)
96
+
97
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
98
+
99
+ spec = torch.matmul(self.mel_basis, spec)
100
+ spec = spectral_normalize_torch(spec)
101
+
102
+ return spec
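
A minimal usage sketch of the module with the same configuration `FACodecEncoderV2` uses for its prosody features; with `hop_size=200`, `center=False`, and the manual reflect padding above, one second of 16 kHz audio yields 80 frames:

```python
import torch

mel_fn = MelSpectrogram(
    n_fft=1024, num_mels=80, sampling_rate=16000,
    hop_size=200, win_size=800, fmin=0, fmax=8000,
)

wav = torch.randn(2, 16000)  # [B, T], 1 s at 16 kHz
mel = mel_fn(wav)            # log-compressed mel spectrogram
print(mel.shape)             # torch.Size([2, 80, 80]) -> 80 mel bins x 80 frames
```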
Amphion/models/codec/ns3_codec/quantize/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (246 Bytes).
 
Amphion/models/codec/ns3_codec/quantize/fvq.py ADDED
@@ -0,0 +1,116 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from typing import Union
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from einops import rearrange
13
+ from torch.nn.utils import weight_norm
14
+
15
+
16
+ class FactorizedVectorQuantize(nn.Module):
17
+ def __init__(self, dim, codebook_size, codebook_dim, commitment, **kwargs):
18
+ super().__init__()
19
+ self.codebook_size = codebook_size
20
+ self.codebook_dim = codebook_dim
21
+ self.commitment = commitment
22
+
23
+ if dim != self.codebook_dim:
24
+ self.in_proj = weight_norm(nn.Linear(dim, self.codebook_dim))
25
+ self.out_proj = weight_norm(nn.Linear(self.codebook_dim, dim))
26
+ else:
27
+ self.in_proj = nn.Identity()
28
+ self.out_proj = nn.Identity()
29
+ self._codebook = nn.Embedding(codebook_size, self.codebook_dim)
30
+
31
+ @property
32
+ def codebook(self):
33
+ return self._codebook
34
+
35
+ def forward(self, z):
36
+ """Quantized the input tensor using a fixed codebook and returns
37
+ the corresponding codebook vectors
38
+
39
+ Parameters
40
+ ----------
41
+ z : Tensor[B x D x T]
42
+
43
+ Returns
44
+ -------
45
+ Tensor[B x D x T]
46
+ Quantized continuous representation of input
47
+ Tensor[1]
48
+ Commitment loss to train encoder to predict vectors closer to codebook
49
+ entries
50
+ Tensor[1]
51
+ Codebook loss to update the codebook
52
+ Tensor[B x T]
53
+ Codebook indices (quantized discrete representation of input)
54
+ Tensor[B x D x T]
55
+ Projected latents (continuous representation of input before quantization)
56
+ """
57
+ # transpose since we use linear
58
+
59
+ z = rearrange(z, "b d t -> b t d")
60
+
61
+ # Factorized codes project input into low-dimensional space
62
+ z_e = self.in_proj(z) # z_e : (B x T x D)
63
+ z_e = rearrange(z_e, "b t d -> b d t")
64
+ z_q, indices = self.decode_latents(z_e)
65
+
66
+ if self.training:
67
+ commitment_loss = (
68
+ F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
69
+ * self.commitment
70
+ )
71
+ codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])
72
+ commit_loss = commitment_loss + codebook_loss
73
+ else:
74
+ commit_loss = torch.zeros(z.shape[0], device=z.device)
75
+
76
+ z_q = (
77
+ z_e + (z_q - z_e).detach()
78
+ ) # noop in forward pass, straight-through gradient estimator in backward pass
79
+
80
+ z_q = rearrange(z_q, "b d t -> b t d")
81
+ z_q = self.out_proj(z_q)
82
+ z_q = rearrange(z_q, "b t d -> b d t")
83
+
84
+ return z_q, indices, commit_loss
85
+
86
+ def vq2emb(self, vq, proj=True):
87
+ emb = self.embed_code(vq)
88
+ if proj:
89
+ emb = self.out_proj(emb)
90
+ return emb.transpose(1, 2)
91
+
92
+ def get_emb(self):
93
+ return self.codebook.weight
94
+
95
+ def embed_code(self, embed_id):
96
+ return F.embedding(embed_id, self.codebook.weight)
97
+
98
+ def decode_code(self, embed_id):
99
+ return self.embed_code(embed_id).transpose(1, 2)
100
+
101
+ def decode_latents(self, latents):
102
+ encodings = rearrange(latents, "b d t -> (b t) d")
103
+ codebook = self.codebook.weight # codebook: (N x D)
104
+ # L2 normalize encodings and codebook
105
+ encodings = F.normalize(encodings)
106
+ codebook = F.normalize(codebook)
107
+
108
+ # Compute euclidean distance with codebook
109
+ dist = (
110
+ encodings.pow(2).sum(1, keepdim=True)
111
+ - 2 * encodings @ codebook.t()
112
+ + codebook.pow(2).sum(1, keepdim=True).t()
113
+ )
114
+ indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
115
+ z_q = self.decode_code(indices)
116
+ return z_q, indices
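
A minimal shape sketch of the factorized quantizer defined above (the sizes below are illustrative, not taken from a config in this commit):

```python
import torch

vq = FactorizedVectorQuantize(dim=256, codebook_size=1024, codebook_dim=8, commitment=0.005)

z = torch.randn(4, 256, 50)  # [B, D, T] encoder latents
z_q, indices, commit_loss = vq(z)

print(z_q.shape)          # torch.Size([4, 256, 50]) - straight-through quantized latents
print(indices.shape)      # torch.Size([4, 50])      - one codebook index per frame
print(commit_loss.shape)  # torch.Size([4])          - per-item commitment/codebook loss (zeros in eval mode)
```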
Amphion/models/svc/transformer/transformer_inference.py ADDED
@@ -0,0 +1,45 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import time
8
+ import numpy as np
9
+ import torch
10
+ from tqdm import tqdm
11
+ import torch.nn as nn
12
+ from collections import OrderedDict
13
+
14
+ from models.svc.base import SVCInference
15
+ from modules.encoder.condition_encoder import ConditionEncoder
16
+ from models.svc.transformer.transformer import Transformer
17
+ from models.svc.transformer.conformer import Conformer
18
+
19
+
20
+ class TransformerInference(SVCInference):
21
+ def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
22
+ SVCInference.__init__(self, args, cfg, infer_type)
23
+
24
+ def _build_model(self):
25
+ self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min
26
+ self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max
27
+ self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)
28
+ if self.cfg.model.transformer.type == "transformer":
29
+ self.acoustic_mapper = Transformer(self.cfg.model.transformer)
30
+ elif self.cfg.model.transformer.type == "conformer":
31
+ self.acoustic_mapper = Conformer(self.cfg.model.transformer)
32
+ else:
33
+ raise NotImplementedError
34
+ model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper])
35
+ return model
36
+
37
+ def _inference_each_batch(self, batch_data):
38
+ device = self.accelerator.device
39
+ for k, v in batch_data.items():
40
+ batch_data[k] = v.to(device)
41
+
42
+ condition = self.condition_encoder(batch_data)
43
+ y_pred = self.acoustic_mapper(condition, batch_data["mask"])
44
+
45
+ return y_pred
Amphion/models/svc/vits/vits_trainer.py ADDED
@@ -0,0 +1,704 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ from torch.optim.lr_scheduler import ExponentialLR
8
+ from tqdm import tqdm
9
+ from pathlib import Path
10
+ import shutil
11
+ import accelerate
12
+
13
+ # from models.svc.base import SVCTrainer
14
+ from models.svc.base.svc_dataset import SVCOfflineCollator, SVCOfflineDataset
15
+ from models.svc.vits.vits import *
16
+ from models.svc.base import SVCTrainer
17
+
18
+ from utils.mel import mel_spectrogram_torch
19
+ import json
20
+
21
+ from models.vocoders.gan.discriminator.mpd import (
22
+ MultiPeriodDiscriminator_vits as MultiPeriodDiscriminator,
23
+ )
24
+
25
+
26
+ class VitsSVCTrainer(SVCTrainer):
27
+ def __init__(self, args, cfg):
28
+ self.args = args
29
+ self.cfg = cfg
30
+ SVCTrainer.__init__(self, args, cfg)
31
+
32
+ def _accelerator_prepare(self):
33
+ (
34
+ self.train_dataloader,
35
+ self.valid_dataloader,
36
+ ) = self.accelerator.prepare(
37
+ self.train_dataloader,
38
+ self.valid_dataloader,
39
+ )
40
+ if isinstance(self.model, dict):
41
+ for key in self.model.keys():
42
+ self.model[key] = self.accelerator.prepare(self.model[key])
43
+ else:
44
+ self.model = self.accelerator.prepare(self.model)
45
+
46
+ if isinstance(self.optimizer, dict):
47
+ for key in self.optimizer.keys():
48
+ self.optimizer[key] = self.accelerator.prepare(self.optimizer[key])
49
+ else:
50
+ self.optimizer = self.accelerator.prepare(self.optimizer)
51
+
52
+ if isinstance(self.scheduler, dict):
53
+ for key in self.scheduler.keys():
54
+ self.scheduler[key] = self.accelerator.prepare(self.scheduler[key])
55
+ else:
56
+ self.scheduler = self.accelerator.prepare(self.scheduler)
57
+
58
+ def _load_model(
59
+ self,
60
+ checkpoint_dir: str = None,
61
+ checkpoint_path: str = None,
62
+ resume_type: str = "",
63
+ ):
64
+ r"""Load model from checkpoint. If checkpoint_path is None, it will
65
+ load the latest checkpoint in checkpoint_dir. If checkpoint_path is not
66
+ None, it will load the checkpoint specified by checkpoint_path. **Only use this
67
+ method after** ``accelerator.prepare()``.
68
+ """
69
+ if checkpoint_path is None:
70
+ ls = [str(i) for i in Path(checkpoint_dir).glob("*")]
71
+ ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
72
+ checkpoint_path = ls[0]
73
+ self.logger.info("Resume from {}...".format(checkpoint_path))
74
+
75
+ if resume_type in ["resume", ""]:
76
+ # Load all the things, including model weights, optimizer, scheduler, and random states.
77
+ self.accelerator.load_state(input_dir=checkpoint_path)
78
+
79
+ # set epoch and step
80
+ self.epoch = int(checkpoint_path.split("_")[-3].split("-")[-1]) + 1
81
+ self.step = int(checkpoint_path.split("_")[-2].split("-")[-1]) + 1
82
+
83
+ elif resume_type == "finetune":
84
+ # Load only the model weights
85
+ accelerate.load_checkpoint_and_dispatch(
86
+ self.accelerator.unwrap_model(self.model["generator"]),
87
+ os.path.join(checkpoint_path, "pytorch_model.bin"),
88
+ )
89
+ accelerate.load_checkpoint_and_dispatch(
90
+ self.accelerator.unwrap_model(self.model["discriminator"]),
91
+ os.path.join(checkpoint_path, "pytorch_model.bin"),
92
+ )
93
+ self.logger.info("Load model weights for finetune...")
94
+
95
+ else:
96
+ raise ValueError("Resume_type must be `resume` or `finetune`.")
97
+
98
+ return checkpoint_path
99
+
100
+ def _build_model(self):
101
+ net_g = SynthesizerTrn(
102
+ self.cfg.preprocess.n_fft // 2 + 1,
103
+ self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size,
104
+ # directly use cfg
105
+ self.cfg,
106
+ )
107
+ net_d = MultiPeriodDiscriminator(self.cfg.model.vits.use_spectral_norm)
108
+ model = {"generator": net_g, "discriminator": net_d}
109
+
110
+ return model
111
+
112
+ def _build_dataset(self):
113
+ return SVCOfflineDataset, SVCOfflineCollator
114
+
115
+ def _build_optimizer(self):
116
+ optimizer_g = torch.optim.AdamW(
117
+ self.model["generator"].parameters(),
118
+ self.cfg.train.learning_rate,
119
+ betas=self.cfg.train.AdamW.betas,
120
+ eps=self.cfg.train.AdamW.eps,
121
+ )
122
+ optimizer_d = torch.optim.AdamW(
123
+ self.model["discriminator"].parameters(),
124
+ self.cfg.train.learning_rate,
125
+ betas=self.cfg.train.AdamW.betas,
126
+ eps=self.cfg.train.AdamW.eps,
127
+ )
128
+ optimizer = {"optimizer_g": optimizer_g, "optimizer_d": optimizer_d}
129
+
130
+ return optimizer
131
+
132
+ def _build_scheduler(self):
133
+ scheduler_g = ExponentialLR(
134
+ self.optimizer["optimizer_g"],
135
+ gamma=self.cfg.train.lr_decay,
136
+ last_epoch=self.epoch - 1,
137
+ )
138
+ scheduler_d = ExponentialLR(
139
+ self.optimizer["optimizer_d"],
140
+ gamma=self.cfg.train.lr_decay,
141
+ last_epoch=self.epoch - 1,
142
+ )
143
+
144
+ scheduler = {"scheduler_g": scheduler_g, "scheduler_d": scheduler_d}
145
+ return scheduler
146
+
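The two ``ExponentialLR`` schedulers above multiply each optimizer's learning rate by ``lr_decay`` once per epoch. A small self-contained sketch of that behaviour (the learning rate and decay values below are placeholders, not the project defaults):

import torch
from torch.optim.lr_scheduler import ExponentialLR

param = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.AdamW([param], lr=2e-4)
sched = ExponentialLR(opt, gamma=0.999)

for epoch in range(3):
    # ... one training epoch would run here ...
    sched.step()
    print(epoch, opt.param_groups[0]["lr"])  # lr shrinks by a factor of 0.999 per epoch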
147
+ def _build_criterion(self):
148
+ class GeneratorLoss(nn.Module):
149
+ def __init__(self, cfg):
150
+ super(GeneratorLoss, self).__init__()
151
+ self.cfg = cfg
152
+ self.l1_loss = nn.L1Loss()
153
+
154
+ def generator_loss(self, disc_outputs):
155
+ loss = 0
156
+ gen_losses = []
157
+ for dg in disc_outputs:
158
+ dg = dg.float()
159
+ l = torch.mean((1 - dg) ** 2)
160
+ gen_losses.append(l)
161
+ loss += l
162
+
163
+ return loss, gen_losses
164
+
165
+ def feature_loss(self, fmap_r, fmap_g):
166
+ loss = 0
167
+ for dr, dg in zip(fmap_r, fmap_g):
168
+ for rl, gl in zip(dr, dg):
169
+ rl = rl.float().detach()
170
+ gl = gl.float()
171
+ loss += torch.mean(torch.abs(rl - gl))
172
+
173
+ return loss * 2
174
+
175
+ def kl_loss(self, z_p, logs_q, m_p, logs_p, z_mask):
176
+ """
177
+ z_p, logs_q: [b, h, t_t]
178
+ m_p, logs_p: [b, h, t_t]
179
+ """
180
+ z_p = z_p.float()
181
+ logs_q = logs_q.float()
182
+ m_p = m_p.float()
183
+ logs_p = logs_p.float()
184
+ z_mask = z_mask.float()
185
+
186
+ kl = logs_p - logs_q - 0.5
187
+ kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
188
+ kl = torch.sum(kl * z_mask)
189
+ l = kl / torch.sum(z_mask)
190
+ return l
191
+
192
+ def forward(
193
+ self,
194
+ outputs_g,
195
+ outputs_d,
196
+ y_mel,
197
+ y_hat_mel,
198
+ ):
199
+ loss_g = {}
200
+
201
+ # mel loss
202
+ loss_mel = self.l1_loss(y_mel, y_hat_mel) * self.cfg.train.c_mel
203
+ loss_g["loss_mel"] = loss_mel
204
+
205
+ # kl loss
206
+ loss_kl = (
207
+ self.kl_loss(
208
+ outputs_g["z_p"],
209
+ outputs_g["logs_q"],
210
+ outputs_g["m_p"],
211
+ outputs_g["logs_p"],
212
+ outputs_g["z_mask"],
213
+ )
214
+ * self.cfg.train.c_kl
215
+ )
216
+ loss_g["loss_kl"] = loss_kl
217
+
218
+ # feature loss
219
+ loss_fm = self.feature_loss(outputs_d["fmap_rs"], outputs_d["fmap_gs"])
220
+ loss_g["loss_fm"] = loss_fm
221
+
222
+ # gan loss
223
+ loss_gen, losses_gen = self.generator_loss(outputs_d["y_d_hat_g"])
224
+ loss_g["loss_gen"] = loss_gen
225
+ loss_g["loss_gen_all"] = loss_mel + loss_kl + loss_fm + loss_gen
226
+
227
+ return loss_g
228
+
229
+ class DiscriminatorLoss(nn.Module):
230
+ def __init__(self, cfg):
231
+ super(DiscriminatorLoss, self).__init__()
232
+ self.cfg = cfg
233
+ self.l1Loss = torch.nn.L1Loss(reduction="mean")
234
+
235
+ def __call__(self, disc_real_outputs, disc_generated_outputs):
236
+ loss_d = {}
237
+
238
+ loss = 0
239
+ r_losses = []
240
+ g_losses = []
241
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
242
+ dr = dr.float()
243
+ dg = dg.float()
244
+ r_loss = torch.mean((1 - dr) ** 2)
245
+ g_loss = torch.mean(dg**2)
246
+ loss += r_loss + g_loss
247
+ r_losses.append(r_loss.item())
248
+ g_losses.append(g_loss.item())
249
+
250
+ loss_d["loss_disc_all"] = loss
251
+
252
+ return loss_d
253
+
254
+ criterion = {
255
+ "generator": GeneratorLoss(self.cfg),
256
+ "discriminator": DiscriminatorLoss(self.cfg),
257
+ }
258
+ return criterion
259
+
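A minimal sanity check of the least-squares GAN objectives defined in ``_build_criterion``, using dummy discriminator outputs (shapes and values are illustrative only):

import torch

# Dummy discriminator scores for two sub-discriminators.
real_outputs = [torch.full((2, 10), 0.9), torch.full((2, 10), 1.1)]
fake_outputs = [torch.full((2, 10), 0.1), torch.full((2, 10), -0.2)]

# Discriminator objective: push real scores towards 1 and fake scores towards 0.
d_loss = sum(torch.mean((1 - dr) ** 2) + torch.mean(dg ** 2)
             for dr, dg in zip(real_outputs, fake_outputs))

# Generator objective: push fake scores towards 1.
g_loss = sum(torch.mean((1 - dg) ** 2) for dg in fake_outputs)
print(d_loss.item(), g_loss.item())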
260
+ # Keep legacy unchanged
261
+ def write_summary(
262
+ self,
263
+ losses,
264
+ stats,
265
+ images={},
266
+ audios={},
267
+ audio_sampling_rate=24000,
268
+ tag="train",
269
+ ):
270
+ for key, value in losses.items():
271
+ self.sw.add_scalar(tag + "/" + key, value, self.step)
272
+ self.sw.add_scalar(
273
+ "learning_rate",
274
+ self.optimizer["optimizer_g"].param_groups[0]["lr"],
275
+ self.step,
276
+ )
277
+
278
+ if len(images) != 0:
279
+ for key, value in images.items():
280
+ self.sw.add_image(key, value, self.global_step, dataformats="HWC")
281
+ if len(audios) != 0:
282
+ for key, value in audios.items():
283
+ self.sw.add_audio(key, value, self.global_step, audio_sampling_rate)
284
+
285
+ def write_valid_summary(
286
+ self, losses, stats, images={}, audios={}, audio_sampling_rate=24000, tag="val"
287
+ ):
288
+ for key, value in losses.items():
289
+ self.sw.add_scalar(tag + "/" + key, value, self.step)
290
+
291
+ if len(images) != 0:
292
+ for key, value in images.items():
293
+ self.sw.add_image(key, value, self.global_step, dataformats="HWC")
294
+ if len(audios) != 0:
295
+ for key, value in audios.items():
296
+ self.sw.add_audio(key, value, self.global_step, audio_sampling_rate)
297
+
298
+ def _get_state_dict(self):
299
+ state_dict = {
300
+ "generator": self.model["generator"].state_dict(),
301
+ "discriminator": self.model["discriminator"].state_dict(),
302
+ "optimizer_g": self.optimizer["optimizer_g"].state_dict(),
303
+ "optimizer_d": self.optimizer["optimizer_d"].state_dict(),
304
+ "scheduler_g": self.scheduler["scheduler_g"].state_dict(),
305
+ "scheduler_d": self.scheduler["scheduler_d"].state_dict(),
306
+ "step": self.step,
307
+ "epoch": self.epoch,
308
+ "batch_size": self.cfg.train.batch_size,
309
+ }
310
+ return state_dict
311
+
312
+ def get_state_dict(self):
313
+ state_dict = {
314
+ "generator": self.model["generator"].state_dict(),
315
+ "discriminator": self.model["discriminator"].state_dict(),
316
+ "optimizer_g": self.optimizer["optimizer_g"].state_dict(),
317
+ "optimizer_d": self.optimizer["optimizer_d"].state_dict(),
318
+ "scheduler_g": self.scheduler["scheduler_g"].state_dict(),
319
+ "scheduler_d": self.scheduler["scheduler_d"].state_dict(),
320
+ "step": self.step,
321
+ "epoch": self.epoch,
322
+ "batch_size": self.cfg.train.batch_size,
323
+ }
324
+ return state_dict
325
+
326
+ def load_model(self, checkpoint):
327
+ self.step = checkpoint["step"]
328
+ self.epoch = checkpoint["epoch"]
329
+ self.model["generator"].load_state_dict(checkpoint["generator"])
330
+ self.model["discriminator"].load_state_dict(checkpoint["discriminator"])
331
+ self.optimizer["optimizer_g"].load_state_dict(checkpoint["optimizer_g"])
332
+ self.optimizer["optimizer_d"].load_state_dict(checkpoint["optimizer_d"])
333
+ self.scheduler["scheduler_g"].load_state_dict(checkpoint["scheduler_g"])
334
+ self.scheduler["scheduler_d"].load_state_dict(checkpoint["scheduler_d"])
335
+
336
+ @torch.inference_mode()
337
+ def _valid_step(self, batch):
338
+ r"""Testing forward step. Should return average loss of a sample over
339
+ one batch. Provoke ``_forward_step`` is recommended except for special case.
340
+ See ``_test_epoch`` for usage.
341
+ """
342
+
343
+ valid_losses = {}
344
+ total_loss = 0
345
+ valid_stats = {}
346
+
347
+ # Discriminator
348
+ # Generator output
349
+ outputs_g = self.model["generator"](batch)
350
+
351
+ y_mel = slice_segments(
352
+ batch["mel"].transpose(1, 2),
353
+ outputs_g["ids_slice"],
354
+ self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size,
355
+ )
356
+ y_hat_mel = mel_spectrogram_torch(
357
+ outputs_g["y_hat"].squeeze(1), self.cfg.preprocess
358
+ )
359
+ y = slice_segments(
360
+ batch["audio"].unsqueeze(1),
361
+ outputs_g["ids_slice"] * self.cfg.preprocess.hop_size,
362
+ self.cfg.preprocess.segment_size,
363
+ )
364
+
365
+ # Discriminator output
366
+ outputs_d = self.model["discriminator"](y, outputs_g["y_hat"].detach())
367
+ ## Discriminator loss
368
+ loss_d = self.criterion["discriminator"](
369
+ outputs_d["y_d_hat_r"], outputs_d["y_d_hat_g"]
370
+ )
371
+ valid_losses.update(loss_d)
372
+
373
+ ## Generator
374
+ outputs_d = self.model["discriminator"](y, outputs_g["y_hat"])
375
+ loss_g = self.criterion["generator"](outputs_g, outputs_d, y_mel, y_hat_mel)
376
+ valid_losses.update(loss_g)
377
+
378
+ for item in valid_losses:
379
+ valid_losses[item] = valid_losses[item].item()
380
+
381
+ total_loss = loss_g["loss_gen_all"] + loss_d["loss_disc_all"]
382
+
383
+ return (
384
+ total_loss.item(),
385
+ valid_losses,
386
+ valid_stats,
387
+ )
388
+
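``slice_segments`` and ``mel_spectrogram_torch`` are imported from elsewhere in the repository. A rough sketch of the assumed ``slice_segments`` semantics (cutting one fixed-length window per batch item, starting at the frame given by ``ids_slice``):

import torch

def slice_segments_sketch(x, ids_str, segment_size):
    # x: (B, C, T); ids_str: (B,) start index per item. Assumed semantics only.
    out = torch.zeros(x.size(0), x.size(1), segment_size, dtype=x.dtype)
    for b in range(x.size(0)):
        start = int(ids_str[b])
        out[b] = x[b, :, start:start + segment_size]
    return out

x = torch.arange(2 * 1 * 20, dtype=torch.float32).view(2, 1, 20)
print(slice_segments_sketch(x, torch.tensor([3, 7]), segment_size=5))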
389
+ @torch.inference_mode()
390
+ def _valid_epoch(self):
391
+ r"""Testing epoch. Should return average loss of a batch (sample) over
392
+ one epoch. See ``train_loop`` for usage.
393
+ """
394
+ if isinstance(self.model, dict):
395
+ for key in self.model.keys():
396
+ self.model[key].eval()
397
+ else:
398
+ self.model.eval()
399
+
400
+ epoch_sum_loss = 0.0
401
+ epoch_losses = dict()
402
+ for batch in tqdm(
403
+ self.valid_dataloader,
404
+ desc=f"Validating Epoch {self.epoch}",
405
+ unit="batch",
406
+ colour="GREEN",
407
+ leave=False,
408
+ dynamic_ncols=True,
409
+ smoothing=0.04,
410
+ disable=not self.accelerator.is_main_process,
411
+ ):
412
+ total_loss, valid_losses, valid_stats = self._valid_step(batch)
413
+ epoch_sum_loss += total_loss
414
+ if isinstance(valid_losses, dict):
415
+ for key, value in valid_losses.items():
416
+ if key not in epoch_losses.keys():
417
+ epoch_losses[key] = value
418
+ else:
419
+ epoch_losses[key] += value
420
+
421
+ epoch_sum_loss = epoch_sum_loss / len(self.valid_dataloader)
422
+ for key in epoch_losses.keys():
423
+ epoch_losses[key] = epoch_losses[key] / len(self.valid_dataloader)
424
+
425
+ self.accelerator.wait_for_everyone()
426
+
427
+ return epoch_sum_loss, epoch_losses
428
+
429
+ ### THIS IS MAIN ENTRY ###
430
+ def train_loop(self):
431
+ r"""Training loop. The public entry of training process."""
432
+ # Wait everyone to prepare before we move on
433
+ self.accelerator.wait_for_everyone()
434
+ # dump config file
435
+ if self.accelerator.is_main_process:
436
+ self.__dump_cfg(self.config_save_path)
437
+
438
+ # self.optimizer.zero_grad()
439
+ # Wait to ensure good to go
440
+
441
+ self.accelerator.wait_for_everyone()
442
+ while self.epoch < self.max_epoch:
443
+ self.logger.info("\n")
444
+ self.logger.info("-" * 32)
445
+ self.logger.info("Epoch {}: ".format(self.epoch))
446
+
447
+ # Do training & validating epoch
448
+ train_total_loss, train_losses = self._train_epoch()
449
+ if isinstance(train_losses, dict):
450
+ for key, loss in train_losses.items():
451
+ self.logger.info(" |- Train/{} Loss: {:.6f}".format(key, loss))
452
+ self.accelerator.log(
453
+ {"Epoch/Train {} Loss".format(key): loss},
454
+ step=self.epoch,
455
+ )
456
+
457
+ valid_total_loss, valid_losses = self._valid_epoch()
458
+ if isinstance(valid_losses, dict):
459
+ for key, loss in valid_losses.items():
460
+ self.logger.info(" |- Valid/{} Loss: {:.6f}".format(key, loss))
461
+ self.accelerator.log(
462
+ {"Epoch/Train {} Loss".format(key): loss},
463
+ step=self.epoch,
464
+ )
465
+
466
+ self.logger.info(" |- Train/Loss: {:.6f}".format(train_total_loss))
467
+ self.logger.info(" |- Valid/Loss: {:.6f}".format(valid_total_loss))
468
+ self.accelerator.log(
469
+ {
470
+ "Epoch/Train Loss": train_total_loss,
471
+ "Epoch/Valid Loss": valid_total_loss,
472
+ },
473
+ step=self.epoch,
474
+ )
475
+
476
+ self.accelerator.wait_for_everyone()
477
+
478
+ # Check if hit save_checkpoint_stride and run_eval
479
+ run_eval = False
480
+ if self.accelerator.is_main_process:
481
+ save_checkpoint = False
482
+ hit_idx = []
483
+ for i, num in enumerate(self.save_checkpoint_stride):
484
+ if self.epoch % num == 0:
485
+ save_checkpoint = True
486
+ hit_idx.append(i)
487
+ run_eval |= self.run_eval[i]
488
+
489
+ self.accelerator.wait_for_everyone()
490
+ if self.accelerator.is_main_process and save_checkpoint:
491
+ path = os.path.join(
492
+ self.checkpoint_dir,
493
+ "epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
494
+ self.epoch, self.step, train_total_loss
495
+ ),
496
+ )
497
+ self.tmp_checkpoint_save_path = path
498
+ self.accelerator.save_state(path)
499
+
500
+ json.dump(
501
+ self.checkpoints_path,
502
+ open(os.path.join(path, "ckpts.json"), "w"),
503
+ ensure_ascii=False,
504
+ indent=4,
505
+ )
506
+ self._save_auxiliary_states()
507
+
508
+ # Remove old checkpoints
509
+ to_remove = []
510
+ for idx in hit_idx:
511
+ self.checkpoints_path[idx].append(path)
512
+ while len(self.checkpoints_path[idx]) > self.keep_last[idx]:
513
+ to_remove.append((idx, self.checkpoints_path[idx].pop(0)))
514
+
515
+ # Search conflicts
516
+ total = set()
517
+ for i in self.checkpoints_path:
518
+ total |= set(i)
519
+ do_remove = set()
520
+ for idx, path in to_remove[::-1]:
521
+ if path in total:
522
+ self.checkpoints_path[idx].insert(0, path)
523
+ else:
524
+ do_remove.add(path)
525
+
526
+ # Remove old checkpoints
527
+ for path in do_remove:
528
+ shutil.rmtree(path, ignore_errors=True)
529
+ self.logger.debug(f"Remove old checkpoint: {path}")
530
+
531
+ self.accelerator.wait_for_everyone()
532
+ if run_eval:
533
+ # TODO: run evaluation
534
+ pass
535
+
536
+ # Update info for each epoch
537
+ self.epoch += 1
538
+
539
+ # Finish training and save final checkpoint
540
+ self.accelerator.wait_for_everyone()
541
+ if self.accelerator.is_main_process:
542
+ path = os.path.join(
543
+ self.checkpoint_dir,
544
+ "final_epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
545
+ self.epoch, self.step, valid_total_loss
546
+ ),
547
+ )
548
+ self.tmp_checkpoint_save_path = path
549
+ self.accelerator.save_state(path)
557
+
558
+ json.dump(
559
+ self.checkpoints_path,
560
+ open(os.path.join(path, "ckpts.json"), "w"),
561
+ ensure_ascii=False,
562
+ indent=4,
563
+ )
564
+ self._save_auxiliary_states()
565
+
566
+ self.accelerator.end_training()
567
+
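The checkpoint bookkeeping in ``train_loop`` keeps only the newest ``keep_last[i]`` checkpoints per ``save_checkpoint_stride`` entry and never deletes a path that another stride still references. A small standalone sketch of that rotation policy with plain strings (the paths are hypothetical):

# Two strides track their own checkpoint lists; "ckpt_old" is shared.
checkpoints_path = [["ckpt_old", "ckpt_new"], ["ckpt_old"]]
keep_last = [1, 1]

to_remove = []
for idx, paths in enumerate(checkpoints_path):
    while len(paths) > keep_last[idx]:
        to_remove.append((idx, paths.pop(0)))

# Only remove paths no longer referenced by any stride.
still_used = set()
for paths in checkpoints_path:
    still_used |= set(paths)
do_remove = {p for _, p in to_remove if p not in still_used}
print(do_remove)  # set() -- 'ckpt_old' survives because stride 1 still references it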
568
+ def _train_step(self, batch):
569
+ r"""Forward step for training and inference. This function is called
570
+ in ``_train_step`` & ``_test_step`` function.
571
+ """
572
+
573
+ train_losses = {}
574
+ total_loss = 0
575
+ training_stats = {}
576
+
577
+ ## Train Discriminator
578
+ # Generator output
579
+ outputs_g = self.model["generator"](batch)
580
+
581
+ y_mel = slice_segments(
582
+ batch["mel"].transpose(1, 2),
583
+ outputs_g["ids_slice"],
584
+ self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size,
585
+ )
586
+ y_hat_mel = mel_spectrogram_torch(
587
+ outputs_g["y_hat"].squeeze(1), self.cfg.preprocess
588
+ )
589
+
590
+ y = slice_segments(
591
+ # [1, 168418] -> [1, 1, 168418]
592
+ batch["audio"].unsqueeze(1),
593
+ outputs_g["ids_slice"] * self.cfg.preprocess.hop_size,
594
+ self.cfg.preprocess.segment_size,
595
+ )
596
+
597
+ # Discriminator output
598
+ outputs_d = self.model["discriminator"](y, outputs_g["y_hat"].detach())
599
+ # Discriminator loss
600
+ loss_d = self.criterion["discriminator"](
601
+ outputs_d["y_d_hat_r"], outputs_d["y_d_hat_g"]
602
+ )
603
+ train_losses.update(loss_d)
604
+
605
+ # BP and Grad Updated
606
+ self.optimizer["optimizer_d"].zero_grad()
607
+ self.accelerator.backward(loss_d["loss_disc_all"])
608
+ self.optimizer["optimizer_d"].step()
609
+
610
+ ## Train Generator
611
+ outputs_d = self.model["discriminator"](y, outputs_g["y_hat"])
612
+ loss_g = self.criterion["generator"](outputs_g, outputs_d, y_mel, y_hat_mel)
613
+ train_losses.update(loss_g)
614
+
615
+ # BP and Grad Updated
616
+ self.optimizer["optimizer_g"].zero_grad()
617
+ self.accelerator.backward(loss_g["loss_gen_all"])
618
+ self.optimizer["optimizer_g"].step()
619
+
620
+ for item in train_losses:
621
+ train_losses[item] = train_losses[item].item()
622
+
623
+ total_loss = loss_g["loss_gen_all"] + loss_d["loss_disc_all"]
624
+
625
+ return (
626
+ total_loss.item(),
627
+ train_losses,
628
+ training_stats,
629
+ )
630
+
631
+ def _train_epoch(self):
632
+ r"""Training epoch. Should return average loss of a batch (sample) over
633
+ one epoch. See ``train_loop`` for usage.
634
+ """
635
+ epoch_sum_loss: float = 0.0
636
+ epoch_losses: dict = {}
637
+ epoch_step: int = 0
638
+ for batch in tqdm(
639
+ self.train_dataloader,
640
+ desc=f"Training Epoch {self.epoch}",
641
+ unit="batch",
642
+ colour="GREEN",
643
+ leave=False,
644
+ dynamic_ncols=True,
645
+ smoothing=0.04,
646
+ disable=not self.accelerator.is_main_process,
647
+ ):
648
+ # Do training step and BP
649
+ with self.accelerator.accumulate(self.model):
650
+ total_loss, train_losses, training_stats = self._train_step(batch)
651
+ self.batch_count += 1
652
+
653
+ # Update info for each step
654
+ if self.batch_count % self.cfg.train.gradient_accumulation_step == 0:
655
+ epoch_sum_loss += total_loss
656
+ for key, value in train_losses.items():
657
+ if key not in epoch_losses.keys():
658
+ epoch_losses[key] = value
659
+ else:
660
+ epoch_losses[key] += value
661
+
662
+ self.accelerator.log(
663
+ {
664
+ "Step/Generator Loss": train_losses["loss_gen_all"],
665
+ "Step/Discriminator Loss": train_losses["loss_disc_all"],
666
+ "Step/Generator Learning Rate": self.optimizer[
667
+ "optimizer_d"
668
+ ].param_groups[0]["lr"],
669
+ "Step/Discriminator Learning Rate": self.optimizer[
670
+ "optimizer_g"
671
+ ].param_groups[0]["lr"],
672
+ },
673
+ step=self.step,
674
+ )
675
+ self.step += 1
676
+ epoch_step += 1
677
+
678
+ self.accelerator.wait_for_everyone()
679
+
680
+ epoch_sum_loss = (
681
+ epoch_sum_loss
682
+ / len(self.train_dataloader)
683
+ * self.cfg.train.gradient_accumulation_step
684
+ )
685
+
686
+ for key in epoch_losses.keys():
687
+ epoch_losses[key] = (
688
+ epoch_losses[key]
689
+ / len(self.train_dataloader)
690
+ * self.cfg.train.gradient_accumulation_step
691
+ )
692
+
693
+ return epoch_sum_loss, epoch_losses
694
+
695
+ def __dump_cfg(self, path):
696
+ os.makedirs(os.path.dirname(path), exist_ok=True)
697
+ json5.dump(
698
+ self.cfg,
699
+ open(path, "w"),
700
+ indent=4,
701
+ sort_keys=True,
702
+ ensure_ascii=False,
703
+ quote_keys=True,
704
+ )
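``__dump_cfg`` above serialises the experiment config with ``json5``. A tiny round-trip sketch (the file name and keys are placeholders, not this project's actual config schema):

import json5

cfg = {"train": {"batch_size": 16, "learning_rate": 2e-4}}
with open("cfg_example.json", "w") as f:
    json5.dump(cfg, f, indent=4, quote_keys=True)

with open("cfg_example.json", "r") as f:
    print(json5.load(f)["train"]["batch_size"])  # 16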
Amphion/models/tta/autoencoder/autoencoder_dataset.py ADDED
@@ -0,0 +1,112 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import random
7
+ import torch
8
+ from torch.nn.utils.rnn import pad_sequence
9
+ from utils.data_utils import *
10
+ from models.base.base_dataset import (
11
+ BaseOfflineCollator,
12
+ BaseOfflineDataset,
13
+ BaseTestDataset,
14
+ BaseTestCollator,
15
+ )
16
+ import librosa
17
+
18
+
19
+ class AutoencoderKLDataset(BaseOfflineDataset):
20
+ def __init__(self, cfg, dataset, is_valid=False):
21
+ BaseOfflineDataset.__init__(self, cfg, dataset, is_valid=is_valid)
22
+
23
+ cfg = self.cfg
24
+
25
+ # utt2melspec
26
+ if cfg.preprocess.use_melspec:
27
+ self.utt2melspec_path = {}
28
+ for utt_info in self.metadata:
29
+ dataset = utt_info["Dataset"]
30
+ uid = utt_info["Uid"]
31
+ utt = "{}_{}".format(dataset, uid)
32
+
33
+ self.utt2melspec_path[utt] = os.path.join(
34
+ cfg.preprocess.processed_dir,
35
+ dataset,
36
+ cfg.preprocess.melspec_dir,
37
+ uid + ".npy",
38
+ )
39
+
40
+ # utt2wav
41
+ if cfg.preprocess.use_wav:
42
+ self.utt2wav_path = {}
43
+ for utt_info in self.metadata:
44
+ dataset = utt_info["Dataset"]
45
+ uid = utt_info["Uid"]
46
+ utt = "{}_{}".format(dataset, uid)
47
+
48
+ self.utt2wav_path[utt] = os.path.join(
49
+ cfg.preprocess.processed_dir,
50
+ dataset,
51
+ cfg.preprocess.wav_dir,
52
+ uid + ".wav",
53
+ )
54
+
55
+ def __getitem__(self, index):
56
+ # melspec: (n_mels, T)
57
+ # wav: (T,)
58
+
59
+ single_feature = BaseOfflineDataset.__getitem__(self, index)
60
+
61
+ utt_info = self.metadata[index]
62
+ dataset = utt_info["Dataset"]
63
+ uid = utt_info["Uid"]
64
+ utt = "{}_{}".format(dataset, uid)
65
+
66
+ if self.cfg.preprocess.use_melspec:
67
+ single_feature["melspec"] = np.load(self.utt2melspec_path[utt])
68
+
69
+ if self.cfg.preprocess.use_wav:
70
+ wav, sr = librosa.load(
71
+ self.utt2wav_path[utt], sr=16000
72
+ ) # hard coding for 16KHz...
73
+ single_feature["wav"] = wav
74
+
75
+ return single_feature
76
+
77
+ def __len__(self):
78
+ return len(self.metadata)
82
+
83
+
84
+ class AutoencoderKLCollator(BaseOfflineCollator):
85
+ def __init__(self, cfg):
86
+ BaseOfflineCollator.__init__(self, cfg)
87
+
88
+ def __call__(self, batch):
89
+ # mel: (B, n_mels, T)
90
+ # wav (option): (B, T)
91
+
92
+ packed_batch_features = dict()
93
+
94
+ for key in batch[0].keys():
95
+ if key == "melspec":
96
+ packed_batch_features["melspec"] = torch.from_numpy(
97
+ np.array([b["melspec"][:, :624] for b in batch])
98
+ )
99
+
100
+ if key == "wav":
101
+ values = [torch.from_numpy(b[key]) for b in batch]
102
+ packed_batch_features[key] = pad_sequence(
103
+ values, batch_first=True, padding_value=0
104
+ )
105
+
106
+ return packed_batch_features
107
+
108
+
109
+ class AutoencoderKLTestDataset(BaseTestDataset): ...
110
+
111
+
112
+ class AutoencoderKLTestCollator(BaseTestCollator): ...
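A minimal sketch of the collation pattern used by ``AutoencoderKLCollator`` above: mel-spectrograms are cropped to a fixed number of frames and stacked, while raw waveforms of different lengths are zero-padded (the 624-frame crop and the feature sizes below are illustrative):

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence

batch = [
    {"melspec": np.random.randn(80, 700), "wav": np.random.randn(16000)},
    {"melspec": np.random.randn(80, 650), "wav": np.random.randn(12000)},
]

# Crop every mel to the same number of frames (assumes each has >= 624 frames).
mels = torch.from_numpy(np.array([b["melspec"][:, :624] for b in batch]))
# Zero-pad waveforms to the longest one in the batch.
wavs = pad_sequence([torch.from_numpy(b["wav"]) for b in batch],
                    batch_first=True, padding_value=0)
print(mels.shape, wavs.shape)  # (2, 80, 624), (2, 16000)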
Amphion/models/tta/ldm/__init__.py ADDED
File without changes
Amphion/models/tta/ldm/audioldm_dataset.py ADDED
@@ -0,0 +1,151 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import random
7
+ import torch
8
+ from torch.nn.utils.rnn import pad_sequence
9
+ from utils.data_utils import *
10
+
11
+
12
+ from models.base.base_dataset import (
13
+ BaseOfflineCollator,
14
+ BaseOfflineDataset,
15
+ BaseTestDataset,
16
+ BaseTestCollator,
17
+ )
18
+ import librosa
19
+
20
+ from transformers import AutoTokenizer
21
+
22
+
23
+ class AudioLDMDataset(BaseOfflineDataset):
24
+ def __init__(self, cfg, dataset, is_valid=False):
25
+ BaseOfflineDataset.__init__(self, cfg, dataset, is_valid=is_valid)
26
+
27
+ self.cfg = cfg
28
+
29
+ # utt2melspec
30
+ if cfg.preprocess.use_melspec:
31
+ self.utt2melspec_path = {}
32
+ for utt_info in self.metadata:
33
+ dataset = utt_info["Dataset"]
34
+ uid = utt_info["Uid"]
35
+ utt = "{}_{}".format(dataset, uid)
36
+
37
+ self.utt2melspec_path[utt] = os.path.join(
38
+ cfg.preprocess.processed_dir,
39
+ dataset,
40
+ cfg.preprocess.melspec_dir,
41
+ uid + ".npy",
42
+ )
43
+
44
+ # utt2wav
45
+ if cfg.preprocess.use_wav:
46
+ self.utt2wav_path = {}
47
+ for utt_info in self.metadata:
48
+ dataset = utt_info["Dataset"]
49
+ uid = utt_info["Uid"]
50
+ utt = "{}_{}".format(dataset, uid)
51
+
52
+ self.utt2wav_path[utt] = os.path.join(
53
+ cfg.preprocess.processed_dir,
54
+ dataset,
55
+ cfg.preprocess.wav_dir,
56
+ uid + ".wav",
57
+ )
58
+
59
+ # utt2caption
60
+ if cfg.preprocess.use_caption:
61
+ self.utt2caption = {}
62
+ for utt_info in self.metadata:
63
+ dataset = utt_info["Dataset"]
64
+ uid = utt_info["Uid"]
65
+ utt = "{}_{}".format(dataset, uid)
66
+
67
+ self.utt2caption[utt] = utt_info["Caption"]
68
+
69
+ def __getitem__(self, index):
70
+ # melspec: (n_mels, T)
71
+ # wav: (T,)
72
+
73
+ single_feature = BaseOfflineDataset.__getitem__(self, index)
74
+
75
+ utt_info = self.metadata[index]
76
+ dataset = utt_info["Dataset"]
77
+ uid = utt_info["Uid"]
78
+ utt = "{}_{}".format(dataset, uid)
79
+
80
+ if self.cfg.preprocess.use_melspec:
81
+ single_feature["melspec"] = np.load(self.utt2melspec_path[utt])
82
+
83
+ if self.cfg.preprocess.use_wav:
84
+ wav, sr = librosa.load(
85
+ self.utt2wav_path[utt], sr=16000
86
+ ) # hard coding for 16KHz...
87
+ single_feature["wav"] = wav
88
+
89
+ if self.cfg.preprocess.use_caption:
90
+ cond_mask = np.random.choice(
91
+ [1, 0],
92
+ p=[
93
+ self.cfg.preprocess.cond_mask_prob,
94
+ 1 - self.cfg.preprocess.cond_mask_prob,
95
+ ],
96
+ ) # (0.1, 0.9)
97
+ if cond_mask:
98
+ single_feature["caption"] = ""
99
+ else:
100
+ single_feature["caption"] = self.utt2caption[utt]
101
+
102
+ return single_feature
103
+
104
+ def __len__(self):
105
+ return len(self.metadata)
106
+
107
+
108
+ class AudioLDMCollator(BaseOfflineCollator):
109
+ def __init__(self, cfg):
110
+ BaseOfflineCollator.__init__(self, cfg)
111
+
112
+ self.tokenizer = AutoTokenizer.from_pretrained("t5-base", model_max_length=512)
113
+
114
+ def __call__(self, batch):
115
+ # mel: (B, n_mels, T)
116
+ # wav (option): (B, T)
117
+ # text_input_ids: (B, L)
118
+ # text_attention_mask: (B, L)
119
+
120
+ packed_batch_features = dict()
121
+
122
+ for key in batch[0].keys():
123
+ if key == "melspec":
124
+ packed_batch_features["melspec"] = torch.from_numpy(
125
+ np.array([b["melspec"][:, :624] for b in batch])
126
+ )
127
+
128
+ if key == "wav":
129
+ values = [torch.from_numpy(b[key]) for b in batch]
130
+ packed_batch_features[key] = pad_sequence(
131
+ values, batch_first=True, padding_value=0
132
+ )
133
+
134
+ if key == "caption":
135
+ captions = [b[key] for b in batch]
136
+ text_input = self.tokenizer(
137
+ captions, return_tensors="pt", truncation=True, padding="longest"
138
+ )
139
+ text_input_ids = text_input["input_ids"]
140
+ text_attention_mask = text_input["attention_mask"]
141
+
142
+ packed_batch_features["text_input_ids"] = text_input_ids
143
+ packed_batch_features["text_attention_mask"] = text_attention_mask
144
+
145
+ return packed_batch_features
146
+
147
+
148
+ class AudioLDMTestDataset(BaseTestDataset): ...
149
+
150
+
151
+ class AudioLDMTestCollator(BaseTestCollator): ...
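A small sketch of the two caption-related steps above: randomly blanking a caption with probability ``cond_mask_prob`` (classifier-free-guidance-style conditioning dropout) and batch-tokenizing captions with the T5 tokenizer (the probability and captions are placeholders):

import numpy as np
from transformers import AutoTokenizer

cond_mask_prob = 0.1
caption = "a dog barking in the distance"
if np.random.choice([1, 0], p=[cond_mask_prob, 1 - cond_mask_prob]):
    caption = ""  # drop the text condition for this sample

tokenizer = AutoTokenizer.from_pretrained("t5-base", model_max_length=512)
text_input = tokenizer([caption, "rain on a tin roof"],
                       return_tensors="pt", truncation=True, padding="longest")
print(text_input["input_ids"].shape, text_input["attention_mask"].shape)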
Amphion/models/tta/ldm/audioldm_trainer.py ADDED
@@ -0,0 +1,251 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from models.base.base_trainer import BaseTrainer
7
+ from diffusers import DDPMScheduler
8
+ from models.tta.ldm.audioldm_dataset import AudioLDMDataset, AudioLDMCollator
9
+ from models.tta.autoencoder.autoencoder import AutoencoderKL
10
+ from models.tta.ldm.audioldm import AudioLDM, UNetModel
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch.nn import MSELoss, L1Loss
14
+ import torch.nn.functional as F
15
+ from torch.utils.data import ConcatDataset, DataLoader
16
+
17
+ from transformers import T5EncoderModel
19
+
20
+
21
+ class AudioLDMTrainer(BaseTrainer):
22
+ def __init__(self, args, cfg):
23
+ BaseTrainer.__init__(self, args, cfg)
24
+ self.cfg = cfg
25
+
26
+ self.build_autoencoderkl()
27
+ self.build_textencoder()
28
+ self.noise_scheduler = self.build_noise_scheduler()
29
+
30
+ self.save_config_file()
31
+
32
+ def build_autoencoderkl(self):
33
+ self.autoencoderkl = AutoencoderKL(self.cfg.model.autoencoderkl)
34
+ self.autoencoder_path = self.cfg.model.autoencoder_path
35
+ checkpoint = torch.load(self.autoencoder_path, map_location="cpu")
36
+ self.autoencoderkl.load_state_dict(checkpoint["model"])
37
+ self.autoencoderkl.cuda(self.args.local_rank)
38
+ self.autoencoderkl.requires_grad_(requires_grad=False)
39
+ self.autoencoderkl.eval()
40
+
41
+ def build_textencoder(self):
42
+ self.text_encoder = T5EncoderModel.from_pretrained("t5-base")
43
+ self.text_encoder.cuda(self.args.local_rank)
44
+ self.text_encoder.requires_grad_(requires_grad=False)
45
+ self.text_encoder.eval()
46
+
47
+ def build_noise_scheduler(self):
48
+ noise_scheduler = DDPMScheduler(
49
+ num_train_timesteps=self.cfg.model.noise_scheduler.num_train_timesteps,
50
+ beta_start=self.cfg.model.noise_scheduler.beta_start,
51
+ beta_end=self.cfg.model.noise_scheduler.beta_end,
52
+ beta_schedule=self.cfg.model.noise_scheduler.beta_schedule,
53
+ clip_sample=self.cfg.model.noise_scheduler.clip_sample,
54
+ # steps_offset=self.cfg.model.noise_scheduler.steps_offset,
55
+ # set_alpha_to_one=self.cfg.model.noise_scheduler.set_alpha_to_one,
56
+ # skip_prk_steps=self.cfg.model.noise_scheduler.skip_prk_steps,
57
+ prediction_type=self.cfg.model.noise_scheduler.prediction_type,
58
+ )
59
+ return noise_scheduler
60
+
61
+ def build_dataset(self):
62
+ return AudioLDMDataset, AudioLDMCollator
63
+
64
+ def build_data_loader(self):
65
+ Dataset, Collator = self.build_dataset()
66
+ # build dataset instance for each dataset and combine them by ConcatDataset
67
+ datasets_list = []
68
+ for dataset in self.cfg.dataset:
69
+ subdataset = Dataset(self.cfg, dataset, is_valid=False)
70
+ datasets_list.append(subdataset)
71
+ train_dataset = ConcatDataset(datasets_list)
72
+
73
+ train_collate = Collator(self.cfg)
74
+
75
+ # use batch_sampler argument instead of (sampler, shuffle, drop_last, batch_size)
76
+ train_loader = DataLoader(
77
+ train_dataset,
78
+ collate_fn=train_collate,
79
+ num_workers=self.args.num_workers,
80
+ batch_size=self.cfg.train.batch_size,
81
+ pin_memory=False,
82
+ )
83
+ if not self.cfg.train.ddp or self.args.local_rank == 0:
84
+ datasets_list = []
85
+ for dataset in self.cfg.dataset:
86
+ subdataset = Dataset(self.cfg, dataset, is_valid=True)
87
+ datasets_list.append(subdataset)
88
+ valid_dataset = ConcatDataset(datasets_list)
89
+ valid_collate = Collator(self.cfg)
90
+
91
+ valid_loader = DataLoader(
92
+ valid_dataset,
93
+ collate_fn=valid_collate,
94
+ num_workers=1,
95
+ batch_size=self.cfg.train.batch_size,
96
+ )
97
+ else:
98
+ raise NotImplementedError("DDP is not supported yet.")
99
+ # valid_loader = None
100
+ data_loader = {"train": train_loader, "valid": valid_loader}
101
+ return data_loader
102
+
103
+ def build_optimizer(self):
104
+ optimizer = torch.optim.AdamW(self.model.parameters(), **self.cfg.train.adam)
105
+ return optimizer
106
+
107
+ # TODO: check it...
108
+ def build_scheduler(self):
109
+ return None
110
+ # return ReduceLROnPlateau(self.optimizer["opt_ae"], **self.cfg.train.lronPlateau)
111
+
112
+ def write_summary(self, losses, stats):
113
+ for key, value in losses.items():
114
+ self.sw.add_scalar(key, value, self.step)
115
+
116
+ def write_valid_summary(self, losses, stats):
117
+ for key, value in losses.items():
118
+ self.sw.add_scalar(key, value, self.step)
119
+
120
+ def build_criterion(self):
121
+ criterion = nn.MSELoss(reduction="mean")
122
+ return criterion
123
+
124
+ def get_state_dict(self):
125
+ if self.scheduler != None:
126
+ state_dict = {
127
+ "model": self.model.state_dict(),
128
+ "optimizer": self.optimizer.state_dict(),
129
+ "scheduler": self.scheduler.state_dict(),
130
+ "step": self.step,
131
+ "epoch": self.epoch,
132
+ "batch_size": self.cfg.train.batch_size,
133
+ }
134
+ else:
135
+ state_dict = {
136
+ "model": self.model.state_dict(),
137
+ "optimizer": self.optimizer.state_dict(),
138
+ "step": self.step,
139
+ "epoch": self.epoch,
140
+ "batch_size": self.cfg.train.batch_size,
141
+ }
142
+ return state_dict
143
+
144
+ def load_model(self, checkpoint):
145
+ self.step = checkpoint["step"]
146
+ self.epoch = checkpoint["epoch"]
147
+
148
+ self.model.load_state_dict(checkpoint["model"])
149
+ self.optimizer.load_state_dict(checkpoint["optimizer"])
150
+ if self.scheduler != None:
151
+ self.scheduler.load_state_dict(checkpoint["scheduler"])
152
+
153
+ def build_model(self):
154
+ self.model = AudioLDM(self.cfg.model.audioldm)
155
+ return self.model
156
+
157
+ @torch.no_grad()
158
+ def mel_to_latent(self, melspec):
159
+ posterior = self.autoencoderkl.encode(melspec)
160
+ latent = posterior.sample() # (B, 4, 5, 78)
161
+ return latent
162
+
163
+ @torch.no_grad()
164
+ def get_text_embedding(self, text_input_ids, text_attention_mask):
165
+ text_embedding = self.text_encoder(
166
+ input_ids=text_input_ids, attention_mask=text_attention_mask
167
+ ).last_hidden_state
168
+ return text_embedding # (B, T, 768)
169
+
170
+ def train_step(self, data):
171
+ train_losses = {}
172
+ total_loss = 0
173
+ train_stats = {}
174
+
175
+ melspec = data["melspec"].unsqueeze(1) # (B, 80, T) -> (B, 1, 80, T)
176
+ latents = self.mel_to_latent(melspec)
177
+
178
+ text_embedding = self.get_text_embedding(
179
+ data["text_input_ids"], data["text_attention_mask"]
180
+ )
181
+
182
+ noise = torch.randn_like(latents).float()
183
+
184
+ bsz = latents.shape[0]
185
+ timesteps = torch.randint(
186
+ 0,
187
+ self.cfg.model.noise_scheduler.num_train_timesteps,
188
+ (bsz,),
189
+ device=latents.device,
190
+ )
191
+ timesteps = timesteps.long()
192
+
193
+ with torch.no_grad():
194
+ noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)
195
+
196
+ model_pred = self.model(
197
+ noisy_latents, timesteps=timesteps, context=text_embedding
198
+ )
199
+
200
+ loss = self.criterion(model_pred, noise)
201
+
202
+ train_losses["loss"] = loss
203
+ total_loss += loss
204
+
205
+ self.optimizer.zero_grad()
206
+ total_loss.backward()
207
+ self.optimizer.step()
208
+
209
+ for item in train_losses:
210
+ train_losses[item] = train_losses[item].item()
211
+
212
+ return train_losses, train_stats, total_loss.item()
213
+
214
+ # TODO: eval step
215
+ @torch.no_grad()
216
+ def eval_step(self, data, index):
217
+ valid_loss = {}
218
+ total_valid_loss = 0
219
+ valid_stats = {}
220
+
221
+ melspec = data["melspec"].unsqueeze(1) # (B, 80, T) -> (B, 1, 80, T)
222
+ latents = self.mel_to_latent(melspec)
223
+
224
+ text_embedding = self.get_text_embedding(
225
+ data["text_input_ids"], data["text_attention_mask"]
226
+ )
227
+
228
+ noise = torch.randn_like(latents).float()
229
+
230
+ bsz = latents.shape[0]
231
+ timesteps = torch.randint(
232
+ 0,
233
+ self.cfg.model.noise_scheduler.num_train_timesteps,
234
+ (bsz,),
235
+ device=latents.device,
236
+ )
237
+ timesteps = timesteps.long()
238
+
239
+ noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)
240
+
241
+ model_pred = self.model(noisy_latents, timesteps, text_embedding)
242
+
243
+ loss = self.criterion(model_pred, noise)
244
+ valid_loss["loss"] = loss
245
+
246
+ total_valid_loss += loss
247
+
248
+ for item in valid_loss:
249
+ valid_loss[item] = valid_loss[item].item()
250
+
251
+ return valid_loss, valid_stats, total_valid_loss.item()
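The training and evaluation steps above follow the standard denoising objective: add noise to the latents at a random timestep and regress the model output onto that noise with MSE. A toy, self-contained sketch with a stand-in network (the scheduler settings and the tiny conv model are placeholders, not the project config):

import torch
import torch.nn as nn
from diffusers import DDPMScheduler

scheduler = DDPMScheduler(num_train_timesteps=1000)
model = nn.Conv2d(4, 4, kernel_size=3, padding=1)  # stand-in for the UNet
criterion = nn.MSELoss(reduction="mean")

latents = torch.randn(2, 4, 5, 78)                 # fake VAE latents
noise = torch.randn_like(latents)
timesteps = torch.randint(0, 1000, (latents.shape[0],)).long()

noisy_latents = scheduler.add_noise(latents, noise, timesteps)
loss = criterion(model(noisy_latents), noise)      # predict the added noise
loss.backward()
print(loss.item())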
Amphion/models/tta/ldm/inference_utils/vocoder.py ADDED
@@ -0,0 +1,408 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import torch.nn as nn
9
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
11
+ from models.tta.ldm.inference_utils.utils import get_padding, init_weights
12
+
13
+ LRELU_SLOPE = 0.1
14
+
15
+
16
+ class ResBlock1(torch.nn.Module):
17
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
18
+ super(ResBlock1, self).__init__()
19
+ self.h = h
20
+ self.convs1 = nn.ModuleList(
21
+ [
22
+ weight_norm(
23
+ Conv1d(
24
+ channels,
25
+ channels,
26
+ kernel_size,
27
+ 1,
28
+ dilation=dilation[0],
29
+ padding=get_padding(kernel_size, dilation[0]),
30
+ )
31
+ ),
32
+ weight_norm(
33
+ Conv1d(
34
+ channels,
35
+ channels,
36
+ kernel_size,
37
+ 1,
38
+ dilation=dilation[1],
39
+ padding=get_padding(kernel_size, dilation[1]),
40
+ )
41
+ ),
42
+ weight_norm(
43
+ Conv1d(
44
+ channels,
45
+ channels,
46
+ kernel_size,
47
+ 1,
48
+ dilation=dilation[2],
49
+ padding=get_padding(kernel_size, dilation[2]),
50
+ )
51
+ ),
52
+ ]
53
+ )
54
+ self.convs1.apply(init_weights)
55
+
56
+ self.convs2 = nn.ModuleList(
57
+ [
58
+ weight_norm(
59
+ Conv1d(
60
+ channels,
61
+ channels,
62
+ kernel_size,
63
+ 1,
64
+ dilation=1,
65
+ padding=get_padding(kernel_size, 1),
66
+ )
67
+ ),
68
+ weight_norm(
69
+ Conv1d(
70
+ channels,
71
+ channels,
72
+ kernel_size,
73
+ 1,
74
+ dilation=1,
75
+ padding=get_padding(kernel_size, 1),
76
+ )
77
+ ),
78
+ weight_norm(
79
+ Conv1d(
80
+ channels,
81
+ channels,
82
+ kernel_size,
83
+ 1,
84
+ dilation=1,
85
+ padding=get_padding(kernel_size, 1),
86
+ )
87
+ ),
88
+ ]
89
+ )
90
+ self.convs2.apply(init_weights)
91
+
92
+ def forward(self, x):
93
+ for c1, c2 in zip(self.convs1, self.convs2):
94
+ xt = F.leaky_relu(x, LRELU_SLOPE)
95
+ xt = c1(xt)
96
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
97
+ xt = c2(xt)
98
+ x = xt + x
99
+ return x
100
+
101
+ def remove_weight_norm(self):
102
+ for l in self.convs1:
103
+ remove_weight_norm(l)
104
+ for l in self.convs2:
105
+ remove_weight_norm(l)
106
+
107
+
108
+ class ResBlock2(torch.nn.Module):
109
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
110
+ super(ResBlock2, self).__init__()
111
+ self.h = h
112
+ self.convs = nn.ModuleList(
113
+ [
114
+ weight_norm(
115
+ Conv1d(
116
+ channels,
117
+ channels,
118
+ kernel_size,
119
+ 1,
120
+ dilation=dilation[0],
121
+ padding=get_padding(kernel_size, dilation[0]),
122
+ )
123
+ ),
124
+ weight_norm(
125
+ Conv1d(
126
+ channels,
127
+ channels,
128
+ kernel_size,
129
+ 1,
130
+ dilation=dilation[1],
131
+ padding=get_padding(kernel_size, dilation[1]),
132
+ )
133
+ ),
134
+ ]
135
+ )
136
+ self.convs.apply(init_weights)
137
+
138
+ def forward(self, x):
139
+ for c in self.convs:
140
+ xt = F.leaky_relu(x, LRELU_SLOPE)
141
+ xt = c(xt)
142
+ x = xt + x
143
+ return x
144
+
145
+ def remove_weight_norm(self):
146
+ for l in self.convs:
147
+ remove_weight_norm(l)
148
+
149
+
150
+ class Generator(torch.nn.Module):
151
+ def __init__(self, h):
152
+ super(Generator, self).__init__()
153
+ self.h = h
154
+ self.num_kernels = len(h.resblock_kernel_sizes)
155
+ self.num_upsamples = len(h.upsample_rates)
156
+ self.conv_pre = weight_norm(
157
+ Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)
158
+ )
159
+ resblock = ResBlock1 if h.resblock == "1" else ResBlock2
160
+
161
+ self.ups = nn.ModuleList()
162
+ for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
163
+ self.ups.append(
164
+ weight_norm(
165
+ ConvTranspose1d(
166
+ h.upsample_initial_channel // (2**i),
167
+ h.upsample_initial_channel // (2 ** (i + 1)),
168
+ k,
169
+ u,
170
+ padding=(k - u) // 2,
171
+ )
172
+ )
173
+ )
174
+
175
+ self.resblocks = nn.ModuleList()
176
+ for i in range(len(self.ups)):
177
+ ch = h.upsample_initial_channel // (2 ** (i + 1))
178
+ for j, (k, d) in enumerate(
179
+ zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
180
+ ):
181
+ self.resblocks.append(resblock(h, ch, k, d))
182
+
183
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
184
+ self.ups.apply(init_weights)
185
+ self.conv_post.apply(init_weights)
186
+
187
+ def forward(self, x):
188
+ x = self.conv_pre(x)
189
+ for i in range(self.num_upsamples):
190
+ x = F.leaky_relu(x, LRELU_SLOPE)
191
+ x = self.ups[i](x)
192
+ xs = None
193
+ for j in range(self.num_kernels):
194
+ if xs is None:
195
+ xs = self.resblocks[i * self.num_kernels + j](x)
196
+ else:
197
+ xs += self.resblocks[i * self.num_kernels + j](x)
198
+ x = xs / self.num_kernels
199
+ x = F.leaky_relu(x)
200
+ x = self.conv_post(x)
201
+ x = torch.tanh(x)
202
+
203
+ return x
204
+
205
+ def remove_weight_norm(self):
206
+ print("Removing weight norm...")
207
+ for l in self.ups:
208
+ remove_weight_norm(l)
209
+ for l in self.resblocks:
210
+ l.remove_weight_norm()
211
+ remove_weight_norm(self.conv_pre)
212
+ remove_weight_norm(self.conv_post)
213
+
214
+
215
+ class DiscriminatorP(torch.nn.Module):
216
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
217
+ super(DiscriminatorP, self).__init__()
218
+ self.period = period
219
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
220
+ self.convs = nn.ModuleList(
221
+ [
222
+ norm_f(
223
+ Conv2d(
224
+ 1,
225
+ 32,
226
+ (kernel_size, 1),
227
+ (stride, 1),
228
+ padding=(get_padding(5, 1), 0),
229
+ )
230
+ ),
231
+ norm_f(
232
+ Conv2d(
233
+ 32,
234
+ 128,
235
+ (kernel_size, 1),
236
+ (stride, 1),
237
+ padding=(get_padding(5, 1), 0),
238
+ )
239
+ ),
240
+ norm_f(
241
+ Conv2d(
242
+ 128,
243
+ 512,
244
+ (kernel_size, 1),
245
+ (stride, 1),
246
+ padding=(get_padding(5, 1), 0),
247
+ )
248
+ ),
249
+ norm_f(
250
+ Conv2d(
251
+ 512,
252
+ 1024,
253
+ (kernel_size, 1),
254
+ (stride, 1),
255
+ padding=(get_padding(5, 1), 0),
256
+ )
257
+ ),
258
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
259
+ ]
260
+ )
261
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
262
+
263
+ def forward(self, x):
264
+ fmap = []
265
+
266
+ # 1d to 2d
267
+ b, c, t = x.shape
268
+ if t % self.period != 0: # pad first
269
+ n_pad = self.period - (t % self.period)
270
+ x = F.pad(x, (0, n_pad), "reflect")
271
+ t = t + n_pad
272
+ x = x.view(b, c, t // self.period, self.period)
273
+
274
+ for l in self.convs:
275
+ x = l(x)
276
+ x = F.leaky_relu(x, LRELU_SLOPE)
277
+ fmap.append(x)
278
+ x = self.conv_post(x)
279
+ fmap.append(x)
280
+ x = torch.flatten(x, 1, -1)
281
+
282
+ return x, fmap
283
+
284
+
285
+ class MultiPeriodDiscriminator(torch.nn.Module):
286
+ def __init__(self):
287
+ super(MultiPeriodDiscriminator, self).__init__()
288
+ self.discriminators = nn.ModuleList(
289
+ [
290
+ DiscriminatorP(2),
291
+ DiscriminatorP(3),
292
+ DiscriminatorP(5),
293
+ DiscriminatorP(7),
294
+ DiscriminatorP(11),
295
+ ]
296
+ )
297
+
298
+ def forward(self, y, y_hat):
299
+ y_d_rs = []
300
+ y_d_gs = []
301
+ fmap_rs = []
302
+ fmap_gs = []
303
+ for i, d in enumerate(self.discriminators):
304
+ y_d_r, fmap_r = d(y)
305
+ y_d_g, fmap_g = d(y_hat)
306
+ y_d_rs.append(y_d_r)
307
+ fmap_rs.append(fmap_r)
308
+ y_d_gs.append(y_d_g)
309
+ fmap_gs.append(fmap_g)
310
+
311
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
312
+
313
+
314
+ class DiscriminatorS(torch.nn.Module):
315
+ def __init__(self, use_spectral_norm=False):
316
+ super(DiscriminatorS, self).__init__()
317
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
318
+ self.convs = nn.ModuleList(
319
+ [
320
+ norm_f(Conv1d(1, 128, 15, 1, padding=7)),
321
+ norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
322
+ norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
323
+ norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
324
+ norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
325
+ norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
326
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
327
+ ]
328
+ )
329
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
330
+
331
+ def forward(self, x):
332
+ fmap = []
333
+ for l in self.convs:
334
+ x = l(x)
335
+ x = F.leaky_relu(x, LRELU_SLOPE)
336
+ fmap.append(x)
337
+ x = self.conv_post(x)
338
+ fmap.append(x)
339
+ x = torch.flatten(x, 1, -1)
340
+
341
+ return x, fmap
342
+
343
+
344
+ class MultiScaleDiscriminator(torch.nn.Module):
345
+ def __init__(self):
346
+ super(MultiScaleDiscriminator, self).__init__()
347
+ self.discriminators = nn.ModuleList(
348
+ [
349
+ DiscriminatorS(use_spectral_norm=True),
350
+ DiscriminatorS(),
351
+ DiscriminatorS(),
352
+ ]
353
+ )
354
+ self.meanpools = nn.ModuleList(
355
+ [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)]
356
+ )
357
+
358
+ def forward(self, y, y_hat):
359
+ y_d_rs = []
360
+ y_d_gs = []
361
+ fmap_rs = []
362
+ fmap_gs = []
363
+ for i, d in enumerate(self.discriminators):
364
+ if i != 0:
365
+ y = self.meanpools[i - 1](y)
366
+ y_hat = self.meanpools[i - 1](y_hat)
367
+ y_d_r, fmap_r = d(y)
368
+ y_d_g, fmap_g = d(y_hat)
369
+ y_d_rs.append(y_d_r)
370
+ fmap_rs.append(fmap_r)
371
+ y_d_gs.append(y_d_g)
372
+ fmap_gs.append(fmap_g)
373
+
374
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
375
+
376
+
377
+ def feature_loss(fmap_r, fmap_g):
378
+ loss = 0
379
+ for dr, dg in zip(fmap_r, fmap_g):
380
+ for rl, gl in zip(dr, dg):
381
+ loss += torch.mean(torch.abs(rl - gl))
382
+
383
+ return loss * 2
384
+
385
+
386
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
387
+ loss = 0
388
+ r_losses = []
389
+ g_losses = []
390
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
391
+ r_loss = torch.mean((1 - dr) ** 2)
392
+ g_loss = torch.mean(dg**2)
393
+ loss += r_loss + g_loss
394
+ r_losses.append(r_loss.item())
395
+ g_losses.append(g_loss.item())
396
+
397
+ return loss, r_losses, g_losses
398
+
399
+
400
+ def generator_loss(disc_outputs):
401
+ loss = 0
402
+ gen_losses = []
403
+ for dg in disc_outputs:
404
+ l = torch.mean((1 - dg) ** 2)
405
+ gen_losses.append(l)
406
+ loss += l
407
+
408
+ return loss, gen_losses
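The ``Generator`` above upsamples 80-bin mel frames to waveform samples; the total upsampling factor is the product of ``upsample_rates`` and should match the hop size used to extract the mels. A quick sketch with typical HiFi-GAN hyperparameters (these values are assumptions, not necessarily this repository's config; it assumes the ``Generator`` class above is importable):

import torch
from types import SimpleNamespace

h = SimpleNamespace(
    resblock="1",
    upsample_rates=[8, 8, 2, 2],              # product = 256 = assumed hop size
    upsample_kernel_sizes=[16, 16, 4, 4],
    upsample_initial_channel=512,
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
)
g = Generator(h)                               # class defined above
mel = torch.randn(1, 80, 100)                  # 100 mel frames
wav = g(mel)
print(wav.shape)                               # (1, 1, 25600) = 100 frames * 256 samples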
Amphion/models/tts/base/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # from .tts_inferece import TTSInference
7
+ from .tts_trainer import TTSTrainer
Amphion/models/tts/base/tts_trainer.py ADDED
@@ -0,0 +1,721 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+ import shutil
9
+ import torch
10
+ import time
11
+ from pathlib import Path
12
+ import torch
13
+ from tqdm import tqdm
14
+ import re
15
+ import logging
16
+ import json5
17
+ import accelerate
18
+ from accelerate.logging import get_logger
19
+ from accelerate.utils import ProjectConfiguration
20
+ from torch.utils.data import ConcatDataset, DataLoader
21
+ from accelerate import DistributedDataParallelKwargs
22
+ from schedulers.scheduler import Eden
23
+ from models.base.base_sampler import build_samplers
24
+ from models.base.new_trainer import BaseTrainer
25
+
26
+
27
+ class TTSTrainer(BaseTrainer):
28
+ r"""The base trainer for all TTS models. It inherits from BaseTrainer and implements
29
+ ``_build_criterion``, ``_build_dataset`` and ``_build_speaker_lut`` methods. You can inherit from this
30
+ class, and implement ``_build_model``, ``_forward_step``.
31
+ """
32
+
33
+ def __init__(self, args=None, cfg=None):
34
+ self.args = args
35
+ self.cfg = cfg
36
+
37
+ cfg.exp_name = args.exp_name
38
+
39
+ # init with accelerate
40
+ self._init_accelerator()
41
+ self.accelerator.wait_for_everyone()
42
+
43
+ with self.accelerator.main_process_first():
44
+ self.logger = get_logger(args.exp_name, log_level="INFO")
45
+
46
+ # Log some info
47
+ self.logger.info("=" * 56)
48
+ self.logger.info("||\t\t" + "New training process started." + "\t\t||")
49
+ self.logger.info("=" * 56)
50
+ self.logger.info("\n")
51
+ self.logger.debug(f"Using {args.log_level.upper()} logging level.")
52
+ self.logger.info(f"Experiment name: {args.exp_name}")
53
+ self.logger.info(f"Experiment directory: {self.exp_dir}")
54
+ self.checkpoint_dir = os.path.join(self.exp_dir, "checkpoint")
55
+ if self.accelerator.is_main_process:
56
+ os.makedirs(self.checkpoint_dir, exist_ok=True)
57
+ self.logger.debug(f"Checkpoint directory: {self.checkpoint_dir}")
58
+
59
+ # init counts
60
+ self.batch_count: int = 0
61
+ self.step: int = 0
62
+ self.epoch: int = 0
63
+ self.max_epoch = (
64
+ self.cfg.train.max_epoch if self.cfg.train.max_epoch > 0 else float("inf")
65
+ )
66
+ self.logger.info(
67
+ "Max epoch: {}".format(
68
+ self.max_epoch if self.max_epoch < float("inf") else "Unlimited"
69
+ )
70
+ )
71
+
72
+ # Check values
73
+ if self.accelerator.is_main_process:
74
+ self.__check_basic_configs()
75
+ # Set runtime configs
76
+ self.save_checkpoint_stride = self.cfg.train.save_checkpoint_stride
77
+ self.checkpoints_path = [
78
+ [] for _ in range(len(self.save_checkpoint_stride))
79
+ ]
80
+ self.keep_last = [
81
+ i if i > 0 else float("inf") for i in self.cfg.train.keep_last
82
+ ]
83
+ self.run_eval = self.cfg.train.run_eval
84
+
85
+ # set random seed
86
+ with self.accelerator.main_process_first():
87
+ start = time.monotonic_ns()
88
+ self._set_random_seed(self.cfg.train.random_seed)
89
+ end = time.monotonic_ns()
90
+ self.logger.debug(
91
+ f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
92
+ )
93
+ self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")
94
+
95
+ # setup data_loader
96
+ with self.accelerator.main_process_first():
97
+ self.logger.info("Building dataset...")
98
+ start = time.monotonic_ns()
99
+ self.train_dataloader, self.valid_dataloader = self._build_dataloader()
100
+ end = time.monotonic_ns()
101
+ self.logger.info(f"Building dataset done in {(end - start) / 1e6:.2f}ms")
102
+
103
+ # save phone table to exp dir. Should be done before building model due to loading phone table in model
104
+ if cfg.preprocess.use_phone and cfg.preprocess.phone_extractor != "lexicon":
105
+ self._save_phone_symbols_file_to_exp_path()
106
+
107
+ # setup model
108
+ with self.accelerator.main_process_first():
109
+ self.logger.info("Building model...")
110
+ start = time.monotonic_ns()
111
+ self.model = self._build_model()
112
+ end = time.monotonic_ns()
113
+ self.logger.debug(self.model)
114
+ self.logger.info(f"Building model done in {(end - start) / 1e6:.2f}ms")
115
+ self.logger.info(
116
+ f"Model parameters: {self.__count_parameters(self.model)/1e6:.2f}M"
117
+ )
118
+
119
+ # optimizer & scheduler
120
+ with self.accelerator.main_process_first():
121
+ self.logger.info("Building optimizer and scheduler...")
122
+ start = time.monotonic_ns()
123
+ self.optimizer = self._build_optimizer()
124
+ self.scheduler = self._build_scheduler()
125
+ end = time.monotonic_ns()
126
+ self.logger.info(
127
+ f"Building optimizer and scheduler done in {(end - start) / 1e6:.2f}ms"
128
+ )
129
+
130
+ # create criterion
131
+ with self.accelerator.main_process_first():
132
+ self.logger.info("Building criterion...")
133
+ start = time.monotonic_ns()
134
+ self.criterion = self._build_criterion()
135
+ end = time.monotonic_ns()
136
+ self.logger.info(f"Building criterion done in {(end - start) / 1e6:.2f}ms")
137
+
138
+ # Resume or Finetune
139
+ with self.accelerator.main_process_first():
140
+ self._check_resume()
141
+
142
+ # accelerate prepare
143
+ self.logger.info("Initializing accelerate...")
144
+ start = time.monotonic_ns()
145
+ self._accelerator_prepare()
146
+ end = time.monotonic_ns()
147
+ self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.2f}ms")
148
+
149
+ # save config file path
150
+ self.config_save_path = os.path.join(self.exp_dir, "args.json")
151
+ self.device = self.accelerator.device
152
+
153
+ if cfg.preprocess.use_spkid and cfg.train.multi_speaker_training:
154
+ self.speakers = self._build_speaker_lut()
155
+ self.utt2spk_dict = self._build_utt2spk_dict()
156
+
157
+ # Only for TTS tasks
158
+ self.task_type = "TTS"
159
+ self.logger.info("Task type: {}".format(self.task_type))
160
+
161
+ def _check_resume(self):
162
+ # if args.resume:
163
+ if self.args.resume or (
164
+ self.cfg.model_type == "VALLE" and self.args.train_stage == 2
165
+ ):
166
+ checkpoint_dir = self.checkpoint_dir
167
+ if self.cfg.model_type == "VALLE" and self.args.train_stage == 2:
168
+ ls = [str(i) for i in Path(checkpoint_dir).glob("*")]
169
+ if (
170
+ self.args.checkpoint_path is None or len(ls) == 0
171
+ ): # Train stage 2 from scratch using the checkpoint of stage 1
172
+ assert (
173
+ self.args.ar_model_ckpt_dir is not None
174
+ ), "Error: ar_model_ckpt_dir should be set to train nar model."
175
+ self.args.resume_type = "finetune"
176
+ checkpoint_dir = self.args.ar_model_ckpt_dir
177
+ self.logger.info(
178
+ f"Training NAR model at stage 2 using the checkpoint of AR model at stage 1."
179
+ )
180
+
181
+ self.logger.info(f"Resuming from checkpoint: {checkpoint_dir}")
182
+ start = time.monotonic_ns()
183
+ self.ckpt_path = self._load_model(
184
+ checkpoint_dir, self.args.checkpoint_path, self.args.resume_type
185
+ )
186
+ self.logger.info(f"Checkpoint path: {self.ckpt_path}")
187
+ end = time.monotonic_ns()
188
+ self.logger.info(
189
+ f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
190
+ )
191
+ self.checkpoints_path = json.load(
192
+ open(os.path.join(self.ckpt_path, "ckpts.json"), "r")
193
+ )
194
+
195
+ def _init_accelerator(self):
196
+ self.exp_dir = os.path.join(
197
+ os.path.abspath(self.cfg.log_dir), self.args.exp_name
198
+ )
199
+ project_config = ProjectConfiguration(
200
+ project_dir=self.exp_dir,
201
+ logging_dir=os.path.join(self.exp_dir, "log"),
202
+ )
203
+ kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
204
+ self.accelerator = accelerate.Accelerator(
205
+ gradient_accumulation_steps=self.cfg.train.gradient_accumulation_step,
206
+ log_with=self.cfg.train.tracker,
207
+ project_config=project_config,
208
+ kwargs_handlers=[kwargs],
209
+ )
210
+ if self.accelerator.is_main_process:
211
+ os.makedirs(project_config.project_dir, exist_ok=True)
212
+ os.makedirs(project_config.logging_dir, exist_ok=True)
213
+ with self.accelerator.main_process_first():
214
+ self.accelerator.init_trackers(self.args.exp_name)
215
+
216
+ def _accelerator_prepare(self):
217
+ (
218
+ self.train_dataloader,
219
+ self.valid_dataloader,
220
+ ) = self.accelerator.prepare(
221
+ self.train_dataloader,
222
+ self.valid_dataloader,
223
+ )
224
+
225
+ if isinstance(self.model, dict):
226
+ for key in self.model.keys():
227
+ self.model[key] = self.accelerator.prepare(self.model[key])
228
+ else:
229
+ self.model = self.accelerator.prepare(self.model)
230
+
231
+ if isinstance(self.optimizer, dict):
232
+ for key in self.optimizer.keys():
233
+ self.optimizer[key] = self.accelerator.prepare(self.optimizer[key])
234
+ else:
235
+ self.optimizer = self.accelerator.prepare(self.optimizer)
236
+
237
+ if isinstance(self.scheduler, dict):
238
+ for key in self.scheduler.keys():
239
+ self.scheduler[key] = self.accelerator.prepare(self.scheduler[key])
240
+ else:
241
+ self.scheduler = self.accelerator.prepare(self.scheduler)
242
+
243
+ ### Following are methods only for TTS tasks ###
244
+ def _build_dataset(self):
245
+ pass
246
+
247
+ def _build_criterion(self):
248
+ pass
249
+
250
+ def _build_model(self):
251
+ pass
252
+
253
+ def _build_dataloader(self):
254
+ """Build dataloader which merges a series of datasets."""
255
+ # Build dataset instance for each dataset and combine them by ConcatDataset
256
+ Dataset, Collator = self._build_dataset()
257
+
258
+ # Build train set
259
+ datasets_list = []
260
+ for dataset in self.cfg.dataset:
261
+ subdataset = Dataset(self.cfg, dataset, is_valid=False)
262
+ datasets_list.append(subdataset)
263
+ train_dataset = ConcatDataset(datasets_list)
264
+ train_collate = Collator(self.cfg)
265
+ _, batch_sampler = build_samplers(train_dataset, self.cfg, self.logger, "train")
266
+ train_loader = DataLoader(
267
+ train_dataset,
268
+ collate_fn=train_collate,
269
+ batch_sampler=batch_sampler,
270
+ num_workers=self.cfg.train.dataloader.num_worker,
271
+ pin_memory=self.cfg.train.dataloader.pin_memory,
272
+ )
273
+
274
+ # Build test set
275
+ datasets_list = []
276
+ for dataset in self.cfg.dataset:
277
+ subdataset = Dataset(self.cfg, dataset, is_valid=True)
278
+ datasets_list.append(subdataset)
279
+ valid_dataset = ConcatDataset(datasets_list)
280
+ valid_collate = Collator(self.cfg)
281
+ _, batch_sampler = build_samplers(valid_dataset, self.cfg, self.logger, "valid")
282
+ valid_loader = DataLoader(
283
+ valid_dataset,
284
+ collate_fn=valid_collate,
285
+ batch_sampler=batch_sampler,
286
+ num_workers=self.cfg.train.dataloader.num_worker,
287
+ pin_memory=self.cfg.train.dataloader.pin_memory,
288
+ )
289
+ return train_loader, valid_loader
290
+
291
+ def _build_optimizer(self):
292
+ pass
293
+
294
+ def _build_scheduler(self):
295
+ pass
296
+
297
+ def _load_model(self, checkpoint_dir, checkpoint_path=None, resume_type="resume"):
298
+ """Load model from checkpoint. If a folder is given, it will
299
+ load the latest checkpoint in checkpoint_dir. If a path is given
300
+ it will load the checkpoint specified by checkpoint_path.
301
+ **Only use this method after** ``accelerator.prepare()``.
302
+ """
303
+ if checkpoint_path is None or checkpoint_path == "":
304
+ ls = [str(i) for i in Path(checkpoint_dir).glob("*")]
305
+ ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
306
+ checkpoint_path = ls[0]
307
+ self.logger.info("Load model from {}".format(checkpoint_path))
308
+ print("Load model from {}".format(checkpoint_path))
309
+ if resume_type == "resume":
310
+ self.accelerator.load_state(checkpoint_path)
311
+ self.epoch = int(checkpoint_path.split("_")[-3].split("-")[-1]) + 1
312
+ self.step = int(checkpoint_path.split("_")[-2].split("-")[-1]) + 1
313
+ elif resume_type == "finetune":
314
+ if isinstance(self.model, dict):
315
+ for idx, sub_model in enumerate(self.model.keys()):
316
+ if idx == 0:
317
+ ckpt_name = "pytorch_model.bin"
318
+ else:
319
+ ckpt_name = "pytorch_model_{}.bin".format(idx)
320
+
321
+ self.model[sub_model].load_state_dict(
322
+ torch.load(os.path.join(checkpoint_path, ckpt_name))
323
+ )
324
+ self.model[sub_model].cuda(self.accelerator.device)
325
+ else:
326
+ self.model.load_state_dict(
327
+ torch.load(os.path.join(checkpoint_path, "pytorch_model.bin"))
328
+ )
329
+ self.model.cuda(self.accelerator.device)
330
+ self.logger.info("Load model weights for finetune SUCCESS!")
331
+
332
+ else:
333
+ raise ValueError("Unsupported resume type: {}".format(resume_type))
334
+
335
+ return checkpoint_path
336
+
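As a quick illustration of the directory-name convention `_load_model` relies on, here is a self-contained sketch (toy paths, not repository code) of how the latest checkpoint is picked and how the epoch/step counters are recovered from its name:

```python
# Illustrative only: parsing "epoch-XXXX_step-XXXXXXX_loss-X.XXXXXX" checkpoint
# directory names the same way _load_model does. Paths are invented.
names = [
    "exp/checkpoint/epoch-0003_step-0001500_loss-1.234567",
    "exp/checkpoint/epoch-0010_step-0005000_loss-0.987654",
    "exp/checkpoint/epoch-0007_step-0003500_loss-1.010101",
]

# split("_")[-3] is the piece ending in "epoch-XXXX"; split("-")[-1] is the number
names.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
latest = names[0]
epoch = int(latest.split("_")[-3].split("-")[-1]) + 1  # resume at the next epoch
step = int(latest.split("_")[-2].split("-")[-1]) + 1
print(latest, epoch, step)  # .../epoch-0010_step-0005000_loss-0.987654 11 5001
```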
337
+ ### THIS IS MAIN ENTRY ###
338
+ def train_loop(self):
339
+ r"""Training loop. The public entry of training process."""
340
+ # Wait everyone to prepare before we move on
341
+ self.accelerator.wait_for_everyone()
342
+ # dump config file
343
+ if self.accelerator.is_main_process:
344
+ self.__dump_cfg(self.config_save_path)
345
+
346
+ # self.optimizer.zero_grad()
347
+ # Wait to ensure good to go
348
+
349
+ self.accelerator.wait_for_everyone()
350
+ while self.epoch < self.max_epoch:
351
+ self.logger.info("\n")
352
+ self.logger.info("-" * 32)
353
+ self.logger.info("Epoch {}: ".format(self.epoch))
354
+
355
+ # Do training & validating epoch
356
+ train_total_loss, train_losses = self._train_epoch()
357
+ if isinstance(train_losses, dict):
358
+ for key, loss in train_losses.items():
359
+ self.logger.info(" |- Train/{} Loss: {:.6f}".format(key, loss))
360
+ self.accelerator.log(
361
+ {"Epoch/Train {} Loss".format(key): loss},
362
+ step=self.epoch,
363
+ )
364
+
365
+ valid_total_loss, valid_losses = self._valid_epoch()
366
+ if isinstance(valid_losses, dict):
367
+ for key, loss in valid_losses.items():
368
+ self.logger.info(" |- Valid/{} Loss: {:.6f}".format(key, loss))
369
+ self.accelerator.log(
370
+ {"Epoch/Train {} Loss".format(key): loss},
371
+ step=self.epoch,
372
+ )
373
+
374
+ self.logger.info(" |- Train/Loss: {:.6f}".format(train_total_loss))
375
+ self.logger.info(" |- Valid/Loss: {:.6f}".format(valid_total_loss))
376
+ self.accelerator.log(
377
+ {
378
+ "Epoch/Train Loss": train_total_loss,
379
+ "Epoch/Valid Loss": valid_total_loss,
380
+ },
381
+ step=self.epoch,
382
+ )
383
+
384
+ self.accelerator.wait_for_everyone()
385
+
386
+ # Check if hit save_checkpoint_stride and run_eval
387
+ run_eval = False
388
+ if self.accelerator.is_main_process:
389
+ save_checkpoint = False
390
+ hit_dix = []
391
+ for i, num in enumerate(self.save_checkpoint_stride):
392
+ if self.epoch % num == 0:
393
+ save_checkpoint = True
394
+ hit_dix.append(i)
395
+ run_eval |= self.run_eval[i]
396
+
397
+ self.accelerator.wait_for_everyone()
398
+ if self.accelerator.is_main_process and save_checkpoint:
399
+ path = os.path.join(
400
+ self.checkpoint_dir,
401
+ "epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
402
+ self.epoch, self.step, train_total_loss
403
+ ),
404
+ )
405
+ self.accelerator.save_state(path)
406
+
407
+ json.dump(
408
+ self.checkpoints_path,
409
+ open(os.path.join(path, "ckpts.json"), "w"),
410
+ ensure_ascii=False,
411
+ indent=4,
412
+ )
413
+
414
+ # Remove old checkpoints
415
+ to_remove = []
416
+ for idx in hit_dix:
417
+ self.checkpoints_path[idx].append(path)
418
+ while len(self.checkpoints_path[idx]) > self.keep_last[idx]:
419
+ to_remove.append((idx, self.checkpoints_path[idx].pop(0)))
420
+
421
+ # Search conflicts
422
+ total = set()
423
+ for i in self.checkpoints_path:
424
+ total |= set(i)
425
+ do_remove = set()
426
+ for idx, path in to_remove[::-1]:
427
+ if path in total:
428
+ self.checkpoints_path[idx].insert(0, path)
429
+ else:
430
+ do_remove.add(path)
431
+
432
+ # Remove old checkpoints
433
+ for path in do_remove:
434
+ shutil.rmtree(path, ignore_errors=True)
435
+ self.logger.debug(f"Remove old checkpoint: {path}")
436
+
437
+ self.accelerator.wait_for_everyone()
438
+ if run_eval:
439
+ # TODO: run evaluation
440
+ pass
441
+
442
+ # Update info for each epoch
443
+ self.epoch += 1
444
+
445
+ # Finish training and save final checkpoint
446
+ self.accelerator.wait_for_everyone()
447
+ if self.accelerator.is_main_process:
448
+ path = os.path.join(
449
+ self.checkpoint_dir,
450
+ "final_epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
451
+ self.epoch, self.step, valid_total_loss
452
+ ),
453
+ )
454
+ self.accelerator.save_state(path)
462
+
463
+ json.dump(
464
+ self.checkpoints_path,
465
+ open(os.path.join(path, "ckpts.json"), "w"),
466
+ ensure_ascii=False,
467
+ indent=4,
468
+ )
469
+
470
+ self.accelerator.end_training()
471
+
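The checkpoint bookkeeping in `train_loop` (one list per `save_checkpoint_stride` entry, trimmed to `keep_last`, with a conflict check so a path still referenced by one stride is not deleted by another) can be hard to follow from the code alone. A toy model of the same rotation logic, with invented stride and keep values:

```python
# Toy model of the rotation above; not repository code, values invented.
save_checkpoint_stride = [1, 5]          # save every epoch, and every 5th epoch
keep_last = [2, float("inf")]            # keep last 2 of the first kind, all of the second
checkpoints_path = [[] for _ in save_checkpoint_stride]

def register(epoch, path):
    to_remove = []
    for i, stride in enumerate(save_checkpoint_stride):
        if epoch % stride == 0:
            checkpoints_path[i].append(path)
            while len(checkpoints_path[i]) > keep_last[i]:
                to_remove.append(checkpoints_path[i].pop(0))
    # conflict check: only delete paths no stride still references
    still_referenced = {p for lst in checkpoints_path for p in lst}
    return [p for p in to_remove if p not in still_referenced]

for e in range(1, 8):
    print(e, register(e, f"ckpt/epoch-{e:04d}"))
# epoch 5's checkpoint survives the stride-1 rotation because the stride-5 list still holds it
```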
472
+ ### Following are methods that can be used directly in child classes ###
473
+ def _train_epoch(self):
474
+ r"""Training epoch. Should return average loss of a batch (sample) over
475
+ one epoch. See ``train_loop`` for usage.
476
+ """
477
+ if isinstance(self.model, dict):
478
+ for key in self.model.keys():
479
+ self.model[key].train()
480
+ else:
481
+ self.model.train()
482
+
483
+ epoch_sum_loss: float = 0.0
484
+ epoch_losses: dict = {}
485
+ epoch_step: int = 0
486
+ for batch in tqdm(
487
+ self.train_dataloader,
488
+ desc=f"Training Epoch {self.epoch}",
489
+ unit="batch",
490
+ colour="GREEN",
491
+ leave=False,
492
+ dynamic_ncols=True,
493
+ smoothing=0.04,
494
+ disable=not self.accelerator.is_main_process,
495
+ ):
496
+ # Do training step and BP
497
+ with self.accelerator.accumulate(self.model):
498
+ total_loss, train_losses, _ = self._train_step(batch)
499
+ self.batch_count += 1
500
+
501
+ # Update info for each step
502
+ # TODO: step means BP counts or batch counts?
503
+ if self.batch_count % self.cfg.train.gradient_accumulation_step == 0:
504
+ if isinstance(self.scheduler, dict):
505
+ for key in self.scheduler.keys():
506
+ self.scheduler[key].step()
507
+ else:
508
+ if isinstance(self.scheduler, Eden):
509
+ self.scheduler.step_batch(self.step)
510
+ else:
511
+ self.scheduler.step()
512
+
513
+ epoch_sum_loss += total_loss
514
+
515
+ if isinstance(train_losses, dict):
516
+ for key, value in train_losses.items():
517
+ if key not in epoch_losses.keys():
+ epoch_losses[key] = value
+ else:
+ epoch_losses[key] += value
518
+
519
+ if isinstance(train_losses, dict):
520
+ for key, loss in train_losses.items():
521
+ self.accelerator.log(
522
+ {"Epoch/Train {} Loss".format(key): loss},
523
+ step=self.step,
524
+ )
525
+
526
+ self.step += 1
527
+ epoch_step += 1
528
+
529
+ self.accelerator.wait_for_everyone()
530
+
531
+ epoch_sum_loss = (
532
+ epoch_sum_loss
533
+ / len(self.train_dataloader)
534
+ * self.cfg.train.gradient_accumulation_step
535
+ )
536
+
537
+ for key in epoch_losses.keys():
538
+ epoch_losses[key] = (
539
+ epoch_losses[key]
540
+ / len(self.train_dataloader)
541
+ * self.cfg.train.gradient_accumulation_step
542
+ )
543
+
544
+ return epoch_sum_loss, epoch_losses
545
+
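One possible reading of the normalization at the end of `_train_epoch`: `epoch_sum_loss` is a sum over micro-batches, so dividing by the number of batches and multiplying by the accumulation steps gives a loss per optimizer update rather than per micro-batch. Toy arithmetic only:

```python
# Illustrative arithmetic, not repository code.
num_batches = 100    # len(train_dataloader)
grad_accum = 4       # cfg.train.gradient_accumulation_step
epoch_sum_loss = sum([1.0] * num_batches)             # pretend each batch's loss is 1.0
per_update = epoch_sum_loss / num_batches * grad_accum
print(per_update)    # 4.0: the 4 micro-batches behind each optimizer step, summed
```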
546
+ @torch.inference_mode()
547
+ def _valid_epoch(self):
548
+ r"""Testing epoch. Should return average loss of a batch (sample) over
549
+ one epoch. See ``train_loop`` for usage.
550
+ """
551
+ if isinstance(self.model, dict):
552
+ for key in self.model.keys():
553
+ self.model[key].eval()
554
+ else:
555
+ self.model.eval()
556
+
557
+ epoch_sum_loss = 0.0
558
+ epoch_losses = dict()
559
+ for batch in tqdm(
560
+ self.valid_dataloader,
561
+ desc=f"Validating Epoch {self.epoch}",
562
+ unit="batch",
563
+ colour="GREEN",
564
+ leave=False,
565
+ dynamic_ncols=True,
566
+ smoothing=0.04,
567
+ disable=not self.accelerator.is_main_process,
568
+ ):
569
+ total_loss, valid_losses, valid_stats = self._valid_step(batch)
570
+ epoch_sum_loss += total_loss
571
+ if isinstance(valid_losses, dict):
572
+ for key, value in valid_losses.items():
573
+ if key not in epoch_losses.keys():
574
+ epoch_losses[key] = value
575
+ else:
576
+ epoch_losses[key] += value
577
+
578
+ epoch_sum_loss = epoch_sum_loss / len(self.valid_dataloader)
579
+ for key in epoch_losses.keys():
580
+ epoch_losses[key] = epoch_losses[key] / len(self.valid_dataloader)
581
+
582
+ self.accelerator.wait_for_everyone()
583
+
584
+ return epoch_sum_loss, epoch_losses
585
+
586
+ def _train_step(self):
587
+ pass
588
+
589
+ def _valid_step(self, batch):
590
+ pass
591
+
592
+ def _inference(self):
593
+ pass
594
+
595
+ def _is_valid_pattern(self, directory_name):
596
+ directory_name = str(directory_name)
597
+ pattern = r"^epoch-\d{4}_step-\d{7}_loss-\d{1}\.\d{6}"
598
+ return re.match(pattern, directory_name) is not None
599
+
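A quick, illustrative check of the pattern used by `_is_valid_pattern` against the `epoch-..._step-..._loss-...` directory names written above:

```python
import re

# Same pattern as _is_valid_pattern above; directory names are invented examples.
pattern = r"^epoch-\d{4}_step-\d{7}_loss-\d+\.\d{6}"
for name in (
    "epoch-0012_step-0034567_loss-0.123456",        # matches
    "final_epoch-0012_step-0034567_loss-0.123456",  # final checkpoints do not match
    "some_other_dir",                               # does not match
):
    print(name, bool(re.match(pattern, name)))
```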
600
+ def _check_basic_configs(self):
601
+ if self.cfg.train.gradient_accumulation_step <= 0:
602
+ self.logger.fatal("Invalid gradient_accumulation_step value!")
603
+ self.logger.error(
604
+ f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
605
+ )
606
+ self.accelerator.end_training()
607
+ raise ValueError(
608
+ f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
609
+ )
610
+
611
+ def __dump_cfg(self, path):
612
+ os.makedirs(os.path.dirname(path), exist_ok=True)
613
+ json5.dump(
614
+ self.cfg,
615
+ open(path, "w"),
616
+ indent=4,
617
+ sort_keys=True,
618
+ ensure_ascii=False,
619
+ quote_keys=True,
620
+ )
621
+
622
+ def __check_basic_configs(self):
623
+ if self.cfg.train.gradient_accumulation_step <= 0:
624
+ self.logger.fatal("Invalid gradient_accumulation_step value!")
625
+ self.logger.error(
626
+ f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
627
+ )
628
+ self.accelerator.end_training()
629
+ raise ValueError(
630
+ f"Invalid gradient_accumulation_step value: {self.cfg.train.gradient_accumulation_step}. It should be positive."
631
+ )
632
+ # TODO: check other values
633
+
634
+ @staticmethod
635
+ def __count_parameters(model):
636
+ model_param = 0.0
637
+ if isinstance(model, dict):
638
+ for key, value in model.items():
639
+ model_param += sum(p.numel() for p in model[key].parameters())
640
+ else:
641
+ model_param = sum(p.numel() for p in model.parameters())
642
+ return model_param
643
+
644
+ def _build_speaker_lut(self):
645
+ # combine speakers
646
+ if not os.path.exists(os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)):
647
+ speakers = {}
648
+ else:
649
+ with open(
650
+ os.path.join(self.exp_dir, self.cfg.preprocess.spk2id), "r"
651
+ ) as speaker_file:
652
+ speakers = json.load(speaker_file)
653
+ for dataset in self.cfg.dataset:
654
+ speaker_lut_path = os.path.join(
655
+ self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id
656
+ )
657
+ with open(speaker_lut_path, "r") as speaker_lut_path:
658
+ singer_lut = json.load(speaker_lut_path)
659
+ for singer in singer_lut.keys():
660
+ if singer not in speakers:
661
+ speakers[singer] = len(speakers)
662
+ with open(
663
+ os.path.join(self.exp_dir, self.cfg.preprocess.spk2id), "w"
664
+ ) as speaker_file:
665
+ json.dump(speakers, speaker_file, indent=4, ensure_ascii=False)
666
+ print(
667
+ "speakers have been dumped to {}".format(
668
+ os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)
669
+ )
670
+ )
671
+ return speakers
672
+
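To make the merge in `_build_speaker_lut` concrete, here is a small illustration (dataset and speaker names are invented) of folding per-dataset `spk2id` tables into one experiment-level table, assigning new ids only to unseen speakers:

```python
# Illustration only; names are made up.
merged = {"LJSpeech_LJ": 0}                      # table already in the experiment dir
per_dataset_luts = [
    {"LibriTTS_103": 0, "LibriTTS_1034": 1},     # spk2id of dataset A
    {"LibriTTS_103": 0, "VCTK_p225": 1},         # spk2id of dataset B
]
for lut in per_dataset_luts:
    for spk in lut:
        if spk not in merged:
            merged[spk] = len(merged)
print(merged)
# {'LJSpeech_LJ': 0, 'LibriTTS_103': 1, 'LibriTTS_1034': 2, 'VCTK_p225': 3}
```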
673
+ def _build_utt2spk_dict(self):
674
+ # combine speakers
675
+ utt2spk = {}
676
+ if not os.path.exists(os.path.join(self.exp_dir, self.cfg.preprocess.utt2spk)):
677
+ utt2spk = {}
678
+ else:
679
+ with open(
680
+ os.path.join(self.exp_dir, self.cfg.preprocess.utt2spk), "r"
681
+ ) as utt2spk_file:
682
+ for line in utt2spk_file.readlines():
683
+ utt, spk = line.strip().split("\t")
684
+ utt2spk[utt] = spk
685
+ for dataset in self.cfg.dataset:
686
+ utt2spk_dict_path = os.path.join(
687
+ self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.utt2spk
688
+ )
689
+ with open(utt2spk_dict_path, "r") as utt2spk_dict:
690
+ for line in utt2spk_dict.readlines():
691
+ utt, spk = line.strip().split("\t")
692
+ if utt not in utt2spk.keys():
693
+ utt2spk[utt] = spk
694
+ with open(
695
+ os.path.join(self.exp_dir, self.cfg.preprocess.utt2spk), "w"
696
+ ) as utt2spk_file:
697
+ for utt, spk in utt2spk.items():
698
+ utt2spk_file.write(utt + "\t" + spk + "\n")
699
+ print(
700
+ "utterance and speaker mapper have been dumped to {}".format(
701
+ os.path.join(self.exp_dir, self.cfg.preprocess.utt2spk)
702
+ )
703
+ )
704
+ return utt2spk
705
+
706
+ def _save_phone_symbols_file_to_exp_path(self):
707
+ phone_symbols_file = os.path.join(
708
+ self.cfg.preprocess.processed_dir,
709
+ self.cfg.dataset[0],
710
+ self.cfg.preprocess.symbols_dict,
711
+ )
712
+ phone_symbols_file_to_exp_path = os.path.join(
713
+ self.exp_dir, self.cfg.preprocess.symbols_dict
714
+ )
715
+ shutil.copy(phone_symbols_file, phone_symbols_file_to_exp_path)
716
+ os.chmod(phone_symbols_file_to_exp_path, 0o666)
717
+ print(
718
+ "phone symbols been dumped to {}".format(
719
+ os.path.join(self.exp_dir, self.cfg.preprocess.symbols_dict)
720
+ )
721
+ )
Amphion/models/tts/fastspeech2/fs2_trainer.py ADDED
@@ -0,0 +1,155 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from tqdm import tqdm
9
+ from models.tts.base import TTSTrainer
10
+ from models.tts.fastspeech2.fs2 import FastSpeech2, FastSpeech2Loss
11
+ from models.tts.fastspeech2.fs2_dataset import FS2Dataset, FS2Collator
12
+ from optimizer.optimizers import NoamLR
13
+
14
+
15
+ class FastSpeech2Trainer(TTSTrainer):
16
+ def __init__(self, args, cfg):
17
+ TTSTrainer.__init__(self, args, cfg)
18
+ self.cfg = cfg
19
+
20
+ def _build_dataset(self):
21
+ return FS2Dataset, FS2Collator
22
+
23
+ def __build_scheduler(self):
24
+ return NoamLR(self.optimizer, **self.cfg.train.lr_scheduler)
25
+
26
+ def _write_summary(self, losses, stats):
27
+ for key, value in losses.items():
28
+ self.sw.add_scalar("train/" + key, value, self.step)
29
+ lr = self.optimizer.state_dict()["param_groups"][0]["lr"]
30
+ self.sw.add_scalar("learning_rate", lr, self.step)
31
+
32
+ def _write_valid_summary(self, losses, stats):
33
+ for key, value in losses.items():
34
+ self.sw.add_scalar("val/" + key, value, self.step)
35
+
36
+ def _build_criterion(self):
37
+ return FastSpeech2Loss(self.cfg)
38
+
39
+ def get_state_dict(self):
40
+ state_dict = {
41
+ "model": self.model.state_dict(),
42
+ "optimizer": self.optimizer.state_dict(),
43
+ "scheduler": self.scheduler.state_dict(),
44
+ "step": self.step,
45
+ "epoch": self.epoch,
46
+ "batch_size": self.cfg.train.batch_size,
47
+ }
48
+ return state_dict
49
+
50
+ def _build_optimizer(self):
51
+ optimizer = torch.optim.Adam(self.model.parameters(), **self.cfg.train.adam)
52
+ return optimizer
53
+
54
+ def _build_scheduler(self):
55
+ scheduler = NoamLR(self.optimizer, **self.cfg.train.lr_scheduler)
56
+ return scheduler
57
+
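The `NoamLR` scheduler built here comes from `optimizer.optimizers`, which is not part of this diff. As a point of reference only, the standard Noam schedule (Vaswani et al., 2017) that such schedulers typically implement is sketched below; whether Amphion's class takes exactly these arguments is an assumption:

```python
# Sketch of the usual Noam learning-rate curve; not Amphion's implementation.
def noam_lr(step, d_model=256, warmup_steps=4000, factor=1.0):
    step = max(step, 1)
    return factor * d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

for s in (1, 1000, 4000, 20000):
    print(s, f"{noam_lr(s):.6f}")  # ramps up linearly, then decays as 1/sqrt(step)
```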
58
+ def _build_model(self):
59
+ self.model = FastSpeech2(self.cfg)
60
+ return self.model
61
+
62
+ def _train_epoch(self):
63
+ r"""Training epoch. Should return average loss of a batch (sample) over
64
+ one epoch. See ``train_loop`` for usage.
65
+ """
66
+ self.model.train()
67
+ epoch_sum_loss: float = 0.0
68
+ epoch_step: int = 0
69
+ epoch_losses: dict = {}
70
+ for batch in tqdm(
71
+ self.train_dataloader,
72
+ desc=f"Training Epoch {self.epoch}",
73
+ unit="batch",
74
+ colour="GREEN",
75
+ leave=False,
76
+ dynamic_ncols=True,
77
+ smoothing=0.04,
78
+ disable=not self.accelerator.is_main_process,
79
+ ):
80
+ # Do training step and BP
81
+ with self.accelerator.accumulate(self.model):
82
+ loss, train_losses = self._train_step(batch)
83
+ self.accelerator.backward(loss)
84
+ grad_clip_thresh = self.cfg.train.grad_clip_thresh
85
+ nn.utils.clip_grad_norm_(self.model.parameters(), grad_clip_thresh)
86
+ self.optimizer.step()
87
+ self.scheduler.step()
88
+ self.optimizer.zero_grad()
89
+ self.batch_count += 1
90
+
91
+ # Update info for each step
92
+ if self.batch_count % self.cfg.train.gradient_accumulation_step == 0:
93
+ epoch_sum_loss += loss
94
+ for key, value in train_losses.items():
95
+ if key not in epoch_losses.keys():
96
+ epoch_losses[key] = value
97
+ else:
98
+ epoch_losses[key] += value
99
+
100
+ self.accelerator.log(
101
+ {
102
+ "Step/Train Loss": loss,
103
+ "Step/Learning Rate": self.optimizer.param_groups[0]["lr"],
104
+ },
105
+ step=self.step,
106
+ )
107
+ self.step += 1
108
+ epoch_step += 1
109
+
110
+ self.accelerator.wait_for_everyone()
111
+
112
+ epoch_sum_loss = (
113
+ epoch_sum_loss
114
+ / len(self.train_dataloader)
115
+ * self.cfg.train.gradient_accumulation_step
116
+ )
117
+
118
+ for key in epoch_losses.keys():
119
+ epoch_losses[key] = (
120
+ epoch_losses[key]
121
+ / len(self.train_dataloader)
122
+ * self.cfg.train.gradient_accumulation_step
123
+ )
124
+ return epoch_sum_loss, epoch_losses
125
+
126
+ def _train_step(self, data):
127
+ train_losses = {}
128
+ total_loss = 0
129
+ train_stats = {}
130
+
131
+ preds = self.model(data)
132
+
133
+ train_losses = self.criterion(data, preds)
134
+
135
+ total_loss = train_losses["loss"]
136
+ for key, value in train_losses.items():
137
+ train_losses[key] = value.item()
138
+
139
+ return total_loss, train_losses
140
+
141
+ @torch.no_grad()
142
+ def _valid_step(self, data):
143
+ valid_losses = {}
144
+ total_valid_loss = 0
145
+ valid_stats = {}
146
+
147
+ preds = self.model(data)
148
+
149
+ valid_losses = self.criterion(data, preds)
150
+
151
+ total_valid_loss = valid_losses["loss"]
152
+ for key, value in valid_losses.items():
153
+ valid_losses[key] = value.item()
154
+
155
+ return total_valid_loss, valid_losses, valid_stats
Amphion/models/tts/naturalspeech2/ns2.py ADDED
@@ -0,0 +1,259 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import numpy as np
9
+ import torch.nn.functional as F
10
+ from models.tts.naturalspeech2.diffusion import Diffusion
11
+ from models.tts.naturalspeech2.diffusion_flow import DiffusionFlow
12
+ from models.tts.naturalspeech2.wavenet import WaveNet
13
+ from models.tts.naturalspeech2.prior_encoder import PriorEncoder
14
+ from modules.naturalpseech2.transformers import TransformerEncoder
15
+ from encodec import EncodecModel
16
+ from einops import rearrange, repeat
17
+
18
+ import os
19
+ import json
20
+
21
+
22
+ class NaturalSpeech2(nn.Module):
23
+ def __init__(self, cfg):
24
+ super().__init__()
25
+ self.cfg = cfg
26
+
27
+ self.latent_dim = cfg.latent_dim
28
+ self.query_emb_num = cfg.query_emb.query_token_num
29
+
30
+ self.prior_encoder = PriorEncoder(cfg.prior_encoder)
31
+ if cfg.diffusion.diffusion_type == "diffusion":
32
+ self.diffusion = Diffusion(cfg.diffusion)
33
+ elif cfg.diffusion.diffusion_type == "flow":
34
+ self.diffusion = DiffusionFlow(cfg.diffusion)
35
+
36
+ self.prompt_encoder = TransformerEncoder(cfg=cfg.prompt_encoder)
37
+ if self.latent_dim != cfg.prompt_encoder.encoder_hidden:
38
+ self.prompt_lin = nn.Linear(
39
+ self.latent_dim, cfg.prompt_encoder.encoder_hidden
40
+ )
41
+ self.prompt_lin.weight.data.normal_(0.0, 0.02)
42
+ else:
43
+ self.prompt_lin = None
44
+
45
+ self.query_emb = nn.Embedding(self.query_emb_num, cfg.query_emb.hidden_size)
46
+ self.query_attn = nn.MultiheadAttention(
47
+ cfg.query_emb.hidden_size, cfg.query_emb.head_num, batch_first=True
48
+ )
49
+
50
+ codec_model = EncodecModel.encodec_model_24khz()
51
+ codec_model.set_target_bandwidth(12.0)
52
+ codec_model.requires_grad_(False)
53
+ self.quantizer = codec_model.quantizer
54
+
55
+ @torch.no_grad()
56
+ def code_to_latent(self, code):
57
+ latent = self.quantizer.decode(code.transpose(0, 1))
58
+ return latent
59
+
60
+ def latent_to_code(self, latent, nq=16):
61
+ residual = latent
62
+ all_indices = []
63
+ all_dist = []
64
+ for i in range(nq):
65
+ layer = self.quantizer.vq.layers[i]
66
+ x = rearrange(residual, "b d n -> b n d")
67
+ x = layer.project_in(x)
68
+ shape = x.shape
69
+ x = layer._codebook.preprocess(x)
70
+ embed = layer._codebook.embed.t()
71
+ dist = -(
72
+ x.pow(2).sum(1, keepdim=True)
73
+ - 2 * x @ embed
74
+ + embed.pow(2).sum(0, keepdim=True)
75
+ )
76
+ indices = dist.max(dim=-1).indices
77
+ indices = layer._codebook.postprocess_emb(indices, shape)
78
+ dist = dist.reshape(*shape[:-1], dist.shape[-1])
79
+ quantized = layer.decode(indices)
80
+ residual = residual - quantized
81
+ all_indices.append(indices)
82
+ all_dist.append(dist)
83
+
84
+ out_indices = torch.stack(all_indices)
85
+ out_dist = torch.stack(all_dist)
86
+
87
+ return out_indices, out_dist # (nq, B, T); (nq, B, T, 1024)
88
+
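`latent_to_code` re-runs EnCodec's residual quantization layer by layer so that it can also return the full distance matrix over each codebook. A minimal NumPy sketch of one such layer, with toy shapes, illustrating the nearest-code selection and the residual update:

```python
import numpy as np

# Toy shapes; an illustration of one residual-VQ layer, not the EnCodec internals.
rng = np.random.default_rng(0)
codebook = rng.normal(size=(8, 4))   # 8 code vectors of dimension 4
residual = rng.normal(size=(5, 4))   # 5 frames of dimension 4

# dist = -(||x||^2 - 2 x.e + ||e||^2): maximizing it picks the nearest code
dist = -(
    (residual**2).sum(1, keepdims=True)
    - 2 * residual @ codebook.T
    + (codebook**2).sum(1, keepdims=True).T
)
indices = dist.argmax(axis=-1)       # (5,) selected code ids, one per frame
quantized = codebook[indices]        # (5, 4)
residual = residual - quantized      # what the next quantizer layer would see
print(indices, np.abs(residual).mean())
```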
89
+ @torch.no_grad()
90
+ def latent_to_latent(self, latent, nq=16):
91
+ codes, _ = self.latent_to_code(latent, nq)
92
+ latent = self.quantizer.vq.decode(codes)
93
+ return latent
94
+
95
+ def forward(
96
+ self,
97
+ code=None,
98
+ pitch=None,
99
+ duration=None,
100
+ phone_id=None,
101
+ phone_id_frame=None,
102
+ frame_nums=None,
103
+ ref_code=None,
104
+ ref_frame_nums=None,
105
+ phone_mask=None,
106
+ mask=None,
107
+ ref_mask=None,
108
+ ):
109
+ ref_latent = self.code_to_latent(ref_code)
110
+ latent = self.code_to_latent(code)
111
+
112
+ if self.latent_dim is not None:
113
+ ref_latent = self.prompt_lin(ref_latent.transpose(1, 2))
114
+
115
+ ref_latent = self.prompt_encoder(ref_latent, ref_mask, condition=None)
116
+ spk_emb = ref_latent.transpose(1, 2) # (B, d, T')
117
+
118
+ spk_query_emb = self.query_emb(
119
+ torch.arange(self.query_emb_num).to(latent.device)
120
+ ).repeat(
121
+ latent.shape[0], 1, 1
122
+ ) # (B, query_emb_num, d)
123
+ spk_query_emb, _ = self.query_attn(
124
+ spk_query_emb,
125
+ spk_emb.transpose(1, 2),
126
+ spk_emb.transpose(1, 2),
127
+ key_padding_mask=~(ref_mask.bool()),
128
+ ) # (B, query_emb_num, d)
129
+
130
+ prior_out = self.prior_encoder(
131
+ phone_id=phone_id,
132
+ duration=duration,
133
+ pitch=pitch,
134
+ phone_mask=phone_mask,
135
+ mask=mask,
136
+ ref_emb=spk_emb,
137
+ ref_mask=ref_mask,
138
+ is_inference=False,
139
+ )
140
+ prior_condition = prior_out["prior_out"] # (B, T, d)
141
+
142
+ diff_out = self.diffusion(latent, mask, prior_condition, spk_query_emb)
143
+
144
+ return diff_out, prior_out
145
+
146
+ @torch.no_grad()
147
+ def inference(
148
+ self, ref_code=None, phone_id=None, ref_mask=None, inference_steps=1000
149
+ ):
150
+ ref_latent = self.code_to_latent(ref_code)
151
+
152
+ if self.latent_dim is not None:
153
+ ref_latent = self.prompt_lin(ref_latent.transpose(1, 2))
154
+
155
+ ref_latent = self.prompt_encoder(ref_latent, ref_mask, condition=None)
156
+ spk_emb = ref_latent.transpose(1, 2) # (B, d, T')
157
+
158
+ spk_query_emb = self.query_emb(
159
+ torch.arange(self.query_emb_num).to(ref_latent.device)
160
+ ).repeat(
161
+ ref_latent.shape[0], 1, 1
162
+ ) # (B, query_emb_num, d)
163
+ spk_query_emb, _ = self.query_attn(
164
+ spk_query_emb,
165
+ spk_emb.transpose(1, 2),
166
+ spk_emb.transpose(1, 2),
167
+ key_padding_mask=~(ref_mask.bool()),
168
+ ) # (B, query_emb_num, d)
169
+
170
+ prior_out = self.prior_encoder(
171
+ phone_id=phone_id,
172
+ duration=None,
173
+ pitch=None,
174
+ phone_mask=None,
175
+ mask=None,
176
+ ref_emb=spk_emb,
177
+ ref_mask=ref_mask,
178
+ is_inference=True,
179
+ )
180
+ prior_condition = prior_out["prior_out"] # (B, T, d)
181
+
182
+ z = torch.randn(
183
+ prior_condition.shape[0], self.latent_dim, prior_condition.shape[1]
184
+ ).to(ref_latent.device) / (1.20)
185
+ x0 = self.diffusion.reverse_diffusion(
186
+ z, None, prior_condition, inference_steps, spk_query_emb
187
+ )
188
+
189
+ return x0, prior_out
190
+
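A hedged sketch of how `inference` above is typically driven (the real entry point is `ns2_inference.py`, added later in this diff). The config path is a placeholder and the tensors are random stand-ins; shapes follow the in-code comments, with `ref_code` holding EnCodec indices of shape (B, n_q, T') and `phone_id` holding phone ids of shape (B, N):

```python
import torch
from models.tts.naturalspeech2.ns2 import NaturalSpeech2
from utils.util import load_config

cfg = load_config("path/to/exp_config.json")      # placeholder config path
model = NaturalSpeech2(cfg.model).eval()

ref_code = torch.randint(0, 1024, (1, 16, 150))   # ~2 s of EnCodec codes at 75 Hz
ref_mask = torch.ones(1, 150)
phone_id = torch.randint(0, 80, (1, 40))          # random phone ids, illustration only
with torch.no_grad():
    x0, prior_out = model.inference(ref_code, phone_id, ref_mask, inference_steps=200)
print(x0.shape)  # (1, latent_dim, total predicted frames)
```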
191
+ @torch.no_grad()
192
+ def reverse_diffusion_from_t(
193
+ self,
194
+ code=None,
195
+ pitch=None,
196
+ duration=None,
197
+ phone_id=None,
198
+ ref_code=None,
199
+ phone_mask=None,
200
+ mask=None,
201
+ ref_mask=None,
202
+ n_timesteps=None,
203
+ t=None,
204
+ ):
205
+ # o Only for debug
206
+
207
+ ref_latent = self.code_to_latent(ref_code)
208
+ latent = self.code_to_latent(code)
209
+
210
+ if self.latent_dim is not None:
211
+ ref_latent = self.prompt_lin(ref_latent.transpose(1, 2))
212
+
213
+ ref_latent = self.prompt_encoder(ref_latent, ref_mask, condition=None)
214
+ spk_emb = ref_latent.transpose(1, 2) # (B, d, T')
215
+
216
+ spk_query_emb = self.query_emb(
217
+ torch.arange(self.query_emb_num).to(latent.device)
218
+ ).repeat(
219
+ latent.shape[0], 1, 1
220
+ ) # (B, query_emb_num, d)
221
+ spk_query_emb, _ = self.query_attn(
222
+ spk_query_emb,
223
+ spk_emb.transpose(1, 2),
224
+ spk_emb.transpose(1, 2),
225
+ key_padding_mask=~(ref_mask.bool()),
226
+ ) # (B, query_emb_num, d)
227
+
228
+ prior_out = self.prior_encoder(
229
+ phone_id=phone_id,
230
+ duration=duration,
231
+ pitch=pitch,
232
+ phone_mask=phone_mask,
233
+ mask=mask,
234
+ ref_emb=spk_emb,
235
+ ref_mask=ref_mask,
236
+ is_inference=False,
237
+ )
238
+ prior_condition = prior_out["prior_out"] # (B, T, d)
239
+
240
+ diffusion_step = (
241
+ torch.ones(
242
+ latent.shape[0],
243
+ dtype=latent.dtype,
244
+ device=latent.device,
245
+ requires_grad=False,
246
+ )
247
+ * t
248
+ )
249
+ diffusion_step = torch.clamp(diffusion_step, 1e-5, 1.0 - 1e-5)
250
+ xt, _ = self.diffusion.forward_diffusion(
251
+ x0=latent, diffusion_step=diffusion_step
252
+ )
253
+ # print(torch.abs(xt-latent).max(), torch.abs(xt-latent).mean(), torch.abs(xt-latent).std())
254
+
255
+ x0 = self.diffusion.reverse_diffusion_from_t(
256
+ xt, mask, prior_condition, n_timesteps, spk_query_emb, t_start=t
257
+ )
258
+
259
+ return x0, prior_out, xt
Amphion/models/tts/naturalspeech2/ns2_dataset.py ADDED
@@ -0,0 +1,524 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import random
7
+ import torch
8
+ from torch.nn.utils.rnn import pad_sequence
9
+ from utils.data_utils import *
10
+ from processors.acoustic_extractor import cal_normalized_mel
11
+ from processors.acoustic_extractor import load_normalized
12
+ from models.base.base_dataset import (
13
+ BaseOfflineCollator,
14
+ BaseOfflineDataset,
15
+ BaseTestDataset,
16
+ BaseTestCollator,
17
+ )
18
+ from text import text_to_sequence
19
+ from text.cmudict import valid_symbols
20
+ from tqdm import tqdm
21
+ import pickle
22
+
23
+
24
+ class NS2Dataset(torch.utils.data.Dataset):
25
+ def __init__(self, cfg, dataset, is_valid=False):
26
+ assert isinstance(dataset, str)
27
+
28
+ processed_data_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
29
+
30
+ meta_file = cfg.preprocess.valid_file if is_valid else cfg.preprocess.train_file
31
+ # train.json
32
+
33
+ self.metafile_path = os.path.join(processed_data_dir, meta_file)
34
+
35
+ self.metadata = self.get_metadata()
36
+
37
+ self.cfg = cfg
38
+
39
+ assert cfg.preprocess.use_mel == False
40
+ if cfg.preprocess.use_mel:
41
+ self.utt2melspec_path = {}
42
+ for utt_info in self.metadata:
43
+ dataset = utt_info["Dataset"]
44
+ uid = utt_info["Uid"]
45
+ utt = "{}_{}".format(dataset, uid)
46
+
47
+ self.utt2melspec_path[utt] = os.path.join(
48
+ cfg.preprocess.processed_dir,
49
+ dataset,
50
+ cfg.preprocess.melspec_dir, # mel
51
+ utt_info["speaker"],
52
+ uid + ".npy",
53
+ )
54
+
55
+ assert cfg.preprocess.use_code == True
56
+ if cfg.preprocess.use_code:
57
+ self.utt2code_path = {}
58
+ for utt_info in self.metadata:
59
+ dataset = utt_info["Dataset"]
60
+ uid = utt_info["Uid"]
61
+ utt = "{}_{}".format(dataset, uid)
62
+
63
+ self.utt2code_path[utt] = os.path.join(
64
+ cfg.preprocess.processed_dir,
65
+ dataset,
66
+ cfg.preprocess.code_dir, # code
67
+ utt_info["speaker"],
68
+ uid + ".npy",
69
+ )
70
+
71
+ assert cfg.preprocess.use_spkid == True
72
+ if cfg.preprocess.use_spkid:
73
+ self.utt2spkid = {}
74
+ for utt_info in self.metadata:
75
+ dataset = utt_info["Dataset"]
76
+ uid = utt_info["Uid"]
77
+ utt = "{}_{}".format(dataset, uid)
78
+
79
+ self.utt2spkid[utt] = utt_info["speaker"]
80
+
81
+ assert cfg.preprocess.use_pitch == True
82
+ if cfg.preprocess.use_pitch:
83
+ self.utt2pitch_path = {}
84
+ for utt_info in self.metadata:
85
+ dataset = utt_info["Dataset"]
86
+ uid = utt_info["Uid"]
87
+ utt = "{}_{}".format(dataset, uid)
88
+
89
+ self.utt2pitch_path[utt] = os.path.join(
90
+ cfg.preprocess.processed_dir,
91
+ dataset,
92
+ cfg.preprocess.pitch_dir, # pitch
93
+ utt_info["speaker"],
94
+ uid + ".npy",
95
+ )
96
+
97
+ assert cfg.preprocess.use_duration == True
98
+ if cfg.preprocess.use_duration:
99
+ self.utt2duration_path = {}
100
+ for utt_info in self.metadata:
101
+ dataset = utt_info["Dataset"]
102
+ uid = utt_info["Uid"]
103
+ utt = "{}_{}".format(dataset, uid)
104
+
105
+ self.utt2duration_path[utt] = os.path.join(
106
+ cfg.preprocess.processed_dir,
107
+ dataset,
108
+ cfg.preprocess.duration_dir, # duration
109
+ utt_info["speaker"],
110
+ uid + ".npy",
111
+ )
112
+
113
+ assert cfg.preprocess.use_phone == True
114
+ if cfg.preprocess.use_phone:
115
+ self.utt2phone = {}
116
+ for utt_info in self.metadata:
117
+ dataset = utt_info["Dataset"]
118
+ uid = utt_info["Uid"]
119
+ utt = "{}_{}".format(dataset, uid)
120
+
121
+ self.utt2phone[utt] = utt_info["phones"]
122
+
123
+ assert cfg.preprocess.use_len == True
124
+ if cfg.preprocess.use_len:
125
+ self.utt2len = {}
126
+ for utt_info in self.metadata:
127
+ dataset = utt_info["Dataset"]
128
+ uid = utt_info["Uid"]
129
+ utt = "{}_{}".format(dataset, uid)
130
+
131
+ self.utt2len[utt] = utt_info["num_frames"]
132
+
133
+ # for cross reference
134
+ if cfg.preprocess.use_cross_reference:
135
+ self.spkid2utt = {}
136
+ for utt_info in self.metadata:
137
+ dataset = utt_info["Dataset"]
138
+ uid = utt_info["Uid"]
139
+ utt = "{}_{}".format(dataset, uid)
140
+ spkid = utt_info["speaker"]
141
+ if spkid not in self.spkid2utt:
142
+ self.spkid2utt[spkid] = []
143
+ self.spkid2utt[spkid].append(utt)
144
+
145
+ # get phone to id / id to phone map
146
+ self.phone2id, self.id2phone = self.get_phone_map()
147
+
148
+ self.all_num_frames = []
149
+ for i in range(len(self.metadata)):
150
+ self.all_num_frames.append(self.metadata[i]["num_frames"])
151
+ self.num_frame_sorted = np.array(sorted(self.all_num_frames))
152
+ self.num_frame_indices = np.array(
153
+ sorted(
154
+ range(len(self.all_num_frames)), key=lambda k: self.all_num_frames[k]
155
+ )
156
+ )
157
+
158
+ def __len__(self):
159
+ return len(self.metadata)
160
+
161
+ def get_dataset_name(self):
162
+ return self.metadata[0]["Dataset"]
163
+
164
+ def get_metadata(self):
165
+ with open(self.metafile_path, "r", encoding="utf-8") as f:
166
+ metadata = json.load(f)
167
+
168
+ print("metadata len: ", len(metadata))
169
+
170
+ return metadata
171
+
172
+ def get_phone_map(self):
173
+ symbols = valid_symbols + ["sp", "spn", "sil"] + ["<s>", "</s>"]
174
+ phone2id = {s: i for i, s in enumerate(symbols)}
175
+ id2phone = {i: s for s, i in phone2id.items()}
176
+ return phone2id, id2phone
177
+
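Illustration of the phone map built by `get_phone_map`: ids are simply enumeration order over the CMUdict symbols plus the extra tokens. The symbol list below is a truncated stand-in for `text.cmudict.valid_symbols`, which is not shown in this diff:

```python
# Stand-in symbols; only the mapping mechanics are the point here.
valid_symbols = ["AA", "AE", "AH"]
symbols = valid_symbols + ["sp", "spn", "sil"] + ["<s>", "</s>"]
phone2id = {s: i for i, s in enumerate(symbols)}
id2phone = {i: s for s, i in phone2id.items()}

# Metadata stores phones as a brace-wrapped string, e.g. "{AH sp AA}"
phones = "{AH sp AA}".replace("{", "").replace("}", "").split()
print([phone2id[p] for p in phones])  # [2, 3, 0]
print(id2phone[5])                    # 'sil'
```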
178
+ def __getitem__(self, index):
179
+ utt_info = self.metadata[index]
180
+
181
+ dataset = utt_info["Dataset"]
182
+ uid = utt_info["Uid"]
183
+ utt = "{}_{}".format(dataset, uid)
184
+
185
+ single_feature = dict()
186
+
187
+ if self.cfg.preprocess.read_metadata:
188
+ metadata_uid_path = os.path.join(
189
+ self.cfg.preprocess.processed_dir,
190
+ self.cfg.preprocess.metadata_dir,
191
+ dataset,
192
+ # utt_info["speaker"],
193
+ uid + ".pkl",
194
+ )
195
+ with open(metadata_uid_path, "rb") as f:
196
+ metadata_uid = pickle.load(f)
197
+ # code
198
+ code = metadata_uid["code"]
199
+ # frame_nums
200
+ frame_nums = code.shape[1]
201
+ # pitch
202
+ pitch = metadata_uid["pitch"]
203
+ # duration
204
+ duration = metadata_uid["duration"]
205
+ # phone_id
206
+ phone_id = np.array(
207
+ [
208
+ *map(
209
+ self.phone2id.get,
210
+ self.utt2phone[utt].replace("{", "").replace("}", "").split(),
211
+ )
212
+ ]
213
+ )
214
+
215
+ else:
216
+ # code
217
+ code = np.load(self.utt2code_path[utt])
218
+ # frame_nums
219
+ frame_nums = code.shape[1]
220
+ # pitch
221
+ pitch = np.load(self.utt2pitch_path[utt])
222
+ # duration
223
+ duration = np.load(self.utt2duration_path[utt])
224
+ # phone_id
225
+ phone_id = np.array(
226
+ [
227
+ *map(
228
+ self.phone2id.get,
229
+ self.utt2phone[utt].replace("{", "").replace("}", "").split(),
230
+ )
231
+ ]
232
+ )
233
+
234
+ # align length
235
+ code, pitch, duration, phone_id, frame_nums = self.align_length(
236
+ code, pitch, duration, phone_id, frame_nums
237
+ )
238
+
239
+ # spkid
240
+ spkid = self.utt2spkid[utt]
241
+
242
+ # get target and reference
243
+ out = self.get_target_and_reference(code, pitch, duration, phone_id, frame_nums)
244
+ code, ref_code = out["code"], out["ref_code"]
245
+ pitch, ref_pitch = out["pitch"], out["ref_pitch"]
246
+ duration, ref_duration = out["duration"], out["ref_duration"]
247
+ phone_id, ref_phone_id = out["phone_id"], out["ref_phone_id"]
248
+ frame_nums, ref_frame_nums = out["frame_nums"], out["ref_frame_nums"]
249
+
250
+ # phone_id_frame
251
+ assert len(phone_id) == len(duration)
252
+ phone_id_frame = []
253
+ for i in range(len(phone_id)):
254
+ phone_id_frame.extend([phone_id[i] for _ in range(duration[i])])
255
+ phone_id_frame = np.array(phone_id_frame)
256
+
257
+ # ref_phone_id_frame
258
+ assert len(ref_phone_id) == len(ref_duration)
259
+ ref_phone_id_frame = []
260
+ for i in range(len(ref_phone_id)):
261
+ ref_phone_id_frame.extend([ref_phone_id[i] for _ in range(ref_duration[i])])
262
+ ref_phone_id_frame = np.array(ref_phone_id_frame)
263
+
264
+ single_feature.update(
265
+ {
266
+ "code": code,
267
+ "frame_nums": frame_nums,
268
+ "pitch": pitch,
269
+ "duration": duration,
270
+ "phone_id": phone_id,
271
+ "phone_id_frame": phone_id_frame,
272
+ "ref_code": ref_code,
273
+ "ref_frame_nums": ref_frame_nums,
274
+ "ref_pitch": ref_pitch,
275
+ "ref_duration": ref_duration,
276
+ "ref_phone_id": ref_phone_id,
277
+ "ref_phone_id_frame": ref_phone_id_frame,
278
+ "spkid": spkid,
279
+ }
280
+ )
281
+
282
+ return single_feature
283
+
284
+ def get_num_frames(self, index):
285
+ utt_info = self.metadata[index]
286
+ return utt_info["num_frames"]
287
+
288
+ def align_length(self, code, pitch, duration, phone_id, frame_nums):
289
+ # align lengths of code, pitch, duration, phone_id, and frame_nums
290
+ code_len = code.shape[1]
291
+ pitch_len = len(pitch)
292
+ dur_sum = sum(duration)
293
+ min_len = min(code_len, dur_sum)
294
+ code = code[:, :min_len]
295
+ if pitch_len >= min_len:
296
+ pitch = pitch[:min_len]
297
+ else:
298
+ pitch = np.pad(pitch, (0, min_len - pitch_len), mode="edge")
299
+ frame_nums = min_len
300
+ if dur_sum > min_len:
301
+ assert (duration[-1] - (dur_sum - min_len)) >= 0
302
+ duration[-1] = duration[-1] - (dur_sum - min_len)
303
+ assert duration[-1] >= 0
304
+
305
+ return code, pitch, duration, phone_id, frame_nums
306
+
307
+ def get_target_and_reference(self, code, pitch, duration, phone_id, frame_nums):
308
+ phone_nums = len(phone_id)
309
+ clip_phone_nums = np.random.randint(
310
+ int(phone_nums * 0.1), int(phone_nums * 0.5) + 1
311
+ )
312
+ clip_phone_nums = max(clip_phone_nums, 1)
313
+ assert clip_phone_nums < phone_nums and clip_phone_nums >= 1
314
+ if self.cfg.preprocess.clip_mode == "mid":
315
+ start_idx = np.random.randint(0, phone_nums - clip_phone_nums)
316
+ elif self.cfg.preprocess.clip_mode == "start":
317
+ if duration[0] == 0 and clip_phone_nums == 1:
318
+ start_idx = 1
319
+ else:
320
+ start_idx = 0
321
+ else:
322
+ assert self.cfg.preprocess.clip_mode in ["mid", "start"]
323
+ end_idx = start_idx + clip_phone_nums
324
+ start_frames = sum(duration[:start_idx])
325
+ end_frames = sum(duration[:end_idx])
326
+
327
+ new_code = np.concatenate(
328
+ (code[:, :start_frames], code[:, end_frames:]), axis=1
329
+ )
330
+ ref_code = code[:, start_frames:end_frames]
331
+
332
+ new_pitch = np.append(pitch[:start_frames], pitch[end_frames:])
333
+ ref_pitch = pitch[start_frames:end_frames]
334
+
335
+ new_duration = np.append(duration[:start_idx], duration[end_idx:])
336
+ ref_duration = duration[start_idx:end_idx]
337
+
338
+ new_phone_id = np.append(phone_id[:start_idx], phone_id[end_idx:])
339
+ ref_phone_id = phone_id[start_idx:end_idx]
340
+
341
+ new_frame_nums = frame_nums - (end_frames - start_frames)
342
+ ref_frame_nums = end_frames - start_frames
343
+
344
+ return {
345
+ "code": new_code,
346
+ "ref_code": ref_code,
347
+ "pitch": new_pitch,
348
+ "ref_pitch": ref_pitch,
349
+ "duration": new_duration,
350
+ "ref_duration": ref_duration,
351
+ "phone_id": new_phone_id,
352
+ "ref_phone_id": ref_phone_id,
353
+ "frame_nums": new_frame_nums,
354
+ "ref_frame_nums": ref_frame_nums,
355
+ }
356
+
357
+
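To make `get_target_and_reference` easier to follow, here is a toy walk-through (invented numbers) of cutting a contiguous run of phones out of an utterance to serve as the reference prompt, with the remainder becoming the training target:

```python
import numpy as np

# Toy example, not repository code.
duration = np.array([3, 2, 4, 1])      # frames per phone
phone_id = np.array([11, 12, 13, 14])
code = np.arange(10).reshape(1, 10)    # (n_q=1, T=10) toy codec indices

start_idx, end_idx = 1, 3              # clip phones 1..2 as the reference
start_frames = duration[:start_idx].sum()   # 3
end_frames = duration[:end_idx].sum()       # 9

ref_code = code[:, start_frames:end_frames]                          # frames 3..8
new_code = np.concatenate((code[:, :start_frames], code[:, end_frames:]), axis=1)
ref_phone_id = phone_id[start_idx:end_idx]                           # [12 13]
new_phone_id = np.append(phone_id[:start_idx], phone_id[end_idx:])   # [11 14]
print(ref_code, new_code, ref_phone_id, new_phone_id, sep="\n")
```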
358
+ class NS2Collator(BaseOfflineCollator):
359
+ def __init__(self, cfg):
360
+ BaseOfflineCollator.__init__(self, cfg)
361
+
362
+ def __call__(self, batch):
363
+ packed_batch_features = dict()
364
+
365
+ # code: (B, 16, T)
366
+ # frame_nums: (B,) not used
367
+ # pitch: (B, T)
368
+ # duration: (B, N)
369
+ # phone_id: (B, N)
370
+ # phone_id_frame: (B, T)
371
+ # ref_code: (B, 16, T')
372
+ # ref_frame_nums: (B,) not used
373
+ # ref_pitch: (B, T) not used
374
+ # ref_duration: (B, N') not used
375
+ # ref_phone_id: (B, N') not used
376
+ # ref_phone_frame: (B, T') not used
377
+ # spkid: (B,) not used
378
+ # phone_mask: (B, N)
379
+ # mask: (B, T)
380
+ # ref_mask: (B, T')
381
+
382
+ for key in batch[0].keys():
383
+ if key == "phone_id":
384
+ phone_ids = [torch.LongTensor(b["phone_id"]) for b in batch]
385
+ phone_masks = [torch.ones(len(b["phone_id"])) for b in batch]
386
+ packed_batch_features["phone_id"] = pad_sequence(
387
+ phone_ids,
388
+ batch_first=True,
389
+ padding_value=0,
390
+ )
391
+ packed_batch_features["phone_mask"] = pad_sequence(
392
+ phone_masks,
393
+ batch_first=True,
394
+ padding_value=0,
395
+ )
396
+ elif key == "phone_id_frame":
397
+ phone_id_frames = [torch.LongTensor(b["phone_id_frame"]) for b in batch]
398
+ masks = [torch.ones(len(b["phone_id_frame"])) for b in batch]
399
+ packed_batch_features["phone_id_frame"] = pad_sequence(
400
+ phone_id_frames,
401
+ batch_first=True,
402
+ padding_value=0,
403
+ )
404
+ packed_batch_features["mask"] = pad_sequence(
405
+ masks,
406
+ batch_first=True,
407
+ padding_value=0,
408
+ )
409
+ elif key == "ref_code":
410
+ ref_codes = [
411
+ torch.from_numpy(b["ref_code"]).transpose(0, 1) for b in batch
412
+ ]
413
+ ref_masks = [torch.ones(max(b["ref_code"].shape[1], 1)) for b in batch]
414
+ packed_batch_features["ref_code"] = pad_sequence(
415
+ ref_codes,
416
+ batch_first=True,
417
+ padding_value=0,
418
+ ).transpose(1, 2)
419
+ packed_batch_features["ref_mask"] = pad_sequence(
420
+ ref_masks,
421
+ batch_first=True,
422
+ padding_value=0,
423
+ )
424
+ elif key == "code":
425
+ codes = [torch.from_numpy(b["code"]).transpose(0, 1) for b in batch]
426
+ masks = [torch.ones(max(b["code"].shape[1], 1)) for b in batch]
427
+ packed_batch_features["code"] = pad_sequence(
428
+ codes,
429
+ batch_first=True,
430
+ padding_value=0,
431
+ ).transpose(1, 2)
432
+ packed_batch_features["mask"] = pad_sequence(
433
+ masks,
434
+ batch_first=True,
435
+ padding_value=0,
436
+ )
437
+ elif key == "pitch":
438
+ values = [torch.from_numpy(b[key]) for b in batch]
439
+ packed_batch_features[key] = pad_sequence(
440
+ values, batch_first=True, padding_value=50.0
441
+ )
442
+ elif key == "duration":
443
+ values = [torch.from_numpy(b[key]) for b in batch]
444
+ packed_batch_features[key] = pad_sequence(
445
+ values, batch_first=True, padding_value=0
446
+ )
447
+ elif key == "frame_nums":
448
+ packed_batch_features["frame_nums"] = torch.LongTensor(
449
+ [b["frame_nums"] for b in batch]
450
+ )
451
+ elif key == "ref_frame_nums":
452
+ packed_batch_features["ref_frame_nums"] = torch.LongTensor(
453
+ [b["ref_frame_nums"] for b in batch]
454
+ )
455
+ else:
456
+ pass
457
+
458
+ return packed_batch_features
459
+
460
+
461
+ def _is_batch_full(batch, num_tokens, max_tokens, max_sentences):
462
+ if len(batch) == 0:
463
+ return 0
464
+ if len(batch) == max_sentences:
465
+ return 1
466
+ if num_tokens > max_tokens:
467
+ return 1
468
+ return 0
469
+
470
+
471
+ def batch_by_size(
472
+ indices,
473
+ num_tokens_fn,
474
+ max_tokens=None,
475
+ max_sentences=None,
476
+ required_batch_size_multiple=1,
477
+ ):
478
+ """
479
+ Yield mini-batches of indices bucketed by size. Batches may contain
480
+ sequences of different lengths.
481
+
482
+ Args:
483
+ indices (List[int]): ordered list of dataset indices
484
+ num_tokens_fn (callable): function that returns the number of tokens at
485
+ a given index
486
+ max_tokens (int, optional): max number of tokens in each batch
487
+ (default: None).
488
+ max_sentences (int, optional): max number of sentences in each
489
+ batch (default: None).
490
+ required_batch_size_multiple (int, optional): require batch size to
491
+ be a multiple of N (default: 1).
492
+ """
493
+ bsz_mult = required_batch_size_multiple
494
+
495
+ sample_len = 0
496
+ sample_lens = []
497
+ batch = []
498
+ batches = []
499
+ for i in range(len(indices)):
500
+ idx = indices[i]
501
+ num_tokens = num_tokens_fn(idx)
502
+ sample_lens.append(num_tokens)
503
+ sample_len = max(sample_len, num_tokens)
504
+
505
+ assert (
506
+ sample_len <= max_tokens
507
+ ), "sentence at index {} of size {} exceeds max_tokens " "limit of {}!".format(
508
+ idx, sample_len, max_tokens
509
+ )
510
+ num_tokens = (len(batch) + 1) * sample_len
511
+
512
+ if _is_batch_full(batch, num_tokens, max_tokens, max_sentences):
513
+ mod_len = max(
514
+ bsz_mult * (len(batch) // bsz_mult),
515
+ len(batch) % bsz_mult,
516
+ )
517
+ batches.append(batch[:mod_len])
518
+ batch = batch[mod_len:]
519
+ sample_lens = sample_lens[mod_len:]
520
+ sample_len = max(sample_lens) if len(sample_lens) > 0 else 0
521
+ batch.append(idx)
522
+ if len(batch) > 0:
523
+ batches.append(batch)
524
+ return batches
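A small usage example for `batch_by_size` (toy lengths; it assumes the module added in this diff is importable). Batches are cut when one more sample would push the padded token count past `max_tokens` or the sentence count past `max_sentences`:

```python
from models.tts.naturalspeech2.ns2_dataset import batch_by_size

lengths = [100, 120, 400, 90, 95, 380]   # toy frame counts per utterance
batches = batch_by_size(
    indices=list(range(len(lengths))),
    num_tokens_fn=lambda i: lengths[i],
    max_tokens=800,      # padded tokens per batch: len(batch) * longest sample
    max_sentences=4,
)
print(batches)           # [[0, 1], [2, 3], [4, 5]]
```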
Amphion/models/tts/naturalspeech2/ns2_inference.py ADDED
@@ -0,0 +1,128 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+ import torch
9
+ import soundfile as sf
10
+ import numpy as np
11
+
12
+ from models.tts.naturalspeech2.ns2 import NaturalSpeech2
13
+ from encodec import EncodecModel
14
+ from encodec.utils import convert_audio
15
+ from utils.util import load_config
16
+
17
+ from text import text_to_sequence
18
+ from text.cmudict import valid_symbols
19
+ from text.g2p import preprocess_english, read_lexicon
20
+
21
+ import torchaudio
22
+
23
+
24
+ class NS2Inference:
25
+ def __init__(self, args, cfg):
26
+ self.cfg = cfg
27
+ self.args = args
28
+
29
+ self.model = self.build_model()
30
+ self.codec = self.build_codec()
31
+
32
+ self.symbols = valid_symbols + ["sp", "spn", "sil"] + ["<s>", "</s>"]
33
+ self.phone2id = {s: i for i, s in enumerate(self.symbols)}
34
+ self.id2phone = {i: s for s, i in self.phone2id.items()}
35
+
36
+ def build_model(self):
37
+ model = NaturalSpeech2(self.cfg.model)
38
+ model.load_state_dict(
39
+ torch.load(
40
+ os.path.join(self.args.checkpoint_path, "pytorch_model.bin"),
41
+ map_location="cpu",
42
+ )
43
+ )
44
+ model = model.to(self.args.device)
45
+ return model
46
+
47
+ def build_codec(self):
48
+ encodec_model = EncodecModel.encodec_model_24khz()
49
+ encodec_model = encodec_model.to(device=self.args.device)
50
+ encodec_model.set_target_bandwidth(12.0)
51
+ return encodec_model
52
+
53
+ def get_ref_code(self):
54
+ ref_wav_path = self.args.ref_audio
55
+ ref_wav, sr = torchaudio.load(ref_wav_path)
56
+ ref_wav = convert_audio(
57
+ ref_wav, sr, self.codec.sample_rate, self.codec.channels
58
+ )
59
+ ref_wav = ref_wav.unsqueeze(0).to(device=self.args.device)
60
+
61
+ with torch.no_grad():
62
+ encoded_frames = self.codec.encode(ref_wav)
63
+ ref_code = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)
64
+ # print(ref_code.shape)
65
+
66
+ ref_mask = torch.ones(ref_code.shape[0], ref_code.shape[-1]).to(ref_code.device)
67
+ # print(ref_mask.shape)
68
+
69
+ return ref_code, ref_mask
70
+
71
+ def inference(self):
72
+ ref_code, ref_mask = self.get_ref_code()
73
+
74
+ lexicon = read_lexicon(self.cfg.preprocess.lexicon_path)
75
+ phone_seq = preprocess_english(self.args.text, lexicon)
76
+ print(phone_seq)
77
+
78
+ phone_id = np.array(
79
+ [
80
+ *map(
81
+ self.phone2id.get,
82
+ phone_seq.replace("{", "").replace("}", "").split(),
83
+ )
84
+ ]
85
+ )
86
+ phone_id = torch.from_numpy(phone_id).unsqueeze(0).to(device=self.args.device)
87
+ print(phone_id)
88
+
89
+ x0, prior_out = self.model.inference(
90
+ ref_code, phone_id, ref_mask, self.args.inference_step
91
+ )
92
+ print(prior_out["dur_pred"])
93
+ print(prior_out["dur_pred_round"])
94
+ print(torch.sum(prior_out["dur_pred_round"]))
95
+
96
+ latent_ref = self.codec.quantizer.vq.decode(ref_code.transpose(0, 1))
97
+
98
+ rec_wav = self.codec.decoder(x0)
99
+ # ref_wav = self.codec.decoder(latent_ref)
100
+
101
+ os.makedirs(self.args.output_dir, exist_ok=True)
102
+
103
+ sf.write(
104
+ "{}/{}.wav".format(
105
+ self.args.output_dir, self.args.text.replace(" ", "_", 100)
106
+ ),
107
+ rec_wav[0, 0].detach().cpu().numpy(),
108
+ samplerate=24000,
109
+ )
110
+
111
+ def add_arguments(parser: argparse.ArgumentParser):
112
+ parser.add_argument(
113
+ "--ref_audio",
114
+ type=str,
115
+ default="",
116
+ help="Reference audio path",
117
+ )
118
+ parser.add_argument(
119
+ "--device",
120
+ type=str,
121
+ default="cuda",
122
+ )
123
+ parser.add_argument(
124
+ "--inference_step",
125
+ type=int,
126
+ default=200,
127
+ help="Total inference steps for the diffusion model",
128
+ )
Amphion/models/tts/naturalspeech2/ns2_trainer.py ADDED
@@ -0,0 +1,798 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import shutil
8
+ import json
9
+ import time
10
+ import torch
11
+ import numpy as np
12
+ from utils.util import Logger, ValueWindow
13
+ from torch.utils.data import ConcatDataset, DataLoader
14
+ from models.tts.base.tts_trainer import TTSTrainer
15
+ from models.base.base_trainer import BaseTrainer
16
+ from models.base.base_sampler import VariableSampler
17
+ from models.tts.naturalspeech2.ns2_dataset import NS2Dataset, NS2Collator, batch_by_size
18
+ from models.tts.naturalspeech2.ns2_loss import (
19
+ log_pitch_loss,
20
+ log_dur_loss,
21
+ diff_loss,
22
+ diff_ce_loss,
23
+ )
24
+ from torch.utils.data.sampler import BatchSampler, SequentialSampler
25
+ from models.tts.naturalspeech2.ns2 import NaturalSpeech2
26
+ from torch.optim import Adam, AdamW
27
+ from torch.nn import MSELoss, L1Loss
28
+ import torch.nn.functional as F
29
+ from diffusers import get_scheduler
30
+
31
+ import accelerate
32
+ from accelerate.logging import get_logger
33
+ from accelerate.utils import ProjectConfiguration
34
+
35
+
36
+ class NS2Trainer(TTSTrainer):
37
+ def __init__(self, args, cfg):
38
+ self.args = args
39
+ self.cfg = cfg
40
+
41
+ cfg.exp_name = args.exp_name
42
+
43
+ self._init_accelerator()
44
+ self.accelerator.wait_for_everyone()
45
+
46
+ # Init logger
47
+ with self.accelerator.main_process_first():
48
+ if self.accelerator.is_main_process:
49
+ os.makedirs(os.path.join(self.exp_dir, "checkpoint"), exist_ok=True)
50
+ self.log_file = os.path.join(
51
+ os.path.join(self.exp_dir, "checkpoint"), "train.log"
52
+ )
53
+ self.logger = Logger(self.log_file, level=self.args.log_level).logger
54
+
55
+ self.time_window = ValueWindow(50)
56
+
57
+ if self.accelerator.is_main_process:
58
+ # Log some info
59
+ self.logger.info("=" * 56)
60
+ self.logger.info("||\t\t" + "New training process started." + "\t\t||")
61
+ self.logger.info("=" * 56)
62
+ self.logger.info("\n")
63
+ self.logger.debug(f"Using {args.log_level.upper()} logging level.")
64
+ self.logger.info(f"Experiment name: {args.exp_name}")
65
+ self.logger.info(f"Experiment directory: {self.exp_dir}")
66
+
67
+ self.checkpoint_dir = os.path.join(self.exp_dir, "checkpoint")
68
+ if self.accelerator.is_main_process:
69
+ os.makedirs(self.checkpoint_dir, exist_ok=True)
70
+
71
+ if self.accelerator.is_main_process:
72
+ self.logger.debug(f"Checkpoint directory: {self.checkpoint_dir}")
73
+
74
+ # init counts
75
+ self.batch_count: int = 0
76
+ self.step: int = 0
77
+ self.epoch: int = 0
78
+ self.max_epoch = (
79
+ self.cfg.train.max_epoch if self.cfg.train.max_epoch > 0 else float("inf")
80
+ )
81
+ if self.accelerator.is_main_process:
82
+ self.logger.info(
83
+ "Max epoch: {}".format(
84
+ self.max_epoch if self.max_epoch < float("inf") else "Unlimited"
85
+ )
86
+ )
87
+
88
+ # Check values
89
+ if self.accelerator.is_main_process:
90
+ self._check_basic_configs()
91
+ # Set runtime configs
92
+ self.save_checkpoint_stride = self.cfg.train.save_checkpoint_stride
93
+ self.checkpoints_path = [
94
+ [] for _ in range(len(self.save_checkpoint_stride))
95
+ ]
96
+ self.keep_last = [
97
+ i if i > 0 else float("inf") for i in self.cfg.train.keep_last
98
+ ]
99
+ self.run_eval = self.cfg.train.run_eval
100
+
101
+ # set random seed
102
+ with self.accelerator.main_process_first():
103
+ start = time.monotonic_ns()
104
+ self._set_random_seed(self.cfg.train.random_seed)
105
+ end = time.monotonic_ns()
106
+ if self.accelerator.is_main_process:
107
+ self.logger.debug(
108
+ f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
109
+ )
110
+ self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")
111
+
112
+ # setup data_loader
113
+ with self.accelerator.main_process_first():
114
+ if self.accelerator.is_main_process:
115
+ self.logger.info("Building dataset...")
116
+ start = time.monotonic_ns()
117
+ self.train_dataloader, self.valid_dataloader = self._build_dataloader()
118
+ end = time.monotonic_ns()
119
+ if self.accelerator.is_main_process:
120
+ self.logger.info(
121
+ f"Building dataset done in {(end - start) / 1e6:.2f}ms"
122
+ )
123
+
124
+ # setup model
125
+ with self.accelerator.main_process_first():
126
+ if self.accelerator.is_main_process:
127
+ self.logger.info("Building model...")
128
+ start = time.monotonic_ns()
129
+ self.model = self._build_model()
130
+ end = time.monotonic_ns()
131
+ if self.accelerator.is_main_process:
132
+ self.logger.debug(self.model)
133
+ self.logger.info(f"Building model done in {(end - start) / 1e6:.2f}ms")
134
+ self.logger.info(
135
+ f"Model parameters: {self._count_parameters(self.model)/1e6:.2f}M"
136
+ )
137
+
138
+ # optimizer & scheduler
139
+ with self.accelerator.main_process_first():
140
+ if self.accelerator.is_main_process:
141
+ self.logger.info("Building optimizer and scheduler...")
142
+ start = time.monotonic_ns()
143
+ self.optimizer = self._build_optimizer()
144
+ self.scheduler = self._build_scheduler()
145
+ end = time.monotonic_ns()
146
+ if self.accelerator.is_main_process:
147
+ self.logger.info(
148
+ f"Building optimizer and scheduler done in {(end - start) / 1e6:.2f}ms"
149
+ )
150
+
151
+ # accelerate prepare
152
+ if not self.cfg.train.use_dynamic_batchsize:
153
+ if self.accelerator.is_main_process:
154
+ self.logger.info("Initializing accelerate...")
155
+ start = time.monotonic_ns()
156
+ (
157
+ self.train_dataloader,
158
+ self.valid_dataloader,
159
+ ) = self.accelerator.prepare(
160
+ self.train_dataloader,
161
+ self.valid_dataloader,
162
+ )
163
+
164
+ if isinstance(self.model, dict):
165
+ for key in self.model.keys():
166
+ self.model[key] = self.accelerator.prepare(self.model[key])
167
+ else:
168
+ self.model = self.accelerator.prepare(self.model)
169
+
170
+ if isinstance(self.optimizer, dict):
171
+ for key in self.optimizer.keys():
172
+ self.optimizer[key] = self.accelerator.prepare(self.optimizer[key])
173
+ else:
174
+ self.optimizer = self.accelerator.prepare(self.optimizer)
175
+
176
+ if isinstance(self.scheduler, dict):
177
+ for key in self.scheduler.keys():
178
+ self.scheduler[key] = self.accelerator.prepare(self.scheduler[key])
179
+ else:
180
+ self.scheduler = self.accelerator.prepare(self.scheduler)
181
+
182
+ end = time.monotonic_ns()
183
+ if self.accelerator.is_main_process:
184
+ self.logger.info(
185
+ f"Initializing accelerate done in {(end - start) / 1e6:.2f}ms"
186
+ )
187
+
188
+ # create criterion
189
+ with self.accelerator.main_process_first():
190
+ if self.accelerator.is_main_process:
191
+ self.logger.info("Building criterion...")
192
+ start = time.monotonic_ns()
193
+ self.criterion = self._build_criterion()
194
+ end = time.monotonic_ns()
195
+ if self.accelerator.is_main_process:
196
+ self.logger.info(
197
+ f"Building criterion done in {(end - start) / 1e6:.2f}ms"
198
+ )
199
+
200
+ # TODO: Resume from ckpt need test/debug
201
+ with self.accelerator.main_process_first():
202
+ if args.resume:
203
+ if self.accelerator.is_main_process:
204
+ self.logger.info("Resuming from checkpoint...")
205
+ start = time.monotonic_ns()
206
+ ckpt_path = self._load_model(
207
+ self.checkpoint_dir,
208
+ args.checkpoint_path,
209
+ resume_type=args.resume_type,
210
+ )
211
+ end = time.monotonic_ns()
212
+ if self.accelerator.is_main_process:
213
+ self.logger.info(
214
+ f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
215
+ )
216
+ self.checkpoints_path = json.load(
217
+ open(os.path.join(ckpt_path, "ckpts.json"), "r")
218
+ )
219
+
220
+ self.checkpoint_dir = os.path.join(self.exp_dir, "checkpoint")
221
+ if self.accelerator.is_main_process:
222
+ os.makedirs(self.checkpoint_dir, exist_ok=True)
223
+ if self.accelerator.is_main_process:
224
+ self.logger.debug(f"Checkpoint directory: {self.checkpoint_dir}")
225
+
226
+ # save config file path
227
+ self.config_save_path = os.path.join(self.exp_dir, "args.json")
228
+
229
+ # Only for TTS tasks
230
+ self.task_type = "TTS"
231
+ if self.accelerator.is_main_process:
232
+ self.logger.info("Task type: {}".format(self.task_type))
233
+
234
+ def _init_accelerator(self):
235
+ self.exp_dir = os.path.join(
236
+ os.path.abspath(self.cfg.log_dir), self.args.exp_name
237
+ )
238
+ project_config = ProjectConfiguration(
239
+ project_dir=self.exp_dir,
240
+ logging_dir=os.path.join(self.exp_dir, "log"),
241
+ )
242
+ # ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
243
+ self.accelerator = accelerate.Accelerator(
244
+ gradient_accumulation_steps=self.cfg.train.gradient_accumulation_step,
245
+ log_with=self.cfg.train.tracker,
246
+ project_config=project_config,
247
+ # kwargs_handlers=[ddp_kwargs]
248
+ )
249
+ if self.accelerator.is_main_process:
250
+ os.makedirs(project_config.project_dir, exist_ok=True)
251
+ os.makedirs(project_config.logging_dir, exist_ok=True)
252
+ with self.accelerator.main_process_first():
253
+ self.accelerator.init_trackers(self.args.exp_name)
254
+
255
+ def _build_model(self):
256
+ model = NaturalSpeech2(cfg=self.cfg.model)
257
+ return model
258
+
259
+ def _build_dataset(self):
260
+ return NS2Dataset, NS2Collator
261
+
262
+ def _build_dataloader(self):
263
+ if self.cfg.train.use_dynamic_batchsize:
264
+ print("Use Dynamic Batchsize......")
265
+ Dataset, Collator = self._build_dataset()
266
+ train_dataset = Dataset(self.cfg, self.cfg.dataset[0], is_valid=False)
267
+ train_collate = Collator(self.cfg)
268
+ batch_sampler = batch_by_size(
269
+ train_dataset.num_frame_indices,
270
+ train_dataset.get_num_frames,
271
+ max_tokens=self.cfg.train.max_tokens * self.accelerator.num_processes,
272
+ max_sentences=self.cfg.train.max_sentences
273
+ * self.accelerator.num_processes,
274
+ required_batch_size_multiple=self.accelerator.num_processes,
275
+ )
276
+ np.random.seed(980205)
277
+ np.random.shuffle(batch_sampler)
278
+ print(batch_sampler[:1])
279
+ batches = [
280
+ x[
281
+ self.accelerator.local_process_index :: self.accelerator.num_processes
282
+ ]
283
+ for x in batch_sampler
284
+ if len(x) % self.accelerator.num_processes == 0
285
+ ]
286
+
287
+ train_loader = DataLoader(
288
+ train_dataset,
289
+ collate_fn=train_collate,
290
+ num_workers=self.cfg.train.dataloader.num_worker,
291
+ batch_sampler=VariableSampler(
292
+ batches, drop_last=False, use_random_sampler=True
293
+ ),
294
+ pin_memory=self.cfg.train.dataloader.pin_memory,
295
+ )
296
+ self.accelerator.wait_for_everyone()
297
+
298
+ valid_dataset = Dataset(self.cfg, self.cfg.dataset[0], is_valid=True)
299
+ valid_collate = Collator(self.cfg)
300
+ batch_sampler = batch_by_size(
301
+ valid_dataset.num_frame_indices,
302
+ valid_dataset.get_num_frames,
303
+ max_tokens=self.cfg.train.max_tokens * self.accelerator.num_processes,
304
+ max_sentences=self.cfg.train.max_sentences
305
+ * self.accelerator.num_processes,
306
+ required_batch_size_multiple=self.accelerator.num_processes,
307
+ )
308
+ batches = [
309
+ x[
310
+ self.accelerator.local_process_index :: self.accelerator.num_processes
311
+ ]
312
+ for x in batch_sampler
313
+ if len(x) % self.accelerator.num_processes == 0
314
+ ]
315
+ valid_loader = DataLoader(
316
+ valid_dataset,
317
+ collate_fn=valid_collate,
318
+ num_workers=self.cfg.train.dataloader.num_worker,
319
+ batch_sampler=VariableSampler(batches, drop_last=False),
320
+ pin_memory=self.cfg.train.dataloader.pin_memory,
321
+ )
322
+ self.accelerator.wait_for_everyone()
323
+
324
+ else:
325
+ print("Use Normal Batchsize......")
326
+ Dataset, Collator = self._build_dataset()
327
+ train_dataset = Dataset(self.cfg, self.cfg.dataset[0], is_valid=False)
328
+ train_collate = Collator(self.cfg)
329
+
330
+ train_loader = DataLoader(
331
+ train_dataset,
332
+ shuffle=True,
333
+ collate_fn=train_collate,
334
+ batch_size=self.cfg.train.batch_size,
335
+ num_workers=self.cfg.train.dataloader.num_worker,
336
+ pin_memory=self.cfg.train.dataloader.pin_memory,
337
+ )
338
+
339
+ valid_dataset = Dataset(self.cfg, self.cfg.dataset[0], is_valid=True)
340
+ valid_collate = Collator(self.cfg)
341
+
342
+ valid_loader = DataLoader(
343
+ valid_dataset,
344
+ shuffle=True,
345
+ collate_fn=valid_collate,
346
+ batch_size=self.cfg.train.batch_size,
347
+ num_workers=self.cfg.train.dataloader.num_worker,
348
+ pin_memory=self.cfg.train.dataloader.pin_memory,
349
+ )
350
+ self.accelerator.wait_for_everyone()
351
+
352
+ return train_loader, valid_loader
353
+
354
+ def _build_optimizer(self):
355
+ optimizer = torch.optim.AdamW(
356
+ filter(lambda p: p.requires_grad, self.model.parameters()),
357
+ **self.cfg.train.adam,
358
+ )
359
+ return optimizer
360
+
361
+ def _build_scheduler(self):
362
+ lr_scheduler = get_scheduler(
363
+ self.cfg.train.lr_scheduler,
364
+ optimizer=self.optimizer,
365
+ num_warmup_steps=self.cfg.train.lr_warmup_steps,
366
+ num_training_steps=self.cfg.train.num_train_steps,
367
+ )
368
+ return lr_scheduler
369
+
370
+ def _build_criterion(self):
371
+ criterion = torch.nn.L1Loss(reduction="mean")
372
+ return criterion
373
+
374
+ def write_summary(self, losses, stats):
375
+ for key, value in losses.items():
376
+ self.sw.add_scalar(key, value, self.step)
377
+
378
+ def write_valid_summary(self, losses, stats):
379
+ for key, value in losses.items():
380
+ self.sw.add_scalar(key, value, self.step)
381
+
382
+ def get_state_dict(self):
383
+ state_dict = {
384
+ "model": self.model.state_dict(),
385
+ "optimizer": self.optimizer.state_dict(),
386
+ "scheduler": self.scheduler.state_dict(),
387
+ "step": self.step,
388
+ "epoch": self.epoch,
389
+ "batch_size": self.cfg.train.batch_size,
390
+ }
391
+ return state_dict
392
+
393
+ def load_model(self, checkpoint):
394
+ self.step = checkpoint["step"]
395
+ self.epoch = checkpoint["epoch"]
396
+
397
+ self.model.load_state_dict(checkpoint["model"])
398
+ self.optimizer.load_state_dict(checkpoint["optimizer"])
399
+ self.scheduler.load_state_dict(checkpoint["scheduler"])
400
+
401
+ def _train_step(self, batch):
402
+ train_losses = {}
403
+ total_loss = 0
404
+ train_stats = {}
405
+
406
+ code = batch["code"] # (B, 16, T)
407
+ pitch = batch["pitch"] # (B, T)
408
+ duration = batch["duration"] # (B, N)
409
+ phone_id = batch["phone_id"] # (B, N)
410
+ ref_code = batch["ref_code"] # (B, 16, T')
411
+ phone_mask = batch["phone_mask"] # (B, N)
412
+ mask = batch["mask"] # (B, T)
413
+ ref_mask = batch["ref_mask"] # (B, T')
414
+
415
+ diff_out, prior_out = self.model(
416
+ code=code,
417
+ pitch=pitch,
418
+ duration=duration,
419
+ phone_id=phone_id,
420
+ ref_code=ref_code,
421
+ phone_mask=phone_mask,
422
+ mask=mask,
423
+ ref_mask=ref_mask,
424
+ )
425
+
426
+ # pitch loss
427
+ pitch_loss = log_pitch_loss(prior_out["pitch_pred_log"], pitch, mask=mask)
428
+ total_loss += pitch_loss
429
+ train_losses["pitch_loss"] = pitch_loss
430
+
431
+ # duration loss
432
+ dur_loss = log_dur_loss(prior_out["dur_pred_log"], duration, mask=phone_mask)
433
+ total_loss += dur_loss
434
+ train_losses["dur_loss"] = dur_loss
435
+
436
+ x0 = self.model.module.code_to_latent(code)
437
+ if self.cfg.model.diffusion.diffusion_type == "diffusion":
438
+ # diff loss x0
439
+ diff_loss_x0 = diff_loss(diff_out["x0_pred"], x0, mask=mask)
440
+ total_loss += diff_loss_x0
441
+ train_losses["diff_loss_x0"] = diff_loss_x0
442
+
443
+ # diff loss noise
444
+ diff_loss_noise = diff_loss(
445
+ diff_out["noise_pred"], diff_out["noise"], mask=mask
446
+ )
447
+ total_loss += diff_loss_noise * self.cfg.train.diff_noise_loss_lambda
448
+ train_losses["diff_loss_noise"] = diff_loss_noise
449
+
450
+ elif self.cfg.model.diffusion.diffusion_type == "flow":
451
+ # diff flow matching loss
452
+ flow_gt = diff_out["noise"] - x0
453
+ diff_loss_flow = diff_loss(diff_out["flow_pred"], flow_gt, mask=mask)
454
+ total_loss += diff_loss_flow
455
+ train_losses["diff_loss_flow"] = diff_loss_flow
456
+
457
+ # diff loss ce
458
+
459
+ # (nq, B, T); (nq, B, T, 1024)
460
+ if self.cfg.train.diff_ce_loss_lambda > 0:
461
+ pred_indices, pred_dist = self.model.module.latent_to_code(
462
+ diff_out["x0_pred"], nq=code.shape[1]
463
+ )
464
+ gt_indices, _ = self.model.module.latent_to_code(x0, nq=code.shape[1])
465
+ diff_loss_ce = diff_ce_loss(pred_dist, gt_indices, mask=mask)
466
+ total_loss += diff_loss_ce * self.cfg.train.diff_ce_loss_lambda
467
+ train_losses["diff_loss_ce"] = diff_loss_ce
468
+
469
+ self.optimizer.zero_grad()
470
+ # total_loss.backward()
471
+ self.accelerator.backward(total_loss)
472
+ if self.accelerator.sync_gradients:
473
+ self.accelerator.clip_grad_norm_(
474
+ filter(lambda p: p.requires_grad, self.model.parameters()), 0.5
475
+ )
476
+ self.optimizer.step()
477
+ self.scheduler.step()
478
+
479
+ for item in train_losses:
480
+ train_losses[item] = train_losses[item].item()
481
+
482
+ if self.cfg.train.diff_ce_loss_lambda > 0:
483
+ pred_indices_list = pred_indices.long().detach().cpu().numpy()
484
+ gt_indices_list = gt_indices.long().detach().cpu().numpy()
485
+ mask_list = batch["mask"].detach().cpu().numpy()
486
+
487
+ for i in range(pred_indices_list.shape[0]):
488
+ pred_acc = np.sum(
489
+ (pred_indices_list[i] == gt_indices_list[i]) * mask_list
490
+ ) / np.sum(mask_list)
491
+ train_losses["pred_acc_{}".format(str(i))] = pred_acc
492
+
493
+ train_losses["batch_size"] = code.shape[0]
494
+ train_losses["max_frame_nums"] = np.max(
495
+ batch["frame_nums"].detach().cpu().numpy()
496
+ )
497
+
498
+ return (total_loss.item(), train_losses, train_stats)
499
+
500
+ @torch.inference_mode()
501
+ def _valid_step(self, batch):
502
+ valid_losses = {}
503
+ total_loss = 0
504
+ valid_stats = {}
505
+
506
+ code = batch["code"] # (B, 16, T)
507
+ pitch = batch["pitch"] # (B, T)
508
+ duration = batch["duration"] # (B, N)
509
+ phone_id = batch["phone_id"] # (B, N)
510
+ ref_code = batch["ref_code"] # (B, 16, T')
511
+ phone_mask = batch["phone_mask"] # (B, N)
512
+ mask = batch["mask"] # (B, T)
513
+ ref_mask = batch["ref_mask"] # (B, T')
514
+
515
+ diff_out, prior_out = self.model(
516
+ code=code,
517
+ pitch=pitch,
518
+ duration=duration,
519
+ phone_id=phone_id,
520
+ ref_code=ref_code,
521
+ phone_mask=phone_mask,
522
+ mask=mask,
523
+ ref_mask=ref_mask,
524
+ )
525
+
526
+ # pitch loss
527
+ pitch_loss = log_pitch_loss(prior_out["pitch_pred_log"], pitch, mask=mask)
528
+ total_loss += pitch_loss
529
+ valid_losses["pitch_loss"] = pitch_loss
530
+
531
+ # duration loss
532
+ dur_loss = log_dur_loss(prior_out["dur_pred_log"], duration, mask=phone_mask)
533
+ total_loss += dur_loss
534
+ valid_losses["dur_loss"] = dur_loss
535
+
536
+ x0 = self.model.module.code_to_latent(code)
537
+ if self.cfg.model.diffusion.diffusion_type == "diffusion":
538
+ # diff loss x0
539
+ diff_loss_x0 = diff_loss(diff_out["x0_pred"], x0, mask=mask)
540
+ total_loss += diff_loss_x0
541
+ valid_losses["diff_loss_x0"] = diff_loss_x0
542
+
543
+ # diff loss noise
544
+ diff_loss_noise = diff_loss(
545
+ diff_out["noise_pred"], diff_out["noise"], mask=mask
546
+ )
547
+ total_loss += diff_loss_noise * self.cfg.train.diff_noise_loss_lambda
548
+ valid_losses["diff_loss_noise"] = diff_loss_noise
549
+
550
+ elif self.cfg.model.diffusion.diffusion_type == "flow":
551
+ # diff flow matching loss
552
+ flow_gt = diff_out["noise"] - x0
553
+ diff_loss_flow = diff_loss(diff_out["flow_pred"], flow_gt, mask=mask)
554
+ total_loss += diff_loss_flow
555
+ valid_losses["diff_loss_flow"] = diff_loss_flow
556
+
557
+ # diff loss ce
558
+
559
+ # (nq, B, T); (nq, B, T, 1024)
560
+ if self.cfg.train.diff_ce_loss_lambda > 0:
561
+ pred_indices, pred_dist = self.model.module.latent_to_code(
562
+ diff_out["x0_pred"], nq=code.shape[1]
563
+ )
564
+ gt_indices, _ = self.model.module.latent_to_code(x0, nq=code.shape[1])
565
+ diff_loss_ce = diff_ce_loss(pred_dist, gt_indices, mask=mask)
566
+ total_loss += diff_loss_ce * self.cfg.train.diff_ce_loss_lambda
567
+ valid_losses["diff_loss_ce"] = diff_loss_ce
568
+
569
+ for item in valid_losses:
570
+ valid_losses[item] = valid_losses[item].item()
571
+
572
+ if self.cfg.train.diff_ce_loss_lambda > 0:
573
+ pred_indices_list = pred_indices.long().detach().cpu().numpy()
574
+ gt_indices_list = gt_indices.long().detach().cpu().numpy()
575
+ mask_list = batch["mask"].detach().cpu().numpy()
576
+
577
+ for i in range(pred_indices_list.shape[0]):
578
+ pred_acc = np.sum(
579
+ (pred_indices_list[i] == gt_indices_list[i]) * mask_list
580
+ ) / np.sum(mask_list)
581
+ valid_losses["pred_acc_{}".format(str(i))] = pred_acc
582
+
583
+ return (total_loss.item(), valid_losses, valid_stats)
584
+
585
+ @torch.inference_mode()
586
+ def _valid_epoch(self):
587
+ r"""Testing epoch. Should return average loss of a batch (sample) over
588
+ one epoch. See ``train_loop`` for usage.
589
+ """
590
+ if isinstance(self.model, dict):
591
+ for key in self.model.keys():
592
+ self.model[key].eval()
593
+ else:
594
+ self.model.eval()
595
+
596
+ epoch_sum_loss = 0.0
597
+ epoch_losses = dict()
598
+
599
+ for batch in self.valid_dataloader:
600
+ # Put the data to cuda device
601
+ device = self.accelerator.device
602
+ for k, v in batch.items():
603
+ if isinstance(v, torch.Tensor):
604
+ batch[k] = v.to(device)
605
+
606
+ total_loss, valid_losses, valid_stats = self._valid_step(batch)
607
+ epoch_sum_loss = total_loss
608
+ for key, value in valid_losses.items():
609
+ epoch_losses[key] = value
610
+
611
+ self.accelerator.wait_for_everyone()
612
+
613
+ return epoch_sum_loss, epoch_losses
614
+
615
+ def _train_epoch(self):
616
+ r"""Training epoch. Should return average loss of a batch (sample) over
617
+ one epoch. See ``train_loop`` for usage.
618
+ """
619
+ if isinstance(self.model, dict):
620
+ for key in self.model.keys():
621
+ self.model[key].train()
622
+ else:
623
+ self.model.train()
624
+
625
+ epoch_sum_loss: float = 0.0
626
+ epoch_losses: dict = {}
627
+ epoch_step: int = 0
628
+
629
+ for batch in self.train_dataloader:
630
+ # Put the data to cuda device
631
+ device = self.accelerator.device
632
+ for k, v in batch.items():
633
+ if isinstance(v, torch.Tensor):
634
+ batch[k] = v.to(device)
635
+
636
+ # Do training step and BP
637
+ with self.accelerator.accumulate(self.model):
638
+ total_loss, train_losses, training_stats = self._train_step(batch)
639
+ self.batch_count += 1
640
+
641
+ # Update info for each step
642
+ # TODO: step means BP counts or batch counts?
643
+ if self.batch_count % self.cfg.train.gradient_accumulation_step == 0:
644
+ epoch_sum_loss = total_loss
645
+ for key, value in train_losses.items():
646
+ epoch_losses[key] = value
647
+
648
+ if isinstance(train_losses, dict):
649
+ for key, loss in train_losses.items():
650
+ self.accelerator.log(
651
+ {"Epoch/Train {} Loss".format(key): loss},
652
+ step=self.step,
653
+ )
654
+
655
+ if (
656
+ self.accelerator.is_main_process
657
+ and self.batch_count
658
+ % (1 * self.cfg.train.gradient_accumulation_step)
659
+ == 0
660
+ ):
661
+ self.echo_log(train_losses, mode="Training")
662
+
663
+ self.step += 1
664
+ epoch_step += 1
665
+
666
+ self.accelerator.wait_for_everyone()
667
+
668
+ return epoch_sum_loss, epoch_losses
669
+
670
+ def train_loop(self):
671
+ r"""Training loop. The public entry of training process."""
672
+ # Wait everyone to prepare before we move on
673
+ self.accelerator.wait_for_everyone()
674
+ # dump config file
675
+ if self.accelerator.is_main_process:
676
+ self._dump_cfg(self.config_save_path)
677
+
678
+ # self.optimizer.zero_grad()
679
+
680
+ # Wait to ensure good to go
681
+ self.accelerator.wait_for_everyone()
682
+ while self.epoch < self.max_epoch:
683
+ if self.accelerator.is_main_process:
684
+ self.logger.info("\n")
685
+ self.logger.info("-" * 32)
686
+ self.logger.info("Epoch {}: ".format(self.epoch))
687
+
688
+ # Do training & validating epoch
689
+ train_total_loss, train_losses = self._train_epoch()
690
+ if isinstance(train_losses, dict):
691
+ for key, loss in train_losses.items():
692
+ if self.accelerator.is_main_process:
693
+ self.logger.info(" |- Train/{} Loss: {:.6f}".format(key, loss))
694
+ self.accelerator.log(
695
+ {"Epoch/Train {} Loss".format(key): loss},
696
+ step=self.epoch,
697
+ )
698
+
699
+ valid_total_loss, valid_losses = self._valid_epoch()
700
+ if isinstance(valid_losses, dict):
701
+ for key, loss in valid_losses.items():
702
+ if self.accelerator.is_main_process:
703
+ self.logger.info(" |- Valid/{} Loss: {:.6f}".format(key, loss))
704
+ self.accelerator.log(
705
+ {"Epoch/Train {} Loss".format(key): loss},
706
+ step=self.epoch,
707
+ )
708
+
709
+ if self.accelerator.is_main_process:
710
+ self.logger.info(" |- Train/Loss: {:.6f}".format(train_total_loss))
711
+ self.logger.info(" |- Valid/Loss: {:.6f}".format(valid_total_loss))
712
+ self.accelerator.log(
713
+ {
714
+ "Epoch/Train Loss": train_total_loss,
715
+ "Epoch/Valid Loss": valid_total_loss,
716
+ },
717
+ step=self.epoch,
718
+ )
719
+
720
+ self.accelerator.wait_for_everyone()
721
+ if isinstance(self.scheduler, dict):
722
+ for key in self.scheduler.keys():
723
+ self.scheduler[key].step()
724
+ else:
725
+ self.scheduler.step()
726
+
727
+ # Check if hit save_checkpoint_stride and run_eval
728
+ run_eval = False
729
+ if self.accelerator.is_main_process:
730
+ save_checkpoint = False
731
+ hit_dix = []
732
+ for i, num in enumerate(self.save_checkpoint_stride):
733
+ if self.epoch % num == 0:
734
+ save_checkpoint = True
735
+ hit_dix.append(i)
736
+ run_eval |= self.run_eval[i]
737
+
738
+ self.accelerator.wait_for_everyone()
739
+ if self.accelerator.is_main_process and save_checkpoint:
740
+ path = os.path.join(
741
+ self.checkpoint_dir,
742
+ "epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
743
+ self.epoch, self.step, train_total_loss
744
+ ),
745
+ )
746
+ print("save state......")
747
+ self.accelerator.save_state(path)
748
+ print("finish saving state......")
749
+ json.dump(
750
+ self.checkpoints_path,
751
+ open(os.path.join(path, "ckpts.json"), "w"),
752
+ ensure_ascii=False,
753
+ indent=4,
754
+ )
755
+ # Remove old checkpoints
756
+ to_remove = []
757
+ for idx in hit_dix:
758
+ self.checkpoints_path[idx].append(path)
759
+ while len(self.checkpoints_path[idx]) > self.keep_last[idx]:
760
+ to_remove.append((idx, self.checkpoints_path[idx].pop(0)))
761
+
762
+ # Search conflicts
763
+ total = set()
764
+ for i in self.checkpoints_path:
765
+ total |= set(i)
766
+ do_remove = set()
767
+ for idx, path in to_remove[::-1]:
768
+ if path in total:
769
+ self.checkpoints_path[idx].insert(0, path)
770
+ else:
771
+ do_remove.add(path)
772
+
773
+ # Remove old checkpoints
774
+ for path in do_remove:
775
+ shutil.rmtree(path, ignore_errors=True)
776
+ if self.accelerator.is_main_process:
777
+ self.logger.debug(f"Remove old checkpoint: {path}")
778
+
779
+ self.accelerator.wait_for_everyone()
780
+ if run_eval:
781
+ # TODO: run evaluation
782
+ pass
783
+
784
+ # Update info for each epoch
785
+ self.epoch += 1
786
+
787
+ # Finish training and save final checkpoint
788
+ self.accelerator.wait_for_everyone()
789
+ if self.accelerator.is_main_process:
790
+ self.accelerator.save_state(
791
+ os.path.join(
792
+ self.checkpoint_dir,
793
+ "final_epoch-{:04d}_step-{:07d}_loss-{:.6f}".format(
794
+ self.epoch, self.step, valid_total_loss
795
+ ),
796
+ )
797
+ )
798
+ self.accelerator.end_training()
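
Note on the dynamic-batch-size path in `_build_dataloader` above: `batch_by_size` produces variable-sized batches, which are then sliced across processes so every rank sees a disjoint, interleaved part of each batch. A minimal standalone sketch of that slicing logic (toy indices, no Amphion imports; all values here are illustrative):

```python
# Toy sketch of the per-process batch slicing used in _build_dataloader above.
num_processes = 2          # e.g. accelerator.num_processes
local_process_index = 0    # rank of the current process

# Pretend batch_by_size already produced these variable-sized batches.
batch_sampler = [[0, 1, 2, 3], [4, 5], [6, 7, 8]]

# Keep only batches whose size divides evenly across processes, then give
# every process an interleaved slice of each remaining batch.
batches = [
    b[local_process_index::num_processes]
    for b in batch_sampler
    if len(b) % num_processes == 0
]
print(batches)  # rank 0 sees [[0, 2], [4]]; rank 1 would see [[1, 3], [5]]
```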
Amphion/models/tts/valle/__init__.py ADDED
File without changes
Amphion/models/vocoders/autoregressive/autoregressive_vocoder_inference.py ADDED
File without changes
Amphion/models/vocoders/autoregressive/wavenet/conv.py ADDED
@@ -0,0 +1,66 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+
10
+ class Conv1d(nn.Conv1d):
11
+ """Extended nn.Conv1d for incremental dilated convolutions"""
12
+
13
+ def __init__(self, *args, **kwargs):
14
+ super().__init__(*args, **kwargs)
15
+ self.clear_buffer()
16
+ self._linearized_weight = None
17
+ self.register_backward_hook(self._clear_linearized_weight)
18
+
19
+ def incremental_forward(self, input):
20
+ # input (B, T, C)
21
+ # run forward pre hooks
22
+ for hook in self._forward_pre_hooks.values():
23
+ hook(self, input)
24
+
25
+ # reshape weight
26
+ weight = self._get_linearized_weight()
27
+ kw = self.kernel_size[0]
28
+ dilation = self.dilation[0]
29
+
30
+ bsz = input.size(0)
31
+ if kw > 1:
32
+ input = input.data
33
+ if self.input_buffer is None:
34
+ self.input_buffer = input.new(
35
+ bsz, kw + (kw - 1) * (dilation - 1), input.size(2)
36
+ )
37
+ self.input_buffer.zero_()
38
+ else:
39
+ # shift buffer
40
+ self.input_buffer[:, :-1, :] = self.input_buffer[:, 1:, :].clone()
41
+ # append next input
42
+ self.input_buffer[:, -1, :] = input[:, -1, :]
43
+ input = self.input_buffer
44
+ if dilation > 1:
45
+ input = input[:, 0::dilation, :].contiguous()
46
+ output = F.linear(input.view(bsz, -1), weight, self.bias)
47
+ return output.view(bsz, 1, -1)
48
+
49
+ def clear_buffer(self):
50
+ self.input_buffer = None
51
+
52
+ def _get_linearized_weight(self):
53
+ if self._linearized_weight is None:
54
+ kw = self.kernel_size[0]
55
+ # nn.Conv1d
56
+ if self.weight.size() == (self.out_channels, self.in_channels, kw):
57
+ weight = self.weight.transpose(1, 2).contiguous()
58
+ else:
59
+ # fairseq.modules.conv_tbc.ConvTBC
60
+ weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous()
61
+ assert weight.size() == (self.out_channels, kw, self.in_channels)
62
+ self._linearized_weight = weight.view(self.out_channels, -1)
63
+ return self._linearized_weight
64
+
65
+ def _clear_linearized_weight(self, *args):
66
+ self._linearized_weight = None
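
`incremental_forward` keeps a rolling input buffer so autoregressive decoding can push one frame at a time instead of re-running the full convolution. A hedged usage sketch (channel sizes and step count are made up; it assumes the Amphion repo root is on `PYTHONPATH`):

```python
import torch
from models.vocoders.autoregressive.wavenet.conv import Conv1d

# Illustrative sizes only; Conv1d is the incremental subclass defined above.
conv = Conv1d(4, 8, kernel_size=3, dilation=2)

conv.clear_buffer()                      # start from an empty history
outputs = []
for _ in range(5):                       # 5 autoregressive steps
    frame = torch.randn(1, 1, 4)         # (B, T=1, C_in), one new frame
    outputs.append(conv.incremental_forward(frame))  # (B, 1, C_out)

y = torch.cat(outputs, dim=1)            # (B, 5, C_out)
conv.clear_buffer()                      # reset before generating the next utterance
```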
Amphion/models/vocoders/autoregressive/wavenet/wavenet.py ADDED
@@ -0,0 +1,170 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+
8
+ from torch import nn
9
+ from torch.nn import functional as F
10
+
11
+ from .modules import Conv1d1x1, ResidualConv1dGLU
12
+ from .upsample import ConvInUpsampleNetwork
13
+
14
+
15
+ def receptive_field_size(
16
+ total_layers, num_cycles, kernel_size, dilation=lambda x: 2**x
17
+ ):
18
+ """Compute receptive field size
19
+
20
+ Args:
21
+ total_layers (int): total layers
22
+ num_cycles (int): cycles
23
+ kernel_size (int): kernel size
24
+ dilation (lambda): lambda to compute dilation factor. ``lambda x : 1``
25
+ to disable dilated convolution.
26
+
27
+ Returns:
28
+ int: receptive field size in samples
29
+
30
+ """
31
+ assert total_layers % num_cycles == 0
32
+
33
+ layers_per_cycle = total_layers // num_cycles
34
+ dilations = [dilation(i % layers_per_cycle) for i in range(total_layers)]
35
+ return (kernel_size - 1) * sum(dilations) + 1
36
+
37
+
38
+ class WaveNet(nn.Module):
39
+ """The WaveNet model that supports local and global conditioning.
40
+
41
+ Args:
42
+ out_channels (int): Output channels. If the input is a mu-law quantized
43
+ one-hot vector, this must equal the number of quantize channels. Otherwise,
44
+ it is num_mixtures x 3 (pi, mu, log_scale).
45
+ layers (int): Number of total layers
46
+ stacks (int): Number of dilation cycles
47
+ residual_channels (int): Residual input / output channels
48
+ gate_channels (int): Gated activation channels.
49
+ skip_out_channels (int): Skip connection channels.
50
+ kernel_size (int): Kernel size of convolution layers.
51
+ dropout (float): Dropout probability.
52
+ input_dim (int): Number of mel-spec dimensions.
53
+ upsample_scales (list): List of upsample scales.
54
+ ``np.prod(upsample_scales)`` must equal the hop size. Used only if
55
+ upsample_conditional_features is enabled.
56
+ freq_axis_kernel_size (int): Freq-axis kernel_size for transposed
57
+ convolution layers for upsampling. If you only care about time-axis
58
+ upsampling, set this to 1.
59
+ scalar_input (bool): If True, a scalar input in [-1, 1] is expected; otherwise,
60
+ a quantized one-hot vector is expected.
61
+ """
62
+
63
+ def __init__(self, cfg):
64
+ super(WaveNet, self).__init__()
65
+ self.cfg = cfg
66
+ self.scalar_input = self.cfg.VOCODER.SCALAR_INPUT
67
+ self.out_channels = self.cfg.VOCODER.OUT_CHANNELS
68
+ self.cin_channels = self.cfg.VOCODER.INPUT_DIM
69
+ self.residual_channels = self.cfg.VOCODER.RESIDUAL_CHANNELS
70
+ self.layers = self.cfg.VOCODER.LAYERS
71
+ self.stacks = self.cfg.VOCODER.STACKS
72
+ self.gate_channels = self.cfg.VOCODER.GATE_CHANNELS
73
+ self.kernel_size = self.cfg.VOCODER.KERNEL_SIZE
74
+ self.skip_out_channels = self.cfg.VOCODER.SKIP_OUT_CHANNELS
75
+ self.dropout = self.cfg.VOCODER.DROPOUT
76
+ self.upsample_scales = self.cfg.VOCODER.UPSAMPLE_SCALES
77
+ self.mel_frame_pad = self.cfg.VOCODER.MEL_FRAME_PAD
78
+
79
+ assert self.layers % self.stacks == 0
80
+
81
+ layers_per_stack = self.layers // self.stacks
82
+ if self.scalar_input:
83
+ self.first_conv = Conv1d1x1(1, self.residual_channels)
84
+ else:
85
+ self.first_conv = Conv1d1x1(self.out_channels, self.residual_channels)
86
+
87
+ self.conv_layers = nn.ModuleList()
88
+ for layer in range(self.layers):
89
+ dilation = 2 ** (layer % layers_per_stack)
90
+ conv = ResidualConv1dGLU(
91
+ self.residual_channels,
92
+ self.gate_channels,
93
+ kernel_size=self.kernel_size,
94
+ skip_out_channels=self.skip_out_channels,
95
+ bias=True,
96
+ dilation=dilation,
97
+ dropout=self.dropout,
98
+ cin_channels=self.cin_channels,
99
+ )
100
+ self.conv_layers.append(conv)
101
+
102
+ self.last_conv_layers = nn.ModuleList(
103
+ [
104
+ nn.ReLU(inplace=True),
105
+ Conv1d1x1(self.skip_out_channels, self.skip_out_channels),
106
+ nn.ReLU(inplace=True),
107
+ Conv1d1x1(self.skip_out_channels, self.out_channels),
108
+ ]
109
+ )
110
+
111
+ self.upsample_net = ConvInUpsampleNetwork(
112
+ upsample_scales=self.upsample_scales,
113
+ cin_pad=self.mel_frame_pad,
114
+ cin_channels=self.cin_channels,
115
+ )
116
+
117
+ self.receptive_field = receptive_field_size(
118
+ self.layers, self.stacks, self.kernel_size
119
+ )
120
+
121
+ def forward(self, x, mel, softmax=False):
122
+ """Forward step
123
+
124
+ Args:
125
+ x (Tensor): One-hot encoded audio signal, shape (B x C x T)
126
+ mel (Tensor): Local conditioning features,
127
+ shape (B x cin_channels x T)
128
+ softmax (bool): Whether applies softmax or not.
129
+
130
+ Returns:
131
+ Tensor: output, shape B x out_channels x T
132
+ """
133
+ B, _, T = x.size()
134
+
135
+ mel = self.upsample_net(mel)
136
+ assert mel.shape[-1] == x.shape[-1]
137
+
138
+ x = self.first_conv(x)
139
+ skips = 0
140
+ for f in self.conv_layers:
141
+ x, h = f(x, mel)
142
+ skips += h
143
+ skips *= math.sqrt(1.0 / len(self.conv_layers))
144
+
145
+ x = skips
146
+ for f in self.last_conv_layers:
147
+ x = f(x)
148
+
149
+ x = F.softmax(x, dim=1) if softmax else x
150
+
151
+ return x
152
+
153
+ def clear_buffer(self):
154
+ self.first_conv.clear_buffer()
155
+ for f in self.conv_layers:
156
+ f.clear_buffer()
157
+ for f in self.last_conv_layers:
158
+ try:
159
+ f.clear_buffer()
160
+ except AttributeError:
161
+ pass
162
+
163
+ def make_generation_fast_(self):
164
+ def remove_weight_norm(m):
165
+ try:
166
+ nn.utils.remove_weight_norm(m)
167
+ except ValueError: # this module didn't have weight norm
168
+ return
169
+
170
+ self.apply(remove_weight_norm)
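
As a quick sanity check of `receptive_field_size`, a hedged worked example (the layer counts are illustrative, not a recommended configuration; the import assumes the Amphion root and the module's siblings are available):

```python
from models.vocoders.autoregressive.wavenet.wavenet import receptive_field_size

# 30 layers in 3 dilation cycles with kernel size 3: each cycle has dilations
# 1, 2, 4, ..., 512, which sum to 1023, so sum(dilations) = 3 * 1023 = 3069
# and the receptive field is (3 - 1) * 3069 + 1 = 6139 samples.
assert receptive_field_size(total_layers=30, num_cycles=3, kernel_size=3) == 6139
```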
Amphion/models/vocoders/diffusion/diffusion_vocoder_inference.py ADDED
@@ -0,0 +1,131 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import numpy as np
8
+
9
+ from tqdm import tqdm
10
+ from utils.util import pad_mels_to_tensors, pad_f0_to_tensors
11
+
12
+
13
+ def vocoder_inference(cfg, model, mels, f0s=None, device=None, fast_inference=False):
14
+ """Inference the vocoder
15
+ Args:
16
+ mels: A tensor of mel-specs with the shape (batch_size, num_mels, frames)
17
+ Returns:
18
+ audios: A tensor of audios with the shape (batch_size, seq_len)
19
+ """
20
+ model.eval()
21
+
22
+ with torch.no_grad():
23
+ training_noise_schedule = np.array(cfg.model.diffwave.noise_schedule)
24
+ inference_noise_schedule = (
25
+ np.array(cfg.model.diffwave.inference_noise_schedule)
26
+ if fast_inference
27
+ else np.array(cfg.model.diffwave.noise_schedule)
28
+ )
29
+
30
+ talpha = 1 - training_noise_schedule
31
+ talpha_cum = np.cumprod(talpha)
32
+
33
+ beta = inference_noise_schedule
34
+ alpha = 1 - beta
35
+ alpha_cum = np.cumprod(alpha)
36
+
37
+ T = []
38
+ for s in range(len(inference_noise_schedule)):
39
+ for t in range(len(training_noise_schedule) - 1):
40
+ if talpha_cum[t + 1] <= alpha_cum[s] <= talpha_cum[t]:
41
+ twiddle = (talpha_cum[t] ** 0.5 - alpha_cum[s] ** 0.5) / (
42
+ talpha_cum[t] ** 0.5 - talpha_cum[t + 1] ** 0.5
43
+ )
44
+ T.append(t + twiddle)
45
+ break
46
+ T = np.array(T, dtype=np.float32)
47
+
48
+ mels = mels.to(device)
49
+ audio = torch.randn(
50
+ mels.shape[0],
51
+ cfg.preprocess.hop_size * mels.shape[-1],
52
+ device=device,
53
+ )
54
+
55
+ for n in tqdm(range(len(alpha) - 1, -1, -1)):
56
+ c1 = 1 / alpha[n] ** 0.5
57
+ c2 = beta[n] / (1 - alpha_cum[n]) ** 0.5
58
+ audio = c1 * (
59
+ audio
60
+ - c2
61
+ * model(audio, torch.tensor([T[n]], device=audio.device), mels).squeeze(
62
+ 1
63
+ )
64
+ )
65
+ if n > 0:
66
+ noise = torch.randn_like(audio)
67
+ sigma = (
68
+ (1.0 - alpha_cum[n - 1]) / (1.0 - alpha_cum[n]) * beta[n]
69
+ ) ** 0.5
70
+ audio += sigma * noise
71
+ audio = torch.clamp(audio, -1.0, 1.0)
72
+
73
+ return audio.detach().cpu()
74
+
75
+
76
+ def synthesis_audios(cfg, model, mels, f0s=None, batch_size=None, fast_inference=False):
77
+ """Inference the vocoder
78
+ Args:
79
+ mels: A list of mel-specs
80
+ Returns:
81
+ audios: A list of audios
82
+ """
83
+ # Get the device
84
+ device = next(model.parameters()).device
85
+
86
+ audios = []
87
+
88
+ # Pad the given list into tensors
89
+ mel_batches, mel_frames = pad_mels_to_tensors(mels, batch_size)
90
+ if f0s != None:
91
+ f0_batches = pad_f0_to_tensors(f0s, batch_size)
92
+
93
+ if f0s == None:
94
+ for mel_batch, mel_frame in zip(mel_batches, mel_frames):
95
+ for i in range(mel_batch.shape[0]):
96
+ mel = mel_batch[i]
97
+ frame = mel_frame[i]
98
+ audio = vocoder_inference(
99
+ cfg,
100
+ model,
101
+ mel.unsqueeze(0),
102
+ device=device,
103
+ fast_inference=fast_inference,
104
+ ).squeeze(0)
105
+
106
+ # calculate the audio length
107
+ audio_length = frame * cfg.preprocess.hop_size
108
+ audio = audio[:audio_length]
109
+
110
+ audios.append(audio)
111
+ else:
112
+ for mel_batch, f0_batch, mel_frame in zip(mel_batches, f0_batches, mel_frames):
113
+ for i in range(mel_batch.shape[0]):
114
+ mel = mel_batch[i]
115
+ f0 = f0_batch[i]
116
+ frame = mel_frame[i]
117
+ audio = vocoder_inference(
118
+ cfg,
119
+ model,
120
+ mel.unsqueeze(0),
121
+ f0s=f0.unsqueeze(0),
122
+ device=device,
123
+ fast_inference=fast_inference,
124
+ ).squeeze(0)
125
+
126
+ # calculate the audio length
127
+ audio_length = frame * cfg.preprocess.hop_size
128
+ audio = audio[:audio_length]
129
+
130
+ audios.append(audio)
131
+ return audios
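
The fast-inference branch above maps each inference-time noise level onto a fractional training timestep by matching cumulative alpha products. A hedged numeric sketch of that alignment with made-up schedules (not values from any Amphion config):

```python
import numpy as np

training_noise_schedule = np.linspace(1e-4, 0.05, 50)          # 50 training steps
inference_noise_schedule = np.array([1e-4, 1e-3, 1e-2, 5e-2])  # 4 fast steps

talpha_cum = np.cumprod(1 - training_noise_schedule)
alpha_cum = np.cumprod(1 - inference_noise_schedule)

# For each fast step, find the training step whose cumulative alpha brackets it
# and interpolate between the two, exactly as vocoder_inference does above.
T = []
for s in range(len(inference_noise_schedule)):
    for t in range(len(training_noise_schedule) - 1):
        if talpha_cum[t + 1] <= alpha_cum[s] <= talpha_cum[t]:
            twiddle = (talpha_cum[t] ** 0.5 - alpha_cum[s] ** 0.5) / (
                talpha_cum[t] ** 0.5 - talpha_cum[t + 1] ** 0.5
            )
            T.append(t + twiddle)
            break
print(T)  # one fractional timestep per fast inference step
```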
Amphion/models/vocoders/flow/flow_vocoder_dataset.py ADDED
File without changes
Amphion/models/vocoders/flow/flow_vocoder_inference.py ADDED
File without changes
Amphion/models/vocoders/gan/discriminator/msd.py ADDED
@@ -0,0 +1,88 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import torch.nn as nn
9
+ from torch.nn import Conv1d, AvgPool1d
10
+ from torch.nn.utils import weight_norm, spectral_norm
11
+ from torch import nn
12
+ from modules.vocoder_blocks import *
13
+
14
+
15
+ LRELU_SLOPE = 0.1
16
+
17
+
18
+ class DiscriminatorS(nn.Module):
19
+ def __init__(self, use_spectral_norm=False):
20
+ super(DiscriminatorS, self).__init__()
21
+
22
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
23
+
24
+ self.convs = nn.ModuleList(
25
+ [
26
+ norm_f(Conv1d(1, 128, 15, 1, padding=7)),
27
+ norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
28
+ norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
29
+ norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
30
+ norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
31
+ norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
32
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
33
+ ]
34
+ )
35
+
36
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
37
+
38
+ def forward(self, x):
39
+ fmap = []
40
+
41
+ for l in self.convs:
42
+ x = l(x)
43
+ x = F.leaky_relu(x, LRELU_SLOPE)
44
+ fmap.append(x)
45
+
46
+ x = self.conv_post(x)
47
+ fmap.append(x)
48
+ x = torch.flatten(x, 1, -1)
49
+
50
+ return x, fmap
51
+
52
+
53
+ class MultiScaleDiscriminator(nn.Module):
54
+ def __init__(self, cfg):
55
+ super(MultiScaleDiscriminator, self).__init__()
56
+
57
+ self.cfg = cfg
58
+
59
+ self.discriminators = nn.ModuleList(
60
+ [
61
+ DiscriminatorS(use_spectral_norm=True),
62
+ DiscriminatorS(),
63
+ DiscriminatorS(),
64
+ ]
65
+ )
66
+
67
+ self.meanpools = nn.ModuleList(
68
+ [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)]
69
+ )
70
+
71
+ def forward(self, y, y_hat):
72
+ y_d_rs = []
73
+ y_d_gs = []
74
+ fmap_rs = []
75
+ fmap_gs = []
76
+
77
+ for i, d in enumerate(self.discriminators):
78
+ if i != 0:
79
+ y = self.meanpools[i - 1](y)
80
+ y_hat = self.meanpools[i - 1](y_hat)
81
+ y_d_r, fmap_r = d(y)
82
+ y_d_g, fmap_g = d(y_hat)
83
+ y_d_rs.append(y_d_r)
84
+ fmap_rs.append(fmap_r)
85
+ y_d_gs.append(y_d_g)
86
+ fmap_gs.append(fmap_g)
87
+
88
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
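
A hedged sketch of how the multi-scale discriminator is called during GAN training (waveform shapes are placeholders; `cfg` is only stored by the module, so a stub is enough here):

```python
import torch
from models.vocoders.gan.discriminator.msd import MultiScaleDiscriminator

msd = MultiScaleDiscriminator(cfg=None)   # cfg is kept on the module but unused in forward
y = torch.randn(2, 1, 8192)               # real waveforms: (B, 1, T)
y_hat = torch.randn(2, 1, 8192)           # generated waveforms

y_d_rs, y_d_gs, fmap_rs, fmap_gs = msd(y, y_hat)
# Three scales: raw audio, /2 average-pooled, /4 average-pooled.
print(len(y_d_rs), len(fmap_rs))  # 3 3
```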
Amphion/models/vocoders/gan/gan_vocoder_inference.py ADDED
@@ -0,0 +1,96 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+ from utils.util import pad_mels_to_tensors, pad_f0_to_tensors
9
+
10
+
11
+ def vocoder_inference(cfg, model, mels, f0s=None, device=None, fast_inference=False):
12
+ """Inference the vocoder
13
+ Args:
14
+ mels: A tensor of mel-specs with the shape (batch_size, num_mels, frames)
15
+ Returns:
16
+ audios: A tensor of audios with the shape (batch_size, seq_len)
17
+ """
18
+ model.eval()
19
+
20
+ with torch.no_grad():
21
+ mels = mels.to(device)
22
+ if f0s != None:
23
+ f0s = f0s.to(device)
24
+
25
+ if f0s == None and not cfg.preprocess.extract_amplitude_phase:
26
+ output = model.forward(mels)
27
+ elif cfg.preprocess.extract_amplitude_phase:
28
+ (
29
+ _,
30
+ _,
31
+ _,
32
+ _,
33
+ output,
34
+ ) = model.forward(mels)
35
+ else:
36
+ output = model.forward(mels, f0s)
37
+
38
+ return output.squeeze(1).detach().cpu()
39
+
40
+
41
+ def synthesis_audios(cfg, model, mels, f0s=None, batch_size=None, fast_inference=False):
42
+ """Inference the vocoder
43
+ Args:
44
+ mels: A list of mel-specs
45
+ Returns:
46
+ audios: A list of audios
47
+ """
48
+ # Get the device
49
+ device = next(model.parameters()).device
50
+
51
+ audios = []
52
+
53
+ # Pad the given list into tensors
54
+ mel_batches, mel_frames = pad_mels_to_tensors(mels, batch_size)
55
+ if f0s != None:
56
+ f0_batches = pad_f0_to_tensors(f0s, batch_size)
57
+
58
+ if f0s == None:
59
+ for mel_batch, mel_frame in zip(mel_batches, mel_frames):
60
+ for i in range(mel_batch.shape[0]):
61
+ mel = mel_batch[i]
62
+ frame = mel_frame[i]
63
+ audio = vocoder_inference(
64
+ cfg,
65
+ model,
66
+ mel.unsqueeze(0),
67
+ device=device,
68
+ fast_inference=fast_inference,
69
+ ).squeeze(0)
70
+
71
+ # calculate the audio length
72
+ audio_length = frame * model.cfg.preprocess.hop_size
73
+ audio = audio[:audio_length]
74
+
75
+ audios.append(audio)
76
+ else:
77
+ for mel_batch, f0_batch, mel_frame in zip(mel_batches, f0_batches, mel_frames):
78
+ for i in range(mel_batch.shape[0]):
79
+ mel = mel_batch[i]
80
+ f0 = f0_batch[i]
81
+ frame = mel_frame[i]
82
+ audio = vocoder_inference(
83
+ cfg,
84
+ model,
85
+ mel.unsqueeze(0),
86
+ f0s=f0.unsqueeze(0),
87
+ device=device,
88
+ fast_inference=fast_inference,
89
+ ).squeeze(0)
90
+
91
+ # calculate the audio length
92
+ audio_length = frame * model.cfg.preprocess.hop_size
93
+ audio = audio[:audio_length]
94
+
95
+ audios.append(audio)
96
+ return audios
Amphion/models/vocoders/vocoder_dataset.py ADDED
@@ -0,0 +1,264 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from typing import Iterable
7
+ import torch
8
+ import numpy as np
9
+ import torch.utils.data
10
+ from torch.nn.utils.rnn import pad_sequence
11
+ from utils.data_utils import *
12
+ from torch.utils.data import ConcatDataset, Dataset
13
+
14
+
15
+ class VocoderDataset(torch.utils.data.Dataset):
16
+ def __init__(self, cfg, dataset, is_valid=False):
17
+ """
18
+ Args:
19
+ cfg: config
20
+ dataset: dataset name
21
+ is_valid: whether to use train or valid dataset
22
+ """
23
+ assert isinstance(dataset, str)
24
+
25
+ processed_data_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
26
+
27
+ meta_file = cfg.preprocess.valid_file if is_valid else cfg.preprocess.train_file
28
+ self.metafile_path = os.path.join(processed_data_dir, meta_file)
29
+ self.metadata = self.get_metadata()
30
+
31
+ self.data_root = processed_data_dir
32
+ self.cfg = cfg
33
+
34
+ if cfg.preprocess.use_audio:
35
+ self.utt2audio_path = {}
36
+ for utt_info in self.metadata:
37
+ dataset = utt_info["Dataset"]
38
+ uid = utt_info["Uid"]
39
+ utt = "{}_{}".format(dataset, uid)
40
+
41
+ self.utt2audio_path[utt] = os.path.join(
42
+ cfg.preprocess.processed_dir,
43
+ dataset,
44
+ cfg.preprocess.audio_dir,
45
+ uid + ".npy",
46
+ )
47
+ elif cfg.preprocess.use_label:
48
+ self.utt2label_path = {}
49
+ for utt_info in self.metadata:
50
+ dataset = utt_info["Dataset"]
51
+ uid = utt_info["Uid"]
52
+ utt = "{}_{}".format(dataset, uid)
53
+
54
+ self.utt2label_path[utt] = os.path.join(
55
+ cfg.preprocess.processed_dir,
56
+ dataset,
57
+ cfg.preprocess.label_dir,
58
+ uid + ".npy",
59
+ )
60
+ elif cfg.preprocess.use_one_hot:
61
+ self.utt2one_hot_path = {}
62
+ for utt_info in self.metadata:
63
+ dataset = utt_info["Dataset"]
64
+ uid = utt_info["Uid"]
65
+ utt = "{}_{}".format(dataset, uid)
66
+
67
+ self.utt2one_hot_path[utt] = os.path.join(
68
+ cfg.preprocess.processed_dir,
69
+ dataset,
70
+ cfg.preprocess.one_hot_dir,
71
+ uid + ".npy",
72
+ )
73
+
74
+ if cfg.preprocess.use_mel:
75
+ self.utt2mel_path = {}
76
+ for utt_info in self.metadata:
77
+ dataset = utt_info["Dataset"]
78
+ uid = utt_info["Uid"]
79
+ utt = "{}_{}".format(dataset, uid)
80
+
81
+ self.utt2mel_path[utt] = os.path.join(
82
+ cfg.preprocess.processed_dir,
83
+ dataset,
84
+ cfg.preprocess.mel_dir,
85
+ uid + ".npy",
86
+ )
87
+
88
+ if cfg.preprocess.use_frame_pitch:
89
+ self.utt2frame_pitch_path = {}
90
+ for utt_info in self.metadata:
91
+ dataset = utt_info["Dataset"]
92
+ uid = utt_info["Uid"]
93
+ utt = "{}_{}".format(dataset, uid)
94
+
95
+ self.utt2frame_pitch_path[utt] = os.path.join(
96
+ cfg.preprocess.processed_dir,
97
+ dataset,
98
+ cfg.preprocess.pitch_dir,
99
+ uid + ".npy",
100
+ )
101
+
102
+ if cfg.preprocess.use_uv:
103
+ self.utt2uv_path = {}
104
+ for utt_info in self.metadata:
105
+ dataset = utt_info["Dataset"]
106
+ uid = utt_info["Uid"]
107
+ utt = "{}_{}".format(dataset, uid)
108
+ self.utt2uv_path[utt] = os.path.join(
109
+ cfg.preprocess.processed_dir,
110
+ dataset,
111
+ cfg.preprocess.uv_dir,
112
+ uid + ".npy",
113
+ )
114
+
115
+ if cfg.preprocess.use_amplitude_phase:
116
+ self.utt2logamp_path = {}
117
+ self.utt2pha_path = {}
118
+ self.utt2rea_path = {}
119
+ self.utt2imag_path = {}
120
+ for utt_info in self.metadata:
121
+ dataset = utt_info["Dataset"]
122
+ uid = utt_info["Uid"]
123
+ utt = "{}_{}".format(dataset, uid)
124
+ self.utt2logamp_path[utt] = os.path.join(
125
+ cfg.preprocess.processed_dir,
126
+ dataset,
127
+ cfg.preprocess.log_amplitude_dir,
128
+ uid + ".npy",
129
+ )
130
+ self.utt2pha_path[utt] = os.path.join(
131
+ cfg.preprocess.processed_dir,
132
+ dataset,
133
+ cfg.preprocess.phase_dir,
134
+ uid + ".npy",
135
+ )
136
+ self.utt2rea_path[utt] = os.path.join(
137
+ cfg.preprocess.processed_dir,
138
+ dataset,
139
+ cfg.preprocess.real_dir,
140
+ uid + ".npy",
141
+ )
142
+ self.utt2imag_path[utt] = os.path.join(
143
+ cfg.preprocess.processed_dir,
144
+ dataset,
145
+ cfg.preprocess.imaginary_dir,
146
+ uid + ".npy",
147
+ )
148
+
149
+ def __getitem__(self, index):
150
+ utt_info = self.metadata[index]
151
+
152
+ dataset = utt_info["Dataset"]
153
+ uid = utt_info["Uid"]
154
+ utt = "{}_{}".format(dataset, uid)
155
+
156
+ single_feature = dict()
157
+
158
+ if self.cfg.preprocess.use_mel:
159
+ mel = np.load(self.utt2mel_path[utt])
160
+ assert mel.shape[0] == self.cfg.preprocess.n_mel # [n_mels, T]
161
+
162
+ if "target_len" not in single_feature.keys():
163
+ single_feature["target_len"] = mel.shape[1]
164
+
165
+ single_feature["mel"] = mel
166
+
167
+ if self.cfg.preprocess.use_frame_pitch:
168
+ frame_pitch = np.load(self.utt2frame_pitch_path[utt])
169
+
170
+ if "target_len" not in single_feature.keys():
171
+ single_feature["target_len"] = len(frame_pitch)
172
+
173
+ aligned_frame_pitch = align_length(
174
+ frame_pitch, single_feature["target_len"]
175
+ )
176
+
177
+ single_feature["frame_pitch"] = aligned_frame_pitch
178
+
179
+ if self.cfg.preprocess.use_audio:
180
+ audio = np.load(self.utt2audio_path[utt])
181
+
182
+ single_feature["audio"] = audio
183
+
184
+ return single_feature
185
+
186
+ def get_metadata(self):
187
+ with open(self.metafile_path, "r", encoding="utf-8") as f:
188
+ metadata = json.load(f)
189
+
190
+ return metadata
191
+
192
+ def get_dataset_name(self):
193
+ return self.metadata[0]["Dataset"]
194
+
195
+ def __len__(self):
196
+ return len(self.metadata)
197
+
198
+
199
+ class VocoderConcatDataset(ConcatDataset):
200
+ def __init__(self, datasets: Iterable[Dataset], full_audio_inference=False):
201
+ """Concatenate a series of datasets with their random inference audio merged."""
202
+ super().__init__(datasets)
203
+
204
+ self.cfg = self.datasets[0].cfg
205
+
206
+ self.metadata = []
207
+
208
+ # Merge metadata
209
+ for dataset in self.datasets:
210
+ self.metadata += dataset.metadata
211
+
212
+ # Merge random inference features
213
+ if full_audio_inference:
214
+ self.eval_audios = []
215
+ self.eval_dataset_names = []
216
+ if self.cfg.preprocess.use_mel:
217
+ self.eval_mels = []
218
+ if self.cfg.preprocess.use_frame_pitch:
219
+ self.eval_pitchs = []
220
+ for dataset in self.datasets:
221
+ self.eval_audios.append(dataset.eval_audio)
222
+ self.eval_dataset_names.append(dataset.get_dataset_name())
223
+ if self.cfg.preprocess.use_mel:
224
+ self.eval_mels.append(dataset.eval_mel)
225
+ if self.cfg.preprocess.use_frame_pitch:
226
+ self.eval_pitchs.append(dataset.eval_pitch)
227
+
228
+
229
+ class VocoderCollator(object):
230
+ """Zero-pads model inputs and targets based on number of frames per step"""
231
+
232
+ def __init__(self, cfg):
233
+ self.cfg = cfg
234
+
235
+ def __call__(self, batch):
236
+ packed_batch_features = dict()
237
+
238
+ # mel: [b, n_mels, frame]
239
+ # frame_pitch: [b, frame]
240
+ # audios: [b, frame * hop_size]
241
+
242
+ for key in batch[0].keys():
243
+ if key == "target_len":
244
+ packed_batch_features["target_len"] = torch.LongTensor(
245
+ [b["target_len"] for b in batch]
246
+ )
247
+ masks = [
248
+ torch.ones((b["target_len"], 1), dtype=torch.long) for b in batch
249
+ ]
250
+ packed_batch_features["mask"] = pad_sequence(
251
+ masks, batch_first=True, padding_value=0
252
+ )
253
+ elif key == "mel":
254
+ values = [torch.from_numpy(b[key]).T for b in batch]
255
+ packed_batch_features[key] = pad_sequence(
256
+ values, batch_first=True, padding_value=0
257
+ )
258
+ else:
259
+ values = [torch.from_numpy(b[key]) for b in batch]
260
+ packed_batch_features[key] = pad_sequence(
261
+ values, batch_first=True, padding_value=0
262
+ )
263
+
264
+ return packed_batch_features
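
A hedged sketch of what `VocoderCollator` returns for a toy batch (feature sizes are made up; the collator never reads `cfg`, so a stub works):

```python
import numpy as np
from models.vocoders.vocoder_dataset import VocoderCollator

collator = VocoderCollator(cfg=None)
batch = [
    {"target_len": 100, "mel": np.random.randn(80, 100).astype(np.float32),
     "audio": np.random.randn(25600).astype(np.float32)},
    {"target_len": 80, "mel": np.random.randn(80, 80).astype(np.float32),
     "audio": np.random.randn(20480).astype(np.float32)},
]
packed = collator(batch)
print(packed["mel"].shape)    # torch.Size([2, 100, 80])  -- mel is transposed, then padded
print(packed["mask"].shape)   # torch.Size([2, 100, 1])
print(packed["audio"].shape)  # torch.Size([2, 25600])
```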
Amphion/models/vocoders/vocoder_sampler.py ADDED
@@ -0,0 +1,126 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ import random
8
+
9
+ from torch.utils.data import ConcatDataset, Dataset
10
+ from torch.utils.data.sampler import (
11
+ BatchSampler,
12
+ RandomSampler,
13
+ Sampler,
14
+ SequentialSampler,
15
+ )
16
+
17
+
18
+ class ScheduledSampler(Sampler):
19
+ """A sampler that samples data from a given concat-dataset.
20
+
21
+ Args:
22
+ concat_dataset (ConcatDataset): a concatenated dataset consisting of all datasets
23
+ batch_size (int): batch size
24
+ holistic_shuffle (bool): whether to shuffle the whole dataset or not
25
+ logger (logging.Logger): logger to print warning message
26
+
27
+ Usage:
28
+ For cfg.train.batch_size = 3, cfg.train.holistic_shuffle = False, cfg.train.drop_last = True:
29
+ >>> list(ScheduledSampler(ConcatDataset([[0, 1, 2], [3, 4, 5], [6, 7, 8]])))
30
+ [3, 4, 5, 0, 1, 2, 6, 7, 8]
31
+ """
32
+
33
+ def __init__(
34
+ self, concat_dataset, batch_size, holistic_shuffle, logger=None, type="train"
35
+ ):
36
+ if not isinstance(concat_dataset, ConcatDataset):
37
+ raise ValueError(
38
+ "concat_dataset must be an instance of ConcatDataset, but got {}".format(
39
+ type(concat_dataset)
40
+ )
41
+ )
42
+ if not isinstance(batch_size, int):
43
+ raise ValueError(
44
+ "batch_size must be an integer, but got {}".format(type(batch_size))
45
+ )
46
+ if not isinstance(holistic_shuffle, bool):
47
+ raise ValueError(
48
+ "holistic_shuffle must be a boolean, but got {}".format(
49
+ type(holistic_shuffle)
50
+ )
51
+ )
52
+
53
+ self.concat_dataset = concat_dataset
54
+ self.batch_size = batch_size
55
+ self.holistic_shuffle = holistic_shuffle
56
+
57
+ affected_dataset_name = []
58
+ affected_dataset_len = []
59
+ for dataset in concat_dataset.datasets:
60
+ dataset_len = len(dataset)
61
+ dataset_name = dataset.get_dataset_name()
62
+ if dataset_len < batch_size:
63
+ affected_dataset_name.append(dataset_name)
64
+ affected_dataset_len.append(dataset_len)
65
+
66
+ self.type = type
67
+ for dataset_name, dataset_len in zip(
68
+ affected_dataset_name, affected_dataset_len
69
+ ):
70
+ if not type == "valid":
71
+ logger.warning(
72
+ "The {} dataset {} has a length of {}, which is smaller than the batch size {}. This may cause unexpected behavior.".format(
73
+ type, dataset_name, dataset_len, batch_size
74
+ )
75
+ )
76
+
77
+ def __len__(self):
78
+ # the number of batches with drop last
79
+ num_of_batches = sum(
80
+ [
81
+ math.floor(len(dataset) / self.batch_size)
82
+ for dataset in self.concat_dataset.datasets
83
+ ]
84
+ )
85
+ return num_of_batches * self.batch_size
86
+
87
+ def __iter__(self):
88
+ iters = []
89
+ for dataset in self.concat_dataset.datasets:
90
+ iters.append(
91
+ SequentialSampler(dataset).__iter__()
92
+ if self.holistic_shuffle
93
+ else RandomSampler(dataset).__iter__()
94
+ )
95
+ init_indices = [0] + self.concat_dataset.cumulative_sizes[:-1]
96
+ output_batches = []
97
+ for dataset_idx in range(len(self.concat_dataset.datasets)):
98
+ cur_batch = []
99
+ for idx in iters[dataset_idx]:
100
+ cur_batch.append(idx + init_indices[dataset_idx])
101
+ if len(cur_batch) == self.batch_size:
102
+ output_batches.append(cur_batch)
103
+ cur_batch = []
104
+ if self.type == "valid" and len(cur_batch) > 0:
105
+ output_batches.append(cur_batch)
106
+ cur_batch = []
107
+ # force drop last in training
108
+ random.shuffle(output_batches)
109
+ output_indices = [item for sublist in output_batches for item in sublist]
110
+ return iter(output_indices)
111
+
112
+
113
+ def build_samplers(concat_dataset: Dataset, cfg, logger, type):
114
+ sampler = ScheduledSampler(
115
+ concat_dataset,
116
+ cfg.train.batch_size,
117
+ cfg.train.sampler.holistic_shuffle,
118
+ logger,
119
+ type,
120
+ )
121
+ batch_sampler = BatchSampler(
122
+ sampler,
123
+ cfg.train.batch_size,
124
+ cfg.train.sampler.drop_last if not type == "valid" else False,
125
+ )
126
+ return sampler, batch_sampler
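
A hedged end-to-end sketch of `build_samplers` with toy datasets (`ToyDataset` is a stand-in for `VocoderDataset`; only the config fields actually read by the sampler are provided):

```python
import logging
from types import SimpleNamespace
from torch.utils.data import ConcatDataset, DataLoader, Dataset
from models.vocoders.vocoder_sampler import build_samplers

class ToyDataset(Dataset):
    """Stand-in for a VocoderDataset: integer items plus a dataset name."""
    def __init__(self, name, n):
        self.name, self.n = name, n
    def __len__(self):
        return self.n
    def __getitem__(self, idx):
        return idx
    def get_dataset_name(self):
        return self.name

cfg = SimpleNamespace(
    train=SimpleNamespace(
        batch_size=3,
        sampler=SimpleNamespace(holistic_shuffle=False, drop_last=True),
    )
)
concat = ConcatDataset([ToyDataset("a", 7), ToyDataset("b", 5)])
sampler, batch_sampler = build_samplers(concat, cfg, logging.getLogger(__name__), "train")
loader = DataLoader(concat, batch_sampler=batch_sampler)
print([batch.tolist() for batch in loader])  # each batch stays within one toy dataset
```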
Amphion/modules/activation_functions/snake.py ADDED
@@ -0,0 +1,122 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ from torch import nn, pow, sin
8
+ from torch.nn import Parameter
9
+
10
+
11
+ class Snake(nn.Module):
12
+ r"""Implementation of a sine-based periodic activation function.
13
+ Alpha is initialized to 1 by default; higher values mean a higher frequency.
14
+ It will be trained along with the rest of your model.
15
+
16
+ Args:
17
+ in_features: shape of the input
18
+ alpha: trainable parameter
19
+
20
+ Shape:
21
+ - Input: (B, C, T)
22
+ - Output: (B, C, T), same shape as the input
23
+
24
+ References:
25
+ This activation function is from this paper by Liu Ziyin, Tilman Hartwig,
26
+ Masahito Ueda: https://arxiv.org/abs/2006.08195
27
+
28
+ Examples:
29
+ >>> a1 = Snake(256)
30
+ >>> x = torch.randn(256)
31
+ >>> x = a1(x)
32
+ """
33
+
34
+ def __init__(
35
+ self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
36
+ ):
37
+ super(Snake, self).__init__()
38
+ self.in_features = in_features
39
+
40
+ # initialize alpha
41
+ self.alpha_logscale = alpha_logscale
42
+ if self.alpha_logscale: # log scale alphas initialized to zeros
43
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
44
+ else: # linear scale alphas initialized to ones
45
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
46
+
47
+ self.alpha.requires_grad = alpha_trainable
48
+
49
+ self.no_div_by_zero = 0.000000001
50
+
51
+ def forward(self, x):
52
+ r"""Forward pass of the function. Applies the function to the input elementwise.
53
+ Snake ∶= x + 1/a * sin^2 (ax)
54
+ """
55
+
56
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
57
+ if self.alpha_logscale:
58
+ alpha = torch.exp(alpha)
59
+ x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
60
+
61
+ return x
62
+
63
+
64
+ class SnakeBeta(nn.Module):
65
+ r"""A modified Snake function which uses separate parameters for the magnitude
66
+ of the periodic components. Alpha is initialized to 1 by default,
67
+ higher values mean a higher frequency. Beta is initialized to 1 by default;
68
+ higher values mean a higher magnitude. Both will be trained along with the
69
+ rest of your model.
70
+
71
+ Args:
72
+ in_features: shape of the input
73
+ alpha: trainable parameter that controls frequency
74
+ beta: trainable parameter that controls magnitude
75
+
76
+ Shape:
77
+ - Input: (B, C, T)
78
+ - Output: (B, C, T), same shape as the input
79
+
80
+ References:
81
+ This activation function is a modified version based on this paper by Liu Ziyin,
82
+ Tilman Hartwig, Masahito Ueda: https://arxiv.org/abs/2006.08195
83
+
84
+ Examples:
85
+ >>> a1 = SnakeBeta(256)
86
+ >>> x = torch.randn(2, 256, 128)  # (B, C, T)
87
+ >>> x = a1(x)
88
+ """
89
+
90
+ def __init__(
91
+ self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
92
+ ):
93
+ super(SnakeBeta, self).__init__()
94
+ self.in_features = in_features
95
+
96
+ # initialize alpha
97
+ self.alpha_logscale = alpha_logscale
98
+ if self.alpha_logscale: # log scale alphas initialized to zeros
99
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
100
+ self.beta = Parameter(torch.zeros(in_features) * alpha)
101
+ else: # linear scale alphas initialized to ones
102
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
103
+ self.beta = Parameter(torch.ones(in_features) * alpha)
104
+
105
+ self.alpha.requires_grad = alpha_trainable
106
+ self.beta.requires_grad = alpha_trainable
107
+
108
+ self.no_div_by_zero = 0.000000001
109
+
110
+ def forward(self, x):
111
+ r"""Forward pass of the function. Applies the function to the input elementwise.
112
+ SnakeBeta ∶= x + 1/b * sin^2 (xa)
113
+ """
114
+
115
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
116
+ beta = self.beta.unsqueeze(0).unsqueeze(-1)
117
+ if self.alpha_logscale:
118
+ alpha = torch.exp(alpha)
119
+ beta = torch.exp(beta)
120
+ x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
121
+
122
+ return x
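
A short usage sketch for the two activations above. The surrounding convolutions are illustrative (BigVGAN-style vocoders interleave snake activations with 1-D convolutions); only the `(B, C, T)` layout and the per-channel `alpha`/`beta` parameters come from the code itself.

```python
import torch
import torch.nn as nn

# Channel-wise periodic activation inside a small 1-D conv stack.
block = nn.Sequential(
    nn.Conv1d(80, 256, kernel_size=3, padding=1),
    SnakeBeta(256, alpha_logscale=True),   # one alpha/beta per channel
    nn.Conv1d(256, 256, kernel_size=3, padding=1),
    Snake(256),
)
mel = torch.randn(2, 80, 200)   # (B, C, T)
out = block(mel)                # (2, 256, 200)
```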
Amphion/modules/diffusion/bidilconv/bidilated_conv.py ADDED
@@ -0,0 +1,102 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+
8
+ import torch.nn as nn
9
+
10
+ from modules.general.utils import Conv1d, zero_module
11
+ from .residual_block import ResidualBlock
12
+
13
+
14
+ class BiDilConv(nn.Module):
15
+ r"""Dilated CNN architecture with residual connections, default diffusion decoder.
16
+
17
+ Args:
18
+ input_channel: The number of input channels.
19
+ base_channel: The number of base channels.
20
+ n_res_block: The number of residual blocks.
21
+ conv_kernel_size: The kernel size of convolutional layers.
22
+ dilation_cycle_length: The cycle length of dilation.
23
+ conditioner_size: The size of conditioner.
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ input_channel,
29
+ base_channel,
30
+ n_res_block,
31
+ conv_kernel_size,
32
+ dilation_cycle_length,
33
+ conditioner_size,
34
+ output_channel: int = -1,
35
+ ):
36
+ super().__init__()
37
+
38
+ self.input_channel = input_channel
39
+ self.base_channel = base_channel
40
+ self.n_res_block = n_res_block
41
+ self.conv_kernel_size = conv_kernel_size
42
+ self.dilation_cycle_length = dilation_cycle_length
43
+ self.conditioner_size = conditioner_size
44
+ self.output_channel = output_channel if output_channel > 0 else input_channel
45
+
46
+ self.input = nn.Sequential(
47
+ Conv1d(
48
+ input_channel,
49
+ base_channel,
50
+ 1,
51
+ ),
52
+ nn.ReLU(),
53
+ )
54
+
55
+ self.residual_blocks = nn.ModuleList(
56
+ [
57
+ ResidualBlock(
58
+ channels=base_channel,
59
+ kernel_size=conv_kernel_size,
60
+ dilation=2 ** (i % dilation_cycle_length),
61
+ d_context=conditioner_size,
62
+ )
63
+ for i in range(n_res_block)
64
+ ]
65
+ )
66
+
67
+ self.out_proj = nn.Sequential(
68
+ Conv1d(
69
+ base_channel,
70
+ base_channel,
71
+ 1,
72
+ ),
73
+ nn.ReLU(),
74
+ zero_module(
75
+ Conv1d(
76
+ base_channel,
77
+ self.output_channel,
78
+ 1,
79
+ ),
80
+ ),
81
+ )
82
+
83
+ def forward(self, x, y, context=None):
84
+ """
85
+ Args:
86
+ x: Noisy mel-spectrogram [B x ``n_mel`` x L]
87
+ y: FILM embeddings with the shape of (B, ``base_channel``)
88
+ context: Context with the shape of [B x ``d_context`` x L], default to None.
89
+ """
90
+
91
+ h = self.input(x)
92
+
93
+ skip = None
94
+ for i in range(self.n_res_block):
95
+ h, skip_connection = self.residual_blocks[i](h, y, context)
96
+ skip = skip_connection if skip is None else skip_connection + skip
97
+
98
+ out = skip / math.sqrt(self.n_res_block)
99
+
100
+ out = self.out_proj(out)
101
+
102
+ return out
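
A minimal forward-pass sketch for `BiDilConv`. The tensor shapes follow the `forward` docstring; the concrete sizes, and the assumption that `ResidualBlock` (imported above but not shown in this diff) accepts `(h, y, context)` exactly as used here, are illustrative.

```python
import torch

decoder = BiDilConv(
    input_channel=80,         # mel bins
    base_channel=256,
    n_res_block=20,
    conv_kernel_size=3,
    dilation_cycle_length=4,  # dilations cycle through 1, 2, 4, 8
    conditioner_size=512,
)
x = torch.randn(4, 80, 200)          # noisy mel, (B, n_mel, L)
y = torch.randn(4, 256)              # FiLM embedding, (B, base_channel)
context = torch.randn(4, 512, 200)   # conditioner, (B, d_context, L)
eps = decoder(x, y, context)         # (B, 80, L) output, e.g. predicted noise
```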
Amphion/modules/diffusion/karras/sample.py ADDED
@@ -0,0 +1,185 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from abc import ABC, abstractmethod
7
+
8
+ import numpy as np
9
+ import torch as th
10
+ from scipy.stats import norm
11
+ import torch.distributed as dist
12
+
13
+
14
+ def create_named_schedule_sampler(name, diffusion):
15
+ """
16
+ Create a ScheduleSampler from a library of pre-defined samplers.
17
+
18
+ :param name: the name of the sampler.
19
+ :param diffusion: the diffusion object to sample for.
20
+ """
21
+ if name == "uniform":
22
+ return UniformSampler(diffusion)
23
+ elif name == "loss-second-moment":
24
+ return LossSecondMomentResampler(diffusion)
25
+ elif name == "lognormal":
26
+ return LogNormalSampler()
27
+ else:
28
+ raise NotImplementedError(f"unknown schedule sampler: {name}")
29
+
30
+
31
+ class ScheduleSampler(ABC):
32
+ """
33
+ A distribution over timesteps in the diffusion process, intended to reduce
34
+ variance of the objective.
35
+
36
+ By default, samplers perform unbiased importance sampling, in which the
37
+ objective's mean is unchanged.
38
+ However, subclasses may override sample() to change how the resampled
39
+ terms are reweighted, allowing for actual changes in the objective.
40
+ """
41
+
42
+ @abstractmethod
43
+ def weights(self):
44
+ """
45
+ Get a numpy array of weights, one per diffusion step.
46
+
47
+ The weights needn't be normalized, but must be positive.
48
+ """
49
+
50
+ def sample(self, batch_size, device):
51
+ """
52
+ Importance-sample timesteps for a batch.
53
+
54
+ :param batch_size: the number of timesteps.
55
+ :param device: the torch device to save to.
56
+ :return: a tuple (timesteps, weights):
57
+ - timesteps: a tensor of timestep indices.
58
+ - weights: a tensor of weights to scale the resulting losses.
59
+ """
60
+ w = self.weights()
61
+ p = w / np.sum(w)
62
+ indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
63
+ indices = th.from_numpy(indices_np).long().to(device)
64
+ weights_np = 1 / (len(p) * p[indices_np])
65
+ weights = th.from_numpy(weights_np).float().to(device)
66
+ return indices, weights
67
+
68
+
69
+ class UniformSampler(ScheduleSampler):
70
+ def __init__(self, diffusion):
71
+ self.diffusion = diffusion
72
+ self._weights = np.ones([diffusion.num_timesteps])
73
+
74
+ def weights(self):
75
+ return self._weights
76
+
77
+
78
+ class LossAwareSampler(ScheduleSampler):
79
+ def update_with_local_losses(self, local_ts, local_losses):
80
+ """
81
+ Update the reweighting using losses from a model.
82
+
83
+ Call this method from each rank with a batch of timesteps and the
84
+ corresponding losses for each of those timesteps.
85
+ This method will perform synchronization to make sure all of the ranks
86
+ maintain the exact same reweighting.
87
+
88
+ :param local_ts: an integer Tensor of timesteps.
89
+ :param local_losses: a 1D Tensor of losses.
90
+ """
91
+ batch_sizes = [
92
+ th.tensor([0], dtype=th.int32, device=local_ts.device)
93
+ for _ in range(dist.get_world_size())
94
+ ]
95
+ dist.all_gather(
96
+ batch_sizes,
97
+ th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
98
+ )
99
+
100
+ # Pad all_gather batches to be the maximum batch size.
101
+ batch_sizes = [x.item() for x in batch_sizes]
102
+ max_bs = max(batch_sizes)
103
+
104
+ timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]
105
+ loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]
106
+ dist.all_gather(timestep_batches, local_ts)
107
+ dist.all_gather(loss_batches, local_losses)
108
+ timesteps = [
109
+ x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
110
+ ]
111
+ losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
112
+ self.update_with_all_losses(timesteps, losses)
113
+
114
+ @abstractmethod
115
+ def update_with_all_losses(self, ts, losses):
116
+ """
117
+ Update the reweighting using losses from a model.
118
+
119
+ Sub-classes should override this method to update the reweighting
120
+ using losses from the model.
121
+
122
+ This method directly updates the reweighting without synchronizing
123
+ between workers. It is called by update_with_local_losses from all
124
+ ranks with identical arguments. Thus, it should have deterministic
125
+ behavior to maintain state across workers.
126
+
127
+ :param ts: a list of int timesteps.
128
+ :param losses: a list of float losses, one per timestep.
129
+ """
130
+
131
+
132
+ class LossSecondMomentResampler(LossAwareSampler):
133
+ def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
134
+ self.diffusion = diffusion
135
+ self.history_per_term = history_per_term
136
+ self.uniform_prob = uniform_prob
137
+ self._loss_history = np.zeros(
138
+ [diffusion.num_timesteps, history_per_term], dtype=np.float64
139
+ )
140
+ self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64)  # np.int was removed in NumPy 1.24
141
+
142
+ def weights(self):
143
+ if not self._warmed_up():
144
+ return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
145
+ weights = np.sqrt(np.mean(self._loss_history**2, axis=-1))
146
+ weights /= np.sum(weights)
147
+ weights *= 1 - self.uniform_prob
148
+ weights += self.uniform_prob / len(weights)
149
+ return weights
150
+
151
+ def update_with_all_losses(self, ts, losses):
152
+ for t, loss in zip(ts, losses):
153
+ if self._loss_counts[t] == self.history_per_term:
154
+ # Shift out the oldest loss term.
155
+ self._loss_history[t, :-1] = self._loss_history[t, 1:]
156
+ self._loss_history[t, -1] = loss
157
+ else:
158
+ self._loss_history[t, self._loss_counts[t]] = loss
159
+ self._loss_counts[t] += 1
160
+
161
+ def _warmed_up(self):
162
+ return (self._loss_counts == self.history_per_term).all()
163
+
164
+
165
+ class LogNormalSampler:
166
+ def __init__(self, p_mean=-1.2, p_std=1.2, even=False):
167
+ self.p_mean = p_mean
168
+ self.p_std = p_std
169
+ self.even = even
170
+ if self.even:
171
+ self.inv_cdf = lambda x: norm.ppf(x, loc=p_mean, scale=p_std)
172
+ self.rank, self.size = dist.get_rank(), dist.get_world_size()
173
+
174
+ def sample(self, bs, device):
175
+ if self.even:
176
+ # buckets = [1/G]
177
+ start_i, end_i = self.rank * bs, (self.rank + 1) * bs
178
+ global_batch_size = self.size * bs
179
+ locs = (th.arange(start_i, end_i) + th.rand(bs)) / global_batch_size
180
+ log_sigmas = th.tensor(self.inv_cdf(locs), dtype=th.float32, device=device)
181
+ else:
182
+ log_sigmas = self.p_mean + self.p_std * th.randn(bs, device=device)
183
+ sigmas = th.exp(log_sigmas)
184
+ weights = th.ones_like(sigmas)
185
+ return sigmas, weights
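
A small runnable sketch of the sampling contract. `UniformSampler` only reads `num_timesteps` from the diffusion object, so a stand-in namespace is enough here; in real training the returned `weights` rescale the per-example losses, and a `LossAwareSampler` would additionally be updated with those losses.

```python
import torch
from types import SimpleNamespace

diffusion = SimpleNamespace(num_timesteps=1000)   # stand-in; only num_timesteps is read
schedule_sampler = create_named_schedule_sampler("uniform", diffusion)

t, weights = schedule_sampler.sample(batch_size=16, device=torch.device("cpu"))
# t: LongTensor of 16 timestep indices in [0, 1000); weights: all ones for the
# uniform sampler. In a training step the weights rescale per-example losses:
#   loss = (per_example_loss * weights).mean()
```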
Amphion/modules/diffusion/unet/attention.py ADDED
@@ -0,0 +1,241 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from modules.general.utils import Conv1d, normalization, zero_module
11
+ from .basic import UNetBlock
12
+
13
+
14
+ class AttentionBlock(UNetBlock):
15
+ r"""A spatial transformer encoder block that allows spatial positions to attend
16
+ to each other. Reference from `latent diffusion repo
17
+ <https://github.com/Stability-AI/generative-models/blob/main/sgm/modules/attention.py#L531>`_.
18
+
19
+ Args:
20
+ channels: Number of channels in the input.
21
+ num_head_channels: Number of channels per attention head.
22
+ num_heads: Number of attention heads. Overrides ``num_head_channels`` if set.
23
+ encoder_channels: Number of channels in the encoder output for cross-attention.
24
+ If ``None``, then self-attention is performed.
25
+ use_self_attention: Whether to use self-attention before cross-attention, only applicable if encoder_channels is set.
26
+ dims: Number of spatial dimensions, i.e. 1 for temporal signals, 2 for images.
27
+ h_dim: The dimension of the height, would be applied if ``dims`` is 2.
28
+ encoder_hdim: The dimension of the height of the encoder output, would be applied if ``dims`` is 2.
29
+ p_dropout: Dropout probability.
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ channels: int,
35
+ num_head_channels: int = 32,
36
+ num_heads: int = -1,
37
+ encoder_channels: int = None,
38
+ use_self_attention: bool = False,
39
+ dims: int = 1,
40
+ h_dim: int = 100,
41
+ encoder_hdim: int = 384,
42
+ p_dropout: float = 0.0,
43
+ ):
44
+ super().__init__()
45
+
46
+ self.channels = channels
47
+ self.p_dropout = p_dropout
48
+ self.dims = dims
49
+
50
+ if dims == 1:
51
+ self.channels = channels
52
+ elif dims == 2:
53
+ # We consider the channel as product of channel and height, i.e. C x H
54
+ # This is because we want to apply attention on the audio signal, which is 1D
55
+ self.channels = channels * h_dim
56
+ else:
57
+ raise ValueError(f"invalid number of dimensions: {dims}")
58
+
59
+ if num_head_channels == -1:
60
+ assert (
61
+ self.channels % num_heads == 0
62
+ ), f"q,k,v channels {self.channels} is not divisible by num_heads {num_heads}"
63
+ self.num_heads = num_heads
64
+ self.num_head_channels = self.channels // num_heads
65
+ else:
66
+ assert (
67
+ self.channels % num_head_channels == 0
68
+ ), f"q,k,v channels {self.channels} is not divisible by num_head_channels {num_head_channels}"
69
+ self.num_heads = self.channels // num_head_channels
70
+ self.num_head_channels = num_head_channels
71
+
72
+ if encoder_channels is not None:
73
+ self.use_self_attention = use_self_attention
74
+
75
+ if dims == 1:
76
+ self.encoder_channels = encoder_channels
77
+ elif dims == 2:
78
+ self.encoder_channels = encoder_channels * encoder_hdim
79
+ else:
80
+ raise ValueError(f"invalid number of dimensions: {dims}")
81
+
82
+ if use_self_attention:
83
+ self.self_attention = BasicAttentionBlock(
84
+ self.channels,
85
+ self.num_head_channels,
86
+ self.num_heads,
87
+ p_dropout=self.p_dropout,
88
+ )
89
+ self.cross_attention = BasicAttentionBlock(
90
+ self.channels,
91
+ self.num_head_channels,
92
+ self.num_heads,
93
+ self.encoder_channels,
94
+ p_dropout=self.p_dropout,
95
+ )
96
+ else:
97
+ self.encoder_channels = None
98
+ self.self_attention = BasicAttentionBlock(
99
+ self.channels,
100
+ self.num_head_channels,
101
+ self.num_heads,
102
+ p_dropout=self.p_dropout,
103
+ )
104
+
105
+ def forward(self, x: torch.Tensor, encoder_output: torch.Tensor = None):
106
+ r"""
107
+ Args:
108
+ x: input tensor with shape [B x ``channels`` x ...]
109
+ encoder_output: feature tensor with shape [B x ``encoder_channels`` x ...], if ``None``, then self-attention is performed.
110
+
111
+ Returns:
112
+ output tensor with shape [B x ``channels`` x ...]
113
+ """
114
+ shape = x.size()
115
+ x = x.reshape(shape[0], self.channels, -1).contiguous()
116
+
117
+ if self.encoder_channels is None:
118
+ assert (
119
+ encoder_output is None
120
+ ), "encoder_output must be None for self-attention."
121
+ h = self.self_attention(x)
122
+
123
+ else:
124
+ assert (
125
+ encoder_output is not None
126
+ ), "encoder_output must be given for cross-attention."
127
+ encoder_output = encoder_output.reshape(
128
+ shape[0], self.encoder_channels, -1
129
+ ).contiguous()
130
+
131
+ if self.use_self_attention:
132
+ x = self.self_attention(x)
133
+ h = self.cross_attention(x, encoder_output)
134
+
135
+ return h.reshape(*shape).contiguous()
136
+
137
+
138
+ class BasicAttentionBlock(nn.Module):
139
+ def __init__(
140
+ self,
141
+ channels: int,
142
+ num_head_channels: int = 32,
143
+ num_heads: int = -1,
144
+ context_channels: int = None,
145
+ p_dropout: float = 0.0,
146
+ ):
147
+ super().__init__()
148
+
149
+ self.channels = channels
150
+ self.p_dropout = p_dropout
151
+ self.context_channels = context_channels
152
+
153
+ if num_head_channels == -1:
154
+ assert (
155
+ self.channels % num_heads == 0
156
+ ), f"q,k,v channels {self.channels} is not divisible by num_heads {num_heads}"
157
+ self.num_heads = num_heads
158
+ self.num_head_channels = self.channels // num_heads
159
+ else:
160
+ assert (
161
+ self.channels % num_head_channels == 0
162
+ ), f"q,k,v channels {self.channels} is not divisible by num_head_channels {num_head_channels}"
163
+ self.num_heads = self.channels // num_head_channels
164
+ self.num_head_channels = num_head_channels
165
+
166
+ if context_channels is not None:
167
+ self.to_q = nn.Sequential(
168
+ normalization(self.channels),
169
+ Conv1d(self.channels, self.channels, 1),
170
+ )
171
+ self.to_kv = Conv1d(context_channels, 2 * self.channels, 1)
172
+ else:
173
+ self.to_qkv = nn.Sequential(
174
+ normalization(self.channels),
175
+ Conv1d(self.channels, 3 * self.channels, 1),
176
+ )
177
+
178
+ self.linear = Conv1d(self.channels, self.channels, 1)  # 1x1 output projection (kernel_size is required)
179
+
180
+ self.proj_out = nn.Sequential(
181
+ normalization(self.channels),
182
+ Conv1d(self.channels, self.channels, 1),
183
+ nn.GELU(),
184
+ nn.Dropout(p=self.p_dropout),
185
+ zero_module(Conv1d(self.channels, self.channels, 1)),
186
+ )
187
+
188
+ def forward(self, q: torch.Tensor, kv: torch.Tensor = None):
189
+ r"""
190
+ Args:
191
+ q: input tensor with shape [B, ``channels``, L]
192
+ kv: feature tensor with shape [B, ``context_channels``, T], if ``None``, then self-attention is performed.
193
+
194
+ Returns:
195
+ output tensor with shape [B, ``channels``, L]
196
+ """
197
+ N, C, L = q.size()
+ residual = q  # keep the (B, channels, L) input for the residual connections below
198
+
199
+ if self.context_channels is not None:
200
+ assert kv is not None, "kv must be given for cross-attention."
201
+
202
+ q = (
203
+ self.to_q(q)
204
+ .reshape(self.num_heads, self.num_head_channels, -1)
205
+ .transpose(-1, -2)
206
+ .contiguous()
207
+ )
208
+ kv = (
209
+ self.to_kv(kv)
210
+ .reshape(2, self.num_heads, self.num_head_channels, -1)
211
+ .transpose(-1, -2)
212
+ .chunk(2)
213
+ )
214
+ k, v = (
215
+ kv[0].squeeze(0).contiguous(),
216
+ kv[1].squeeze(0).contiguous(),
217
+ )
218
+
219
+ else:
220
+ qkv = (
221
+ self.to_qkv(q)
222
+ .reshape(3, self.num_heads, self.num_head_channels, -1)
223
+ .transpose(-1, -2)
224
+ .chunk(3)
225
+ )
226
+ q, k, v = (
227
+ qkv[0].squeeze(0).contiguous(),
228
+ qkv[1].squeeze(0).contiguous(),
229
+ qkv[2].squeeze(0).contiguous(),
230
+ )
231
+
232
+ h = F.scaled_dot_product_attention(q, k, v, dropout_p=self.p_dropout).transpose(
233
+ -1, -2
234
+ )
235
+ h = h.reshape(N, -1, L).contiguous()
236
+ h = self.linear(h)
237
+
238
+ x = residual + h  # residual over the original input, not the reshaped q
239
+ h = self.proj_out(x)
240
+
241
+ return x + h
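
A shape-level smoke test for the block above (relying on the 1x1 output-projection fix noted in `BasicAttentionBlock`). The concrete sizes are illustrative; cross-attention would additionally take a conditioning tensor of shape `(B, encoder_channels, T)`.

```python
import torch

attn = AttentionBlock(channels=256, num_head_channels=32, dims=1)
x = torch.randn(1, 256, 100)   # (B, channels, L)
out = attn(x)                  # (1, 256, 100): self-attention over the time axis
```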
Amphion/modules/diffusion/unet/resblock.py ADDED
@@ -0,0 +1,178 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from .basic import UNetBlock
10
+ from modules.general.utils import (
11
+ append_dims,
12
+ ConvNd,
13
+ normalization,
14
+ zero_module,
15
+ )
16
+
17
+
18
+ class ResBlock(UNetBlock):
19
+ r"""A residual block that can optionally change the number of channels.
20
+
21
+ Args:
22
+ channels: the number of input channels.
23
+ emb_channels: the number of timestep embedding channels.
24
+ dropout: the rate of dropout.
25
+ out_channels: if specified, the number of out channels.
26
+ use_conv: if True and out_channels is specified, use a spatial
27
+ convolution instead of a smaller 1x1 convolution to change the
28
+ channels in the skip connection.
29
+ dims: determines if the signal is 1D, 2D, or 3D.
30
+ up: if True, use this block for upsampling.
31
+ down: if True, use this block for downsampling.
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ channels,
37
+ emb_channels,
38
+ dropout: float = 0.0,
39
+ out_channels=None,
40
+ use_conv=False,
41
+ use_scale_shift_norm=False,
42
+ dims=2,
43
+ up=False,
44
+ down=False,
45
+ ):
46
+ super().__init__()
47
+ self.channels = channels
48
+ self.emb_channels = emb_channels
49
+ self.dropout = dropout
50
+ self.out_channels = out_channels or channels
51
+ self.use_conv = use_conv
52
+ self.use_scale_shift_norm = use_scale_shift_norm
53
+
54
+ self.in_layers = nn.Sequential(
55
+ normalization(channels),
56
+ nn.SiLU(),
57
+ ConvNd(dims, channels, self.out_channels, 3, padding=1),
58
+ )
59
+
60
+ self.updown = up or down
61
+
62
+ if up:
63
+ self.h_upd = Upsample(channels, False, dims)
64
+ self.x_upd = Upsample(channels, False, dims)
65
+ elif down:
66
+ self.h_upd = Downsample(channels, False, dims)
67
+ self.x_upd = Downsample(channels, False, dims)
68
+ else:
69
+ self.h_upd = self.x_upd = nn.Identity()
70
+
71
+ self.emb_layers = nn.Sequential(
72
+ nn.SiLU(),
73
+ ConvNd(
74
+ dims,
75
+ emb_channels,
76
+ 2 * self.out_channels if use_scale_shift_norm else self.out_channels,
77
+ 1,
78
+ ),
79
+ )
80
+ self.out_layers = nn.Sequential(
81
+ normalization(self.out_channels),
82
+ nn.SiLU(),
83
+ nn.Dropout(p=dropout),
84
+ zero_module(
85
+ ConvNd(dims, self.out_channels, self.out_channels, 3, padding=1)
86
+ ),
87
+ )
88
+
89
+ if self.out_channels == channels:
90
+ self.skip_connection = nn.Identity()
91
+ elif use_conv:
92
+ self.skip_connection = ConvNd(
93
+ dims, channels, self.out_channels, 3, padding=1
94
+ )
95
+ else:
96
+ self.skip_connection = ConvNd(dims, channels, self.out_channels, 1)
97
+
98
+ def forward(self, x, emb):
99
+ """
100
+ Apply the block to a Tensor, conditioned on a timestep embedding.
101
+
102
+ x: an [N x C x ...] Tensor of features.
103
+ emb: an [N x emb_channels x ...] Tensor of timestep embeddings.
104
+ :return: an [N x C x ...] Tensor of outputs.
105
+ """
106
+ if self.updown:
107
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
108
+ h = in_rest(x)
109
+ h = self.h_upd(h)
110
+ x = self.x_upd(x)
111
+ h = in_conv(h)
112
+ else:
113
+ h = self.in_layers(x)
114
+ emb_out = self.emb_layers(emb)
115
+ emb_out = append_dims(emb_out, h.dim())
116
+ if self.use_scale_shift_norm:
117
+ out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
118
+ scale, shift = torch.chunk(emb_out, 2, dim=1)
119
+ h = out_norm(h) * (1 + scale) + shift
120
+ h = out_rest(h)
121
+ else:
122
+ h = h + emb_out
123
+ h = self.out_layers(h)
124
+ return self.skip_connection(x) + h
125
+
126
+
127
+ class Upsample(nn.Module):
128
+ r"""An upsampling layer with an optional convolution.
129
+
130
+ Args:
131
+ channels: channels in the inputs and outputs.
132
+ dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
133
+ upsampling occurs in the inner-two dimensions.
134
+ out_channels: if specified, the number of out channels.
135
+ """
136
+
137
+ def __init__(self, channels, dims=2, out_channels=None):
138
+ super().__init__()
139
+ self.channels = channels
140
+ self.out_channels = out_channels or channels
141
+ self.dims = dims
142
+ self.conv = ConvNd(dims, self.channels, self.out_channels, 3, padding=1)
143
+
144
+ def forward(self, x):
145
+ assert x.shape[1] == self.channels
146
+ if self.dims == 3:
147
+ x = F.interpolate(
148
+ x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
149
+ )
150
+ else:
151
+ x = F.interpolate(x, scale_factor=2, mode="nearest")
152
+ x = self.conv(x)
153
+ return x
154
+
155
+
156
+ class Downsample(nn.Module):
157
+ r"""A downsampling layer with an optional convolution.
158
+
159
+ Args:
160
+ channels: channels in the inputs and outputs.
161
+ dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
162
+ downsampling occurs in the inner-two dimensions.
163
+ out_channels: if specified, the number of output channels.
164
+ """
165
+
166
+ def __init__(self, channels, dims=2, out_channels=None):
167
+ super().__init__()
168
+ self.channels = channels
169
+ self.out_channels = out_channels or channels
170
+ self.dims = dims
171
+ stride = 2 if dims != 3 else (1, 2, 2)
172
+ self.op = ConvNd(
173
+ dims, self.channels, self.out_channels, 3, stride=stride, padding=1
174
+ )
175
+
176
+ def forward(self, x):
177
+ assert x.shape[1] == self.channels
178
+ return self.op(x)
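
A minimal sketch of the FiLM-conditioned residual block above in the 1-D setting. The only assumption beyond the code is the embedding layout: because `emb_layers` starts with a 1x1 `ConvNd`, the timestep embedding must already be channel-first, e.g. `(N, emb_channels, 1)`, so that it broadcasts over the time axis.

```python
import torch

block = ResBlock(
    channels=128, emb_channels=512, dropout=0.1,
    out_channels=256, dims=1, use_scale_shift_norm=True,
)
x = torch.randn(4, 128, 200)   # (N, C, L) features
emb = torch.randn(4, 512, 1)   # timestep embedding, broadcast over L
out = block(x, emb)            # (4, 256, 200): scale-shift (FiLM) conditioning inside
```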
Amphion/modules/diffusion/unet/unet.py ADDED
@@ -0,0 +1,310 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from modules.encoder.position_encoder import PositionEncoder
10
+ from modules.general.utils import append_dims, ConvNd, normalization, zero_module
11
+ from .attention import AttentionBlock
12
+ from .resblock import Downsample, ResBlock, Upsample
13
+
14
+
15
+ class UNet(nn.Module):
16
+ r"""The full UNet model with attention and timestep embedding.
17
+
18
+ Args:
19
+ dims: determines if the signal is 1D (temporal), 2D(spatial).
20
+ in_channels: channels in the input Tensor.
21
+ model_channels: base channel count for the model.
22
+ out_channels: channels in the output Tensor.
23
+ num_res_blocks: number of residual blocks per downsample.
24
+ channel_mult: channel multiplier for each level of the UNet.
25
+ num_attn_blocks: number of attention blocks at place.
26
+ attention_resolutions: a collection of downsample rates at which attention will
27
+ take place. May be a set, list, or tuple. For example, if this contains 4,
28
+ then at 4x downsampling, attention will be used.
29
+ num_heads: the number of attention heads in each attention layer.
30
+ num_head_channels: if specified, ignore num_heads and instead use a fixed
31
+ channel width per attention head.
32
+ d_context: if specified, use for cross-attention channel project.
33
+ p_dropout: the dropout probability.
34
+ use_self_attention: Apply self attention before cross attention.
35
+ num_classes: if specified (as an int), then this model will be class-conditional
36
+ with ``num_classes`` classes.
37
+ use_extra_film: if specified, use an extra FiLM-like conditioning mechanism.
38
+ d_emb: if specified, use for FiLM-like conditioning.
39
+ use_scale_shift_norm: use a FiLM-like conditioning mechanism.
40
+ resblock_updown: use residual blocks for up/downsampling.
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ dims: int = 1,
46
+ in_channels: int = 100,
47
+ model_channels: int = 128,
48
+ out_channels: int = 100,
49
+ h_dim: int = 128,
50
+ num_res_blocks: int = 1,
51
+ channel_mult: tuple = (1, 2, 4),
52
+ num_attn_blocks: int = 1,
53
+ attention_resolutions: tuple = (1, 2, 4),
54
+ num_heads: int = 1,
55
+ num_head_channels: int = -1,
56
+ d_context: int = None,
57
+ context_hdim: int = 128,
58
+ p_dropout: float = 0.0,
59
+ num_classes: int = -1,
60
+ use_extra_film: str = None,
61
+ d_emb: int = None,
62
+ use_scale_shift_norm: bool = True,
63
+ resblock_updown: bool = False,
64
+ ):
65
+ super().__init__()
66
+
67
+ self.dims = dims
68
+ self.in_channels = in_channels
69
+ self.model_channels = model_channels
70
+ self.out_channels = out_channels
71
+ self.num_res_blocks = num_res_blocks
72
+ self.channel_mult = channel_mult
73
+ self.num_attn_blocks = num_attn_blocks
74
+ self.attention_resolutions = attention_resolutions
75
+ self.num_heads = num_heads
76
+ self.num_head_channels = num_head_channels
77
+ self.d_context = d_context
78
+ self.p_dropout = p_dropout
79
+ self.num_classes = num_classes
80
+ self.use_extra_film = use_extra_film
81
+ self.d_emb = d_emb
82
+ self.use_scale_shift_norm = use_scale_shift_norm
83
+ self.resblock_updown = resblock_updown
84
+
85
+ time_embed_dim = model_channels * 4
86
+ self.pos_enc = PositionEncoder(model_channels, time_embed_dim)
87
+
88
+ assert (
89
+ num_classes == -1 or use_extra_film is None
90
+ ), "You cannot set both num_classes and use_extra_film."
91
+
92
+ if self.num_classes > 0:
93
+ # TODO: if used for singer, norm should be 1, correct?
94
+ self.label_emb = nn.Embedding(num_classes, time_embed_dim, max_norm=1.0)
95
+ elif use_extra_film is not None:
96
+ assert (
97
+ d_emb is not None
98
+ ), "d_emb must be specified if use_extra_film is not None"
99
+ assert use_extra_film in [
100
+ "add",
101
+ "concat",
102
+ ], f"use_extra_film only supported by add or concat. Your input is {use_extra_film}"
103
+ self.use_extra_film = use_extra_film
104
+ self.film_emb = ConvNd(dims, d_emb, time_embed_dim, 1)
105
+ if use_extra_film == "concat":
106
+ time_embed_dim *= 2
107
+
108
+ # Input blocks
109
+ ch = input_ch = int(channel_mult[0] * model_channels)
110
+ self.input_blocks = nn.ModuleList(
111
+ [UNetSequential(ConvNd(dims, in_channels, ch, 3, padding=1))]
112
+ )
113
+ self._feature_size = ch
114
+ input_block_chans = [ch]
115
+ ds = 1
116
+ for level, mult in enumerate(channel_mult):
117
+ for _ in range(num_res_blocks):
118
+ layers = [
119
+ ResBlock(
120
+ ch,
121
+ time_embed_dim,
122
+ p_dropout,
123
+ out_channels=int(mult * model_channels),
124
+ dims=dims,
125
+ use_scale_shift_norm=use_scale_shift_norm,
126
+ )
127
+ ]
128
+ ch = int(mult * model_channels)
129
+ if ds in attention_resolutions:
130
+ for _ in range(num_attn_blocks):
131
+ layers.append(
132
+ AttentionBlock(
133
+ ch,
134
+ num_heads=num_heads,
135
+ num_head_channels=num_head_channels,
136
+ encoder_channels=d_context,
137
+ dims=dims,
138
+ h_dim=h_dim // (level + 1),
139
+ encoder_hdim=context_hdim,
140
+ p_dropout=p_dropout,
141
+ )
142
+ )
143
+ self.input_blocks.append(UNetSequential(*layers))
144
+ self._feature_size += ch
145
+ input_block_chans.append(ch)
146
+ if level != len(channel_mult) - 1:
147
+ out_ch = ch
148
+ self.input_blocks.append(
149
+ UNetSequential(
150
+ ResBlock(
151
+ ch,
152
+ time_embed_dim,
153
+ p_dropout,
154
+ out_channels=out_ch,
155
+ dims=dims,
156
+ use_scale_shift_norm=use_scale_shift_norm,
157
+ down=True,
158
+ )
159
+ if resblock_updown
160
+ else Downsample(ch, dims=dims, out_channels=out_ch)
161
+ )
162
+ )
163
+ ch = out_ch
164
+ input_block_chans.append(ch)
165
+ ds *= 2
166
+ self._feature_size += ch
167
+
168
+ # Middle blocks
169
+ self.middle_block = UNetSequential(
170
+ ResBlock(
171
+ ch,
172
+ time_embed_dim,
173
+ p_dropout,
174
+ dims=dims,
175
+ use_scale_shift_norm=use_scale_shift_norm,
176
+ ),
177
+ AttentionBlock(
178
+ ch,
179
+ num_heads=num_heads,
180
+ num_head_channels=num_head_channels,
181
+ encoder_channels=d_context,
182
+ dims=dims,
183
+ h_dim=h_dim // (level + 1),
184
+ encoder_hdim=context_hdim,
185
+ p_dropout=p_dropout,
186
+ ),
187
+ ResBlock(
188
+ ch,
189
+ time_embed_dim,
190
+ p_dropout,
191
+ dims=dims,
192
+ use_scale_shift_norm=use_scale_shift_norm,
193
+ ),
194
+ )
195
+ self._feature_size += ch
196
+
197
+ # Output blocks
198
+ self.output_blocks = nn.ModuleList([])
199
+ for level, mult in tuple(enumerate(channel_mult))[::-1]:
200
+ for i in range(num_res_blocks + 1):
201
+ ich = input_block_chans.pop()
202
+ layers = [
203
+ ResBlock(
204
+ ch + ich,
205
+ time_embed_dim,
206
+ p_dropout,
207
+ out_channels=int(model_channels * mult),
208
+ dims=dims,
209
+ use_scale_shift_norm=use_scale_shift_norm,
210
+ )
211
+ ]
212
+ ch = int(model_channels * mult)
213
+ if ds in attention_resolutions:
214
+ for _ in range(num_attn_blocks):
215
+ layers.append(
216
+ AttentionBlock(
217
+ ch,
218
+ num_heads=num_heads,
219
+ num_head_channels=num_head_channels,
220
+ encoder_channels=d_context,
221
+ dims=dims,
222
+ h_dim=h_dim // (level + 1),
223
+ encoder_hdim=context_hdim,
224
+ p_dropout=p_dropout,
225
+ )
226
+ )
227
+ if level and i == num_res_blocks:
228
+ out_ch = ch
229
+ layers.append(
230
+ ResBlock(
231
+ ch,
232
+ time_embed_dim,
233
+ p_dropout,
234
+ out_channels=out_ch,
235
+ dims=dims,
236
+ use_scale_shift_norm=use_scale_shift_norm,
237
+ up=True,
238
+ )
239
+ if resblock_updown
240
+ else Upsample(ch, dims=dims, out_channels=out_ch)
241
+ )
242
+ ds //= 2
243
+ self.output_blocks.append(UNetSequential(*layers))
244
+ self._feature_size += ch
245
+
246
+ # Final proj out
247
+ self.out = nn.Sequential(
248
+ normalization(ch),
249
+ nn.SiLU(),
250
+ zero_module(ConvNd(dims, input_ch, out_channels, 3, padding=1)),
251
+ )
252
+
253
+ def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
254
+ r"""Apply the model to an input batch.
255
+
256
+ Args:
257
+ x: an [N x C x ...] Tensor of inputs.
258
+ timesteps: a 1-D batch of timesteps, i.e. [N].
259
+ context: conditioning Tensor with shape of [N x ``d_context`` x ...] plugged
260
+ in via cross attention.
261
+ y: an [N] Tensor of labels, if **class-conditional**.
262
+ an [N x ``d_emb`` x ...] Tensor if **film-embed conditional**.
263
+
264
+ Returns:
265
+ an [N x C x ...] Tensor of outputs.
266
+ """
267
+ assert (y is None) or (
268
+ (y is not None)
269
+ and ((self.num_classes > 0) or (self.use_extra_film is not None))
270
+ ), f"y must be specified if num_classes or use_extra_film is not None. \nGot num_classes: {self.num_classes}\t\nuse_extra_film: {self.use_extra_film}\t\n"
271
+
272
+ hs = []
273
+ emb = self.pos_enc(timesteps)
274
+ emb = append_dims(emb, x.dim())
275
+
276
+ if self.num_classes > 0:
277
+ assert y.size() == (x.size(0),)
278
+ emb = emb + self.label_emb(y)
279
+ elif self.use_extra_film is not None:
280
+ assert y.size() == (x.size(0), self.d_emb, *x.size()[2:])
281
+ y = self.film_emb(y)
282
+ if self.use_extra_film == "add":
283
+ emb = emb + y
284
+ elif self.use_extra_film == "concat":
285
+ emb = torch.cat([emb, y], dim=1)
286
+
287
+ h = x
288
+ for module in self.input_blocks:
289
+ h = module(h, emb, context)
290
+ hs.append(h)
291
+ h = self.middle_block(h, emb, context)
292
+ for module in self.output_blocks:
293
+ h = torch.cat([h, hs.pop()], dim=1)
294
+ h = module(h, emb, context)
295
+
296
+ return self.out(h)
297
+
298
+
299
+ class UNetSequential(nn.Sequential):
300
+ r"""A sequential module that passes embeddings to the children that support it."""
301
+
302
+ def forward(self, x, emb=None, context=None):
303
+ for layer in self:
304
+ if isinstance(layer, ResBlock):
305
+ x = layer(x, emb)
306
+ elif isinstance(layer, AttentionBlock):
307
+ x = layer(x, context)
308
+ else:
309
+ x = layer(x)
310
+ return x
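
A smoke-test sketch for the 1-D UNet. It assumes `PositionEncoder` (imported above, not shown in this diff) maps a batch of timesteps to a `(N, 4 * model_channels)` embedding, and it uses a sequence length divisible by the total downsampling factor (4 here) so the skip connections line up.

```python
import torch

unet = UNet(
    dims=1, in_channels=100, model_channels=128, out_channels=100,
    h_dim=128, channel_mult=(1, 2, 4), attention_resolutions=(4,),
)
x = torch.randn(2, 100, 200)       # noisy mel-like input, (N, C, L)
t = torch.randint(0, 1000, (2,))   # diffusion timesteps
eps = unet(x, timesteps=t)         # (2, 100, 200)
```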
Amphion/modules/encoder/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .token_encoder import TokenEmbedding
Amphion/modules/general/utils.py ADDED
@@ -0,0 +1,100 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+
10
+ def normalization(channels: int, groups: int = 32):
11
+ r"""Make a standard normalization layer, i.e. GroupNorm.
12
+
13
+ Args:
14
+ channels: number of input channels.
15
+ groups: number of groups for group normalization.
16
+
17
+ Returns:
18
+ a ``nn.Module`` for normalization.
19
+ """
20
+ assert groups > 0, f"invalid number of groups: {groups}"
21
+ return nn.GroupNorm(groups, channels)
22
+
23
+
24
+ def Linear(*args, **kwargs):
25
+ r"""Wrapper of ``nn.Linear`` with kaiming_normal_ initialization."""
26
+ layer = nn.Linear(*args, **kwargs)
27
+ nn.init.kaiming_normal_(layer.weight)
28
+ return layer
29
+
30
+
31
+ def Conv1d(*args, **kwargs):
32
+ r"""Wrapper of ``nn.Conv1d`` with kaiming_normal_ initialization."""
33
+ layer = nn.Conv1d(*args, **kwargs)
34
+ nn.init.kaiming_normal_(layer.weight)
35
+ return layer
36
+
37
+
38
+ def Conv2d(*args, **kwargs):
39
+ r"""Wrapper of ``nn.Conv2d`` with kaiming_normal_ initialization."""
40
+ layer = nn.Conv2d(*args, **kwargs)
41
+ nn.init.kaiming_normal_(layer.weight)
42
+ return layer
43
+
44
+
45
+ def ConvNd(dims: int = 1, *args, **kwargs):
46
+ r"""Wrapper of N-dimension convolution with kaiming_normal_ initialization.
47
+
48
+ Args:
49
+ dims: number of dimensions of the convolution.
50
+ """
51
+ if dims == 1:
52
+ return Conv1d(*args, **kwargs)
53
+ elif dims == 2:
54
+ return Conv2d(*args, **kwargs)
55
+ else:
56
+ raise ValueError(f"invalid number of dimensions: {dims}")
57
+
58
+
59
+ def zero_module(module: nn.Module):
60
+ r"""Zero out the parameters of a module and return it."""
61
+ nn.init.zeros_(module.weight)
62
+ nn.init.zeros_(module.bias)
63
+ return module
64
+
65
+
66
+ def scale_module(module: nn.Module, scale):
67
+ r"""Scale the parameters of a module and return it."""
68
+ for p in module.parameters():
69
+ p.detach().mul_(scale)
70
+ return module
71
+
72
+
73
+ def mean_flat(tensor: torch.Tensor):
74
+ r"""Take the mean over all non-batch dimensions."""
75
+ return tensor.mean(dim=tuple(range(1, tensor.dim())))
76
+
77
+
78
+ def append_dims(x, target_dims):
79
+ r"""Appends dimensions to the end of a tensor until
80
+ it has target_dims dimensions.
81
+ """
82
+ dims_to_append = target_dims - x.dim()
83
+ if dims_to_append < 0:
84
+ raise ValueError(
85
+ f"input has {x.dim()} dims but target_dims is {target_dims}, which is less"
86
+ )
87
+ return x[(...,) + (None,) * dims_to_append]
88
+
89
+
90
+ def append_zero(x, count=1):
91
+ r"""Appends ``count`` zeros to the end of a tensor along the last dimension."""
92
+ assert count > 0, f"invalid count: {count}"
93
+ return torch.cat([x, x.new_zeros((*x.size()[:-1], count))], dim=-1)
94
+
95
+
96
+ class Transpose(nn.Identity):
97
+ """(N, T, D) -> (N, D, T)"""
98
+
99
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
100
+ return input.transpose(1, 2)
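
Two of these helpers carry most of the diffusion-specific conventions, so a short sketch: `append_dims` right-pads singleton dimensions so an embedding broadcasts over time, and `zero_module` makes the final projection of a block start as the zero map, a common initialization for diffusion decoders.

```python
import torch

emb = torch.randn(4, 512)
emb = append_dims(emb, 3)                  # (4, 512) -> (4, 512, 1)

proj = zero_module(Conv1d(256, 80, 1))     # weights and bias zeroed
y = proj(torch.randn(4, 256, 100))
assert torch.count_nonzero(y) == 0         # starts as the zero map
```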
Amphion/modules/norms/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .norm import AdaptiveLayerNorm, LayerNorm, BalancedBasicNorm, IdentityNorm