RNNoise (models)
This view is limited to 50 files because the commit contains too many changes.
- .gitattributes +7 -0
- models/WaveRNNModel/.gitattributes +1 -0
- models/WaveRNNModel/.gitignore +48 -0
- models/WaveRNNModel/LICENSE.txt +21 -0
- models/WaveRNNModel/README.md +102 -0
- models/WaveRNNModel/__init__.py +13 -0
- models/WaveRNNModel/assets/WaveRNN.png +0 -0
- models/WaveRNNModel/assets/tacotron_wavernn.png +3 -0
- models/WaveRNNModel/assets/training_viz.gif +3 -0
- models/WaveRNNModel/assets/wavernn_alt_model_hrz2.png +3 -0
- models/WaveRNNModel/data/dataset.pkl +3 -0
- models/WaveRNNModel/data/text_dict.pkl +3 -0
- models/WaveRNNModel/gen_tacotron.py +178 -0
- models/WaveRNNModel/gen_wavernn.py +142 -0
- models/WaveRNNModel/hparams.py +101 -0
- models/WaveRNNModel/loss_plot.py +70 -0
- models/WaveRNNModel/model_outputs/ljspeech_lsa_smooth_attention.tacotron.zip +3 -0
- models/WaveRNNModel/model_outputs/ljspeech_mol.wavernn.zip +3 -0
- models/WaveRNNModel/models/__init__.py +0 -0
- models/WaveRNNModel/models/deepmind_version.py +176 -0
- models/WaveRNNModel/models/fatchord_version.py +435 -0
- models/WaveRNNModel/models/tacotron.py +469 -0
- models/WaveRNNModel/notebooks/NB1 - Fit a Sine Wave.ipynb +0 -0
- models/WaveRNNModel/notebooks/NB2 - Fit a Short Sample.ipynb +0 -0
- models/WaveRNNModel/notebooks/NB3 - Fit a 30min Sample.ipynb +0 -0
- models/WaveRNNModel/notebooks/NB4a - Alternative Model (Preprocessing).ipynb +0 -0
- models/WaveRNNModel/notebooks/NB4b - Alternative Model (Training).ipynb +0 -0
- models/WaveRNNModel/notebooks/Pruning - Scratchpad.ipynb +0 -0
- models/WaveRNNModel/notebooks/__init__.py +0 -0
- models/WaveRNNModel/notebooks/models/wavernn.py +172 -0
- models/WaveRNNModel/notebooks/outputs/nb1/model_output.wav +0 -0
- models/WaveRNNModel/notebooks/outputs/nb2/3k_steps.wav +3 -0
- models/WaveRNNModel/notebooks/outputs/nb3/12k_steps.wav +3 -0
- models/WaveRNNModel/notebooks/utils/__init__.py +0 -0
- models/WaveRNNModel/notebooks/utils/display.py +40 -0
- models/WaveRNNModel/notebooks/utils/dsp.py +70 -0
- models/WaveRNNModel/preprocess.py +103 -0
- models/WaveRNNModel/quick_start.py +122 -0
- models/WaveRNNModel/quick_start/tts_weights/latest_weights.pyt +3 -0
- models/WaveRNNModel/quick_start/voc_weights/latest_weights.pyt +3 -0
- models/WaveRNNModel/requirements.txt +6 -0
- models/WaveRNNModel/sentences.txt +6 -0
- models/WaveRNNModel/source.txt +1 -0
- models/WaveRNNModel/train_tacotron.py +203 -0
- models/WaveRNNModel/train_wavernn.py +164 -0
- models/WaveRNNModel/utils/__init__.py +106 -0
- models/WaveRNNModel/utils/checkpoints.py +128 -0
- models/WaveRNNModel/utils/dataset.py +232 -0
- models/WaveRNNModel/utils/display.py +121 -0
- models/WaveRNNModel/utils/distribution.py +132 -0
.gitattributes
CHANGED
@@ -49,3 +49,10 @@ models/ailia-models/code/babble_15dB.wav filter=lfs diff=lfs merge=lfs -text
 models/ailia-models/code/denoised.wav filter=lfs diff=lfs merge=lfs -text
 models/rnnoise-wrapper/weights_5h_b_500k.hdf5 filter=lfs diff=lfs merge=lfs -text
 models/rnnoise-wrapper/weights_5h_ru_500k.hdf5 filter=lfs diff=lfs merge=lfs -text
+models/WaveRNNModel/assets/tacotron_wavernn.png filter=lfs diff=lfs merge=lfs -text
+models/WaveRNNModel/assets/training_viz.gif filter=lfs diff=lfs merge=lfs -text
+models/WaveRNNModel/assets/wavernn_alt_model_hrz2.png filter=lfs diff=lfs merge=lfs -text
+models/WaveRNNModel/notebooks/outputs/nb2/3k_steps.wav filter=lfs diff=lfs merge=lfs -text
+models/WaveRNNModel/notebooks/outputs/nb3/12k_steps.wav filter=lfs diff=lfs merge=lfs -text
+models/WaveRNNModel/quick_start/tts_weights/latest_weights.pyt filter=lfs diff=lfs merge=lfs -text
+models/WaveRNNModel/quick_start/voc_weights/latest_weights.pyt filter=lfs diff=lfs merge=lfs -text
models/WaveRNNModel/.gitattributes
ADDED
@@ -0,0 +1 @@
+*.ipynb linguist-language=Python
models/WaveRNNModel/.gitignore
ADDED
@@ -0,0 +1,48 @@
+# IDE files
+.idea
+.vscode
+
+# Mac files
+.DS_Store
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Jupyter Notebook
+.ipynb_checkpoints
models/WaveRNNModel/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 fatchord (https://github.com/fatchord)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
models/WaveRNNModel/README.md
ADDED
@@ -0,0 +1,102 @@
+# WaveRNN
+
+##### (Update: Vanilla Tacotron One TTS system just implemented - more coming soon!)
+
+
+
+Pytorch implementation of Deepmind's WaveRNN model from [Efficient Neural Audio Synthesis](https://arxiv.org/abs/1802.08435v1)
+
+# Installation
+
+Ensure you have:
+
+* Python >= 3.6
+* [Pytorch 1 with CUDA](https://pytorch.org/)
+
+Then install the rest with pip:
+
+> pip install -r requirements.txt
+
+# How to Use
+
+### Quick Start
+
+If you want to use TTS functionality immediately you can simply use:
+
+> python quick_start.py
+
+This will generate everything in the default sentences.txt file and output to a new 'quick_start' folder, where you can play back the wav files and take a look at the attention plots.
+
+You can also use that script to generate custom tts sentences and/or use '-u' to generate unbatched (better audio quality):
+
+> python quick_start.py -u --input_text "What will happen if I run this command?"
+
+
+### Training your own Models
+
+
+Download the [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) Dataset.
+
+Edit **hparams.py**, point **wav_path** to your dataset and run:
+
+> python preprocess.py
+
+or use preprocess.py --path to point directly to the dataset
+___
+
+Here's my recommendation on what order to run things:
+
+1 - Train Tacotron with:
+
+> python train_tacotron.py
+
+2 - You can let that finish training, or at any point you can use:
+
+> python train_tacotron.py --force_gta
+
+this will force Tacotron to create a GTA dataset even if it hasn't finished training.
+
+3 - Train WaveRNN with:
+
+> python train_wavernn.py --gta
+
+NB: You can always just run train_wavernn.py without --gta if you're not interested in TTS.
+
+4 - Generate Sentences with both models using:
+
+> python gen_tacotron.py wavernn
+
+this will generate default sentences. If you want to generate custom sentences you can use
+
+> python gen_tacotron.py --input_text "this is whatever you want it to be" wavernn
+
+And finally, you can always use --help on any of those scripts to see what options are available :)
+
+
+
+# Samples
+
+[Can be found here.](https://fatchord.github.io/model_outputs/)
+
+# Pretrained Models
+
+Currently there are two pretrained models available in the /pretrained/ folder:
+
+Both are trained on LJSpeech:
+
+* WaveRNN (Mixture of Logistics output) trained to 800k steps
+* Tacotron trained to 180k steps
+
+____
+
+### References
+
+* [Efficient Neural Audio Synthesis](https://arxiv.org/abs/1802.08435v1)
+* [Tacotron: Towards End-to-End Speech Synthesis](https://arxiv.org/abs/1703.10135)
+* [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)
+
+### Acknowledgements
+
+* [https://github.com/keithito/tacotron](https://github.com/keithito/tacotron)
+* [https://github.com/r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder)
+* Special thanks to github users [G-Wang](https://github.com/G-Wang), [geneing](https://github.com/geneing) & [erogol](https://github.com/erogol)
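Note: gen_tacotron.py (added below in this commit) wraps the CLI above in a callable, gen_tacotron_from_inputtext(args_list=None), which forwards args_list straight to argparse and returns the path of the last generated wav. A sketch of driving it from Python rather than the shell, assuming the working directory is the WaveRNNModel root with hparams and weights in place (global flags go before the 'wavernn' subcommand, vocoder flags after it):

from gen_tacotron import gen_tacotron_from_inputtext

# Equivalent to: python gen_tacotron.py --input_text "..." wavernn -u
wav_path = gen_tacotron_from_inputtext(
    ['--input_text', 'What will happen if I run this command?', 'wavernn', '-u']
)
print('Generated:', wav_path)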
models/WaveRNNModel/__init__.py
ADDED
@@ -0,0 +1,13 @@
+import sys
+import os
+from pathlib import Path
+
+# Get the absolute path of this package (i.e. the WaveRNN_master directory)
+package_dir = Path(__file__).resolve().parent
+
+# Add that path to sys.path so it becomes a root directory for module searches
+if str(package_dir) not in sys.path:
+    sys.path.insert(0, str(package_dir))
+
+# Also set the PYTHONPATH environment variable (optional, improves compatibility)
+os.environ["PYTHONPATH"] = str(package_dir) + os.pathsep + os.environ.get("PYTHONPATH", "")
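This shim exists so the repo's internal absolute imports (from models.fatchord_version import ..., from utils import ...) keep resolving when WaveRNNModel is vendored as a sub-package. A minimal sketch of the intended effect, assuming the repo root is on sys.path so the package imports as models.WaveRNNModel:

# Hypothetical usage sketch: importing the vendored package runs its
# __init__.py, which prepends the package directory to sys.path.
import models.WaveRNNModel  # noqa: F401  (side effect: path shim)

# After that import, the repo-internal absolute imports used by the
# scripts in this commit resolve without installing anything:
from models.fatchord_version import WaveRNN  # found via the injected path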
models/WaveRNNModel/assets/WaveRNN.png
ADDED
models/WaveRNNModel/assets/tacotron_wavernn.png
ADDED (Git LFS)
models/WaveRNNModel/assets/training_viz.gif
ADDED (Git LFS)
models/WaveRNNModel/assets/wavernn_alt_model_hrz2.png
ADDED (Git LFS)
models/WaveRNNModel/data/dataset.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f323c9bfdfcd5709ad210f538851303c62f63e4285acaee1af791d7da671d88
+size 234790

models/WaveRNNModel/data/text_dict.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a9c7752b430df10a503697b4c563a7ce82cbf732ca211dfeed5d0bffae6e5a6
+size 1531658
models/WaveRNNModel/gen_tacotron.py
ADDED
@@ -0,0 +1,178 @@
+import torch
+from models.fatchord_version import WaveRNN
+from utils import hparams as hp
+from utils.text.symbols import symbols
+from utils.paths import Paths
+from models.tacotron import Tacotron
+import argparse
+from utils.text import text_to_sequence
+from utils.display import save_attention, simple_table
+from utils.dsp import reconstruct_waveform, save_wav
+import numpy as np
+
+def gen_tacotron_from_inputtext(args_list=None):
+    # Parse Arguments
+    parser = argparse.ArgumentParser(description='TTS Generator')
+    parser.add_argument('--input_text', '-i', type=str, help='[string] Type in something here and TTS will generate it!')
+    parser.add_argument('--tts_weights', type=str, help='[string/path] Load in different Tacotron weights')
+    parser.add_argument('--save_attention', '-a', dest='save_attn', action='store_true', help='Save Attention Plots')
+    parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment')
+    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py', help='The file to use for the hyperparameters')
+
+    parser.set_defaults(input_text=None)
+    parser.set_defaults(weights_path=None)
+
+    # name of subcommand goes to args.vocoder
+    subparsers = parser.add_subparsers(required=True, dest='vocoder')
+
+    wr_parser = subparsers.add_parser('wavernn', aliases=['wr'])
+    wr_parser.add_argument('--batched', '-b', dest='batched', action='store_true', help='Fast Batched Generation')
+    wr_parser.add_argument('--unbatched', '-u', dest='batched', action='store_false', help='Slow Unbatched Generation')
+    wr_parser.add_argument('--overlap', '-o', type=int, help='[int] number of crossover samples')
+    wr_parser.add_argument('--target', '-t', type=int, help='[int] number of samples in each batch index')
+    wr_parser.add_argument('--voc_weights', type=str, help='[string/path] Load in different WaveRNN weights')
+    wr_parser.set_defaults(batched=None)
+
+    gl_parser = subparsers.add_parser('griffinlim', aliases=['gl'])
+    gl_parser.add_argument('--iters', type=int, default=32, help='[int] number of griffinlim iterations')
+
+    args = parser.parse_args(args=args_list)
+
+    if args.vocoder in ['griffinlim', 'gl']:
+        args.vocoder = 'griffinlim'
+    elif args.vocoder in ['wavernn', 'wr']:
+        args.vocoder = 'wavernn'
+    else:
+        raise argparse.ArgumentError('Must provide a valid vocoder type!')
+
+    if not hp.is_configured():
+        print("args.hp_file:", args.hp_file)
+        hp.configure(args.hp_file)  # Load hparams from file
+    # set defaults for any arguments that depend on hparams
+    if args.vocoder == 'wavernn':
+        if args.target is None:
+            args.target = hp.voc_target
+        if args.overlap is None:
+            args.overlap = hp.voc_overlap
+        if args.batched is None:
+            args.batched = hp.voc_gen_batched
+
+    batched = args.batched
+    target = args.target
+    overlap = args.overlap
+
+    input_text = args.input_text
+    tts_weights = args.tts_weights
+    save_attn = args.save_attn
+
+    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
+
+    if not args.force_cpu and torch.cuda.is_available():
+        device = torch.device('cuda')
+    else:
+        device = torch.device('cpu')
+    print('Using device:', device)
+
+    if args.vocoder == 'wavernn':
+        print('\nInitialising WaveRNN Model...\n')
+        # Instantiate WaveRNN Model
+        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
+                            fc_dims=hp.voc_fc_dims,
+                            bits=hp.bits,
+                            pad=hp.voc_pad,
+                            upsample_factors=hp.voc_upsample_factors,
+                            feat_dims=hp.num_mels,
+                            compute_dims=hp.voc_compute_dims,
+                            res_out_dims=hp.voc_res_out_dims,
+                            res_blocks=hp.voc_res_blocks,
+                            hop_length=hp.hop_length,
+                            sample_rate=hp.sample_rate,
+                            mode=hp.voc_mode).to(device)
+
+        voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
+        voc_model.load(voc_load_path)
+
+    print('\nInitialising Tacotron Model...\n')
+
+    # Instantiate Tacotron Model
+    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
+                         num_chars=len(symbols),
+                         encoder_dims=hp.tts_encoder_dims,
+                         decoder_dims=hp.tts_decoder_dims,
+                         n_mels=hp.num_mels,
+                         fft_bins=hp.num_mels,
+                         postnet_dims=hp.tts_postnet_dims,
+                         encoder_K=hp.tts_encoder_K,
+                         lstm_dims=hp.tts_lstm_dims,
+                         postnet_K=hp.tts_postnet_K,
+                         num_highways=hp.tts_num_highways,
+                         dropout=hp.tts_dropout,
+                         stop_threshold=hp.tts_stop_threshold).to(device)
+
+    tts_load_path = tts_weights if tts_weights else paths.tts_latest_weights
+    tts_model.load(tts_load_path)
+
+    if input_text:
+        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
+    else:
+        with open('sentences.txt') as f:
+            inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]
+
+    if args.vocoder == 'wavernn':
+        voc_k = voc_model.get_step() // 1000
+        tts_k = tts_model.get_step() // 1000
+
+        simple_table([('Tacotron', str(tts_k) + 'k'),
+                      ('r', tts_model.r),
+                      ('Vocoder Type', 'WaveRNN'),
+                      ('WaveRNN', str(voc_k) + 'k'),
+                      ('Generation Mode', 'Batched' if batched else 'Unbatched'),
+                      ('Target Samples', target if batched else 'N/A'),
+                      ('Overlap Samples', overlap if batched else 'N/A')])
+
+    elif args.vocoder == 'griffinlim':
+        tts_k = tts_model.get_step() // 1000
+        simple_table([('Tacotron', str(tts_k) + 'k'),
+                      ('r', tts_model.r),
+                      ('Vocoder Type', 'Griffin-Lim'),
+                      ('GL Iters', args.iters)])
+
+    for i, x in enumerate(inputs, 1):
+
+        print(f'\n| Generating {i}/{len(inputs)}')
+        _, m, attention = tts_model.generate(x)
+        # Fix mel spectrogram scaling to be from 0 to 1
+        m = (m + 4) / 8
+        np.clip(m, 0, 1, out=m)
+
+        if args.vocoder == 'griffinlim':
+            v_type = args.vocoder
+        elif args.vocoder == 'wavernn' and args.batched:
+            v_type = 'wavernn_batched'
+        else:
+            v_type = 'wavernn_unbatched'
+
+        if input_text:
+            print("path:", paths.tts_output)
+            save_path = paths.tts_output/f'__input_{input_text[:10]}_{v_type}_{tts_k}k.wav'
+        else:
+            print("path:", paths.tts_output)
+            save_path = paths.tts_output/f'{i}_{v_type}_{tts_k}k.wav'
+
+        if save_attn: save_attention(attention, save_path)
+
+        if args.vocoder == 'wavernn':
+            m = torch.tensor(m).unsqueeze(0)
+            voc_model.generate(m, save_path, batched, hp.voc_target, hp.voc_overlap, hp.mu_law)
+        elif args.vocoder == 'griffinlim':
+            wav = reconstruct_waveform(m, n_iter=args.iters)
+            save_wav(wav, save_path)
+
+    print('\n\nDone.\n')
+    return save_path
+
+
+
+if __name__ == "__main__":
+
+    gen_tacotron_from_inputtext()
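One step worth flagging in the script above: Tacotron's decoder emits mels in roughly [-4, 4] (see the tts_stop_threshold comment in hparams.py), while the vocoder expects values in [0, 1], hence m = (m + 4) / 8. A worked check of that remapping:

import numpy as np

# Example frame values at the edges of Tacotron's output range
m = np.array([-4.0, -2.0, 0.0, 2.0, 4.0])
m = (m + 4) / 8            # linear map [-4, 4] -> [0, 1]
np.clip(m, 0, 1, out=m)    # clamp any overshoot in place
print(m)                   # [0.   0.25 0.5  0.75 1.  ]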
models/WaveRNNModel/gen_wavernn.py
ADDED
@@ -0,0 +1,142 @@
+from utils.dataset import get_vocoder_datasets
+from utils.dsp import *
+from models.fatchord_version import WaveRNN
+from utils.paths import Paths
+from utils.display import simple_table
+import torch
+import argparse
+from pathlib import Path
+
+
+def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
+
+    k = model.get_step() // 1000
+
+    for i, (m, x) in enumerate(test_set, 1):
+
+        if i > samples: break
+
+        print('\n| Generating: %i/%i' % (i, samples))
+
+        x = x[0].numpy()
+
+        bits = 16 if hp.voc_mode == 'MOL' else hp.bits
+
+        if hp.mu_law and hp.voc_mode != 'MOL':
+            x = decode_mu_law(x, 2**bits, from_labels=True)
+        else:
+            x = label_2_float(x, bits)
+
+        save_wav(x, save_path/f'{k}k_steps_{i}_target.wav')
+
+        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
+        save_str = str(save_path/f'{k}k_steps_{i}_{batch_str}.wav')
+
+        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
+
+
+def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
+
+    k = model.get_step() // 1000
+    file_name = load_path.stem
+
+    suffix = load_path.suffix
+    if suffix == ".wav":
+        wav = load_wav(load_path)
+        save_wav(wav, save_path/f'__{file_name}__{k}k_steps_target.wav')
+        mel = melspectrogram(wav)
+    elif suffix == ".npy":
+        mel = np.load(load_path)
+        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
+            raise ValueError(f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!')
+        _max = np.max(mel)
+        _min = np.min(mel)
+        if _max >= 1.01 or _min <= -0.01:
+            raise ValueError(f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]')
+    else:
+        raise ValueError(f"Expected an extension of .wav or .npy, but got {suffix}!")
+
+
+    mel = torch.tensor(mel).unsqueeze(0)
+
+    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
+    save_str = save_path/f'__{file_name}__{k}k_steps_{batch_str}.wav'
+
+    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description='Generate WaveRNN Samples')
+    parser.add_argument('--batched', '-b', dest='batched', action='store_true', help='Fast Batched Generation')
+    parser.add_argument('--unbatched', '-u', dest='batched', action='store_false', help='Slow Unbatched Generation')
+    parser.add_argument('--samples', '-s', type=int, help='[int] number of utterances to generate')
+    parser.add_argument('--target', '-t', type=int, help='[int] number of samples in each batch index')
+    parser.add_argument('--overlap', '-o', type=int, help='[int] number of crossover samples')
+    parser.add_argument('--file', '-f', type=str, help='[string/path] for testing a wav outside dataset')
+    parser.add_argument('--voc_weights', '-w', type=str, help='[string/path] Load in different WaveRNN weights')
+    parser.add_argument('--gta', '-g', dest='gta', action='store_true', help='Generate from GTA testset')
+    parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment')
+    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py', help='The file to use for the hyperparameters')
+
+    parser.set_defaults(batched=None)
+
+    args = parser.parse_args()
+
+    hp.configure(args.hp_file)  # Load hparams from file
+    # set defaults for any arguments that depend on hparams
+    if args.target is None:
+        args.target = hp.voc_target
+    if args.overlap is None:
+        args.overlap = hp.voc_overlap
+    if args.batched is None:
+        args.batched = hp.voc_gen_batched
+    if args.samples is None:
+        args.samples = hp.voc_gen_at_checkpoint
+
+    batched = args.batched
+    samples = args.samples
+    target = args.target
+    overlap = args.overlap
+    file = args.file
+    gta = args.gta
+
+    if not args.force_cpu and torch.cuda.is_available():
+        device = torch.device('cuda')
+    else:
+        device = torch.device('cpu')
+    print('Using device:', device)
+
+    print('\nInitialising Model...\n')
+
+    model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
+                    fc_dims=hp.voc_fc_dims,
+                    bits=hp.bits,
+                    pad=hp.voc_pad,
+                    upsample_factors=hp.voc_upsample_factors,
+                    feat_dims=hp.num_mels,
+                    compute_dims=hp.voc_compute_dims,
+                    res_out_dims=hp.voc_res_out_dims,
+                    res_blocks=hp.voc_res_blocks,
+                    hop_length=hp.hop_length,
+                    sample_rate=hp.sample_rate,
+                    mode=hp.voc_mode).to(device)
+
+    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
+
+    voc_weights = args.voc_weights if args.voc_weights else paths.voc_latest_weights
+
+    model.load(voc_weights)
+
+    simple_table([('Generation Mode', 'Batched' if batched else 'Unbatched'),
+                  ('Target Samples', target if batched else 'N/A'),
+                  ('Overlap Samples', overlap if batched else 'N/A')])
+
+    if file:
+        file = Path(file).expanduser()
+        gen_from_file(model, file, paths.voc_output, batched, target, overlap)
+    else:
+        _, test_set = get_vocoder_datasets(paths.data, 1, gta)
+        gen_testset(model, test_set, samples, batched, target, overlap, paths.voc_output)
+
+    print('\n\nExiting...\n')
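gen_from_file accepts a precomputed .npy mel but validates shape (n_mels, n_hops) and a value range of roughly [0, 1]. A sketch of producing a compatible array, assuming the usual dB normalisation against min_level_db = -100 from hparams.py (normalise_db here is a hypothetical helper, not part of this commit):

import numpy as np

def normalise_db(mel_db: np.ndarray, min_level_db: float = -100.0) -> np.ndarray:
    # Map [min_level_db, 0] dB into [0, 1], clipping anything outside.
    return np.clip((mel_db - min_level_db) / -min_level_db, 0.0, 1.0)

# Example: a fake (80, 200) dB mel in [-100, 0] passes gen_from_file's checks
mel_db = np.random.uniform(-100.0, 0.0, size=(80, 200)).astype(np.float32)
mel = normalise_db(mel_db)
assert mel.ndim == 2 and mel.shape[0] == 80
assert mel.min() > -0.01 and mel.max() < 1.01
np.save('example_mel.npy', mel)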
models/WaveRNNModel/hparams.py
ADDED
@@ -0,0 +1,101 @@
+
+# CONFIG -----------------------------------------------------------------------------------------------------------#
+
+# Here are the input and output data paths (Note: you can override wav_path in preprocess.py)
+wav_path = 'E:\\智能语音处理系统\\Noise-suppression-and-speech-recognition-systems-master\\WaveRNNModel\\data\\LJSpeech-1.1\\wavs'
+data_path = 'E:\\智能语音处理系统\\Noise-suppression-and-speech-recognition-systems-master\\WaveRNNModel\\data'
+
+# model ids are separate - that way you can use a new tts with an old wavernn and vice versa
+# NB: expect undefined behaviour if models were trained on different DSP settings
+voc_model_id = 'ljspeech_mol'
+tts_model_id = 'ljspeech_lsa_smooth_attention'
+
+# set this to True if you are only interested in WaveRNN
+ignore_tts = False
+
+
+# DSP --------------------------------------------------------------------------------------------------------------#
+
+# Settings for all models
+sample_rate = 22050
+n_fft = 2048
+fft_bins = n_fft // 2 + 1
+num_mels = 80
+hop_length = 275                    # 12.5ms - in line with Tacotron 2 paper
+win_length = 1100                   # 50ms - same reason as above
+fmin = 40
+min_level_db = -100
+ref_level_db = 20
+bits = 9                            # bit depth of signal
+mu_law = True                       # Recommended to suppress noise if using raw bits in hp.voc_mode below
+peak_norm = False                   # Normalise to the peak of each wav file
+
+
+# WAVERNN / VOCODER ------------------------------------------------------------------------------------------------#
+
+
+# Model Hparams
+voc_mode = 'MOL'                    # either 'RAW' (softmax on raw bits) or 'MOL' (sample from mixture of logistics)
+voc_upsample_factors = (5, 5, 11)   # NB - this needs to correctly factorise hop_length
+voc_rnn_dims = 512
+voc_fc_dims = 512
+voc_compute_dims = 128
+voc_res_out_dims = 128
+voc_res_blocks = 10
+
+# Training
+voc_batch_size = 32
+voc_lr = 1e-4
+voc_checkpoint_every = 25_000
+voc_gen_at_checkpoint = 5           # number of samples to generate at each checkpoint
+voc_total_steps = 1_000_000         # Total number of training steps
+voc_test_samples = 50               # How many unseen samples to put aside for testing
+voc_pad = 2                         # this will pad the input so that the resnet can 'see' wider than input length
+voc_seq_len = hop_length * 5        # must be a multiple of hop_length
+voc_clip_grad_norm = 4              # set to None if no gradient clipping needed
+
+# Generating / Synthesizing
+voc_gen_batched = True              # very fast (realtime+) single utterance batched generation
+voc_target = 11_000                 # target number of samples to be generated in each batch entry
+voc_overlap = 550                   # number of samples for crossfading between batches
+
+
+# TACOTRON/TTS -----------------------------------------------------------------------------------------------------#
+
+
+# Model Hparams
+tts_embed_dims = 256                # embedding dimension for the graphemes/phoneme inputs
+tts_encoder_dims = 128
+tts_decoder_dims = 256
+tts_postnet_dims = 128
+tts_encoder_K = 16
+tts_lstm_dims = 512
+tts_postnet_K = 8
+tts_num_highways = 4
+tts_dropout = 0.5
+tts_cleaner_names = ['english_cleaners']
+tts_stop_threshold = -3.4           # Value below which audio generation ends.
+                                    # For example, for a range of [-4, 4], this
+                                    # will terminate the sequence at the first
+                                    # frame that has all values < -3.4
+
+# Training
+
+#tts_schedule = [(7, 1e-3,  10_000, 32),   # progressive training schedule
+#                (5, 1e-4, 100_000, 32),   # (r, lr, step, batch_size)
+#                (2, 1e-4, 180_000, 16),
+#                (2, 1e-4, 350_000, 8)]
+tts_schedule = [(7, 1e-3, 10_000, 32)]     # progressive training schedule
+                #(5, 1e-4, 100_000, 64),   # (r, lr, step, batch_size)
+                #(2, 1e-4, 180_000, 64),
+                #(2, 1e-4, 350_000, 64)]
+
+tts_max_mel_len = 1250              # if you have a couple of extremely long spectrograms you might want to use this
+tts_bin_lengths = True              # bins the spectrogram lengths before sampling in data loader - speeds up training
+tts_clip_grad_norm = 1.0            # clips the gradient norm to prevent explosion - set to None if not needed
+tts_checkpoint_every = 2_000        # checkpoints the model every X steps
+# TODO: tts_phoneme_prob = 0.0      # [0 <-> 1] probability for feeding model phonemes vrs graphemes
+
+
+# ------------------------------------------------------------------------------------------------------------------#
+
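Two arithmetic constraints buried in the comments above are easy to break when editing these values; a quick sanity check restating them:

import numpy as np

hop_length = 275                  # 12.5 ms at 22050 Hz
voc_upsample_factors = (5, 5, 11)
voc_seq_len = hop_length * 5

# The upsample network stretches mel frames by the product of the factors,
# so that product must equal hop_length exactly: 5 * 5 * 11 = 275.
assert int(np.prod(voc_upsample_factors)) == hop_length

# Training sequences are sliced on frame boundaries, so voc_seq_len must be
# a whole number of hops.
assert voc_seq_len % hop_length == 0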
models/WaveRNNModel/loss_plot.py
ADDED
@@ -0,0 +1,70 @@
+import matplotlib.pyplot as plt
+import re
+import numpy as np
+
+# Set a font that can render Chinese characters
+plt.rcParams['font.sans-serif'] = ['SimHei']  # Simplified Chinese (adjust for your system)
+plt.rcParams['axes.unicode_minus'] = False    # fix minus-sign rendering
+
+# Read log data from a txt file
+def parse_log_file(file_path):
+    epochs = []
+    losses = []
+
+    with open(file_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            # Use a regex to match valid lines
+            match = re.search(
+                r'Epoch:\s+(\d+)/*.*Loss:\s+(\d+\.\d+)',
+                line.strip()
+            )
+            if match:
+                epoch = int(match.group(1))
+                loss = float(match.group(2))
+                epochs.append(epoch)
+                losses.append(loss)
+
+    return epochs, losses
+
+# File path
+log_file = "E:\\智能语音处理系统\\Noise-suppression-and-speech-recognition-systems-master\\WaveRNNModel\\checkpoints\\ljspeech_lsa_smooth_attention.tacotron\\log_test.txt"
+
+# Extract the data
+try:
+    epochs_read, losses = parse_log_file(log_file)
+    print(epochs_read)
+    epochs = np.arange(len(epochs_read))
+    print(epochs)
+except FileNotFoundError:
+    print(f"错误:文件 {log_file} 不存在,请检查路径!")
+    exit()
+except Exception as e:
+    print(f"解析文件时出错: {str(e)}")
+    exit()
+
+# Plot the curve
+plt.figure(figsize=(10, 6))
+plt.plot(epochs, losses, 'b-', linewidth=2, label='训练损失')
+
+# Polish the chart
+plt.title('训练损失随轮次变化曲线', fontsize=14)
+plt.xlabel('训练轮次 (Epoch)', fontsize=12)
+plt.ylabel('损失值 (Loss)', fontsize=12)
+#plt.xticks(range(1, len(epochs)))  # force a tick for every epoch
+plt.grid(True, linestyle='--', alpha=0.7)
+plt.legend()
+
+# Annotate the minimum loss
+min_loss = min(losses)
+min_idx = losses.index(min_loss)
+plt.annotate(
+    f'最低损失: {min_loss:.3f}',
+    xy=(epochs[min_idx], min_loss),
+    xytext=(epochs[min_idx]-3, min_loss+0.1),
+    arrowprops=dict(arrowstyle='->', color='red'),
+    fontsize=10,
+    color='red'
+)
+
+plt.tight_layout()
+plt.show()
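parse_log_file only keeps lines that carry both an "Epoch: N/..." field and a "Loss: X.Y" field. The exact training-log format comes from the training scripts (not shown in this view); a hypothetical excerpt in that style, showing which lines the regex would keep:

import re

pattern = r'Epoch:\s+(\d+)/*.*Loss:\s+(\d+\.\d+)'

# Hypothetical log lines; only the first two match the pattern.
lines = [
    '| Epoch: 3/100 (250/250) | Loss: 0.712 | 2.1 steps/s | Step: 1k |',
    '| Epoch: 4/100 (250/250) | Loss: 0.645 | 2.2 steps/s | Step: 1k |',
    'Restoring from latest checkpoint...',
]
for line in lines:
    m = re.search(pattern, line.strip())
    if m:
        print(int(m.group(1)), float(m.group(2)))  # -> epoch, loss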
models/WaveRNNModel/model_outputs/ljspeech_lsa_smooth_attention.tacotron.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed4c5cc52ae740080b0bcea155133d430f01dcb1a2d0097ff9aaef9ee698886a
+size 45040845

models/WaveRNNModel/model_outputs/ljspeech_mol.wavernn.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78a9cff91b58f6163f4cc9e9e878961829f07c7bf40e71778f0bf5447a4900fc
+size 15610590
models/WaveRNNModel/models/__init__.py
ADDED
(empty file)
models/WaveRNNModel/models/deepmind_version.py
ADDED
@@ -0,0 +1,176 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from utils.display import *
+from utils.dsp import *
+import numpy as np
+
+class WaveRNN(nn.Module):
+    def __init__(self, hidden_size=896, quantisation=256):
+        super(WaveRNN, self).__init__()
+
+        self.hidden_size = hidden_size
+        self.split_size = hidden_size // 2
+
+        # The main matmul
+        self.R = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
+
+        # Output fc layers
+        self.O1 = nn.Linear(self.split_size, self.split_size)
+        self.O2 = nn.Linear(self.split_size, quantisation)
+        self.O3 = nn.Linear(self.split_size, self.split_size)
+        self.O4 = nn.Linear(self.split_size, quantisation)
+
+        # Input fc layers
+        self.I_coarse = nn.Linear(2, 3 * self.split_size, bias=False)
+        self.I_fine = nn.Linear(3, 3 * self.split_size, bias=False)
+
+        # biases for the gates
+        self.bias_u = nn.Parameter(torch.zeros(self.hidden_size))
+        self.bias_r = nn.Parameter(torch.zeros(self.hidden_size))
+        self.bias_e = nn.Parameter(torch.zeros(self.hidden_size))
+
+        # display num params
+        self.num_params()
+
+
+    def forward(self, prev_y, prev_hidden, current_coarse):
+
+        # Main matmul - the projection is split 3 ways
+        R_hidden = self.R(prev_hidden)
+        R_u, R_r, R_e = torch.split(R_hidden, self.hidden_size, dim=1)
+
+        # Project the prev input
+        coarse_input_proj = self.I_coarse(prev_y)
+        I_coarse_u, I_coarse_r, I_coarse_e = \
+            torch.split(coarse_input_proj, self.split_size, dim=1)
+
+        # Project the prev input and current coarse sample
+        fine_input = torch.cat([prev_y, current_coarse], dim=1)
+        fine_input_proj = self.I_fine(fine_input)
+        I_fine_u, I_fine_r, I_fine_e = \
+            torch.split(fine_input_proj, self.split_size, dim=1)
+
+        # concatenate for the gates
+        I_u = torch.cat([I_coarse_u, I_fine_u], dim=1)
+        I_r = torch.cat([I_coarse_r, I_fine_r], dim=1)
+        I_e = torch.cat([I_coarse_e, I_fine_e], dim=1)
+
+        # Compute all gates for coarse and fine
+        u = F.sigmoid(R_u + I_u + self.bias_u)
+        r = F.sigmoid(R_r + I_r + self.bias_r)
+        e = F.tanh(r * R_e + I_e + self.bias_e)
+        hidden = u * prev_hidden + (1. - u) * e
+
+        # Split the hidden state
+        hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1)
+
+        # Compute outputs
+        out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
+        out_fine = self.O4(F.relu(self.O3(hidden_fine)))
+
+        return out_coarse, out_fine, hidden
+
+
+    def generate(self, seq_len):
+        device = next(self.parameters()).device  # use same device as parameters
+
+        with torch.no_grad():
+
+            # First split up the biases for the gates
+            b_coarse_u, b_fine_u = torch.split(self.bias_u, self.split_size)
+            b_coarse_r, b_fine_r = torch.split(self.bias_r, self.split_size)
+            b_coarse_e, b_fine_e = torch.split(self.bias_e, self.split_size)
+
+            # Lists for the two output seqs
+            c_outputs, f_outputs = [], []
+
+            # Some initial inputs
+            out_coarse = torch.tensor([0], dtype=torch.long, device=device)
+            out_fine = torch.tensor([0], dtype=torch.long, device=device)
+
+            # We'll need a hidden state
+            hidden = self.get_initial_hidden()
+
+            # Need a clock for display
+            start = time.time()
+
+            # Loop for generation
+            for i in range(seq_len):
+
+                # Split into two hidden states
+                hidden_coarse, hidden_fine = \
+                    torch.split(hidden, self.split_size, dim=1)
+
+                # Scale and concat previous predictions
+                out_coarse = out_coarse.unsqueeze(0).float() / 127.5 - 1.
+                out_fine = out_fine.unsqueeze(0).float() / 127.5 - 1.
+                prev_outputs = torch.cat([out_coarse, out_fine], dim=1)
+
+                # Project input
+                coarse_input_proj = self.I_coarse(prev_outputs)
+                I_coarse_u, I_coarse_r, I_coarse_e = \
+                    torch.split(coarse_input_proj, self.split_size, dim=1)
+
+                # Project hidden state and split 6 ways
+                R_hidden = self.R(hidden)
+                R_coarse_u, R_fine_u, \
+                R_coarse_r, R_fine_r, \
+                R_coarse_e, R_fine_e = torch.split(R_hidden, self.split_size, dim=1)
+
+                # Compute the coarse gates
+                u = F.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u)
+                r = F.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r)
+                e = F.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e)
+                hidden_coarse = u * hidden_coarse + (1. - u) * e
+
+                # Compute the coarse output
+                out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
+                posterior = F.softmax(out_coarse, dim=1)
+                distrib = torch.distributions.Categorical(posterior)
+                out_coarse = distrib.sample()
+                c_outputs.append(out_coarse)
+
+                # Project the [prev outputs and predicted coarse sample]
+                coarse_pred = out_coarse.float() / 127.5 - 1.
+                fine_input = torch.cat([prev_outputs, coarse_pred.unsqueeze(0)], dim=1)
+                fine_input_proj = self.I_fine(fine_input)
+                I_fine_u, I_fine_r, I_fine_e = \
+                    torch.split(fine_input_proj, self.split_size, dim=1)
+
+                # Compute the fine gates
+                u = F.sigmoid(R_fine_u + I_fine_u + b_fine_u)
+                r = F.sigmoid(R_fine_r + I_fine_r + b_fine_r)
+                e = F.tanh(r * R_fine_e + I_fine_e + b_fine_e)
+                hidden_fine = u * hidden_fine + (1. - u) * e
+
+                # Compute the fine output
+                out_fine = self.O4(F.relu(self.O3(hidden_fine)))
+                posterior = F.softmax(out_fine, dim=1)
+                distrib = torch.distributions.Categorical(posterior)
+                out_fine = distrib.sample()
+                f_outputs.append(out_fine)
+
+                # Put the hidden state back together
+                hidden = torch.cat([hidden_coarse, hidden_fine], dim=1)
+
+                # Display progress
+                speed = (i + 1) / (time.time() - start)
+                stream('Gen: %i/%i -- Speed: %i', (i + 1, seq_len, speed))
+
+        coarse = torch.stack(c_outputs).squeeze(1).cpu().data.numpy()
+        fine = torch.stack(f_outputs).squeeze(1).cpu().data.numpy()
+        output = combine_signal(coarse, fine)
+
+        return output, coarse, fine
+
+    def get_initial_hidden(self, batch_size=1):
+        device = next(self.parameters()).device  # use same device as parameters
+        return torch.zeros(batch_size, self.hidden_size, device=device)
+
+    def num_params(self, print_out=True):
+        parameters = filter(lambda p: p.requires_grad, self.parameters())
+        parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
+        if print_out:
+            print('Trainable Parameters: %.3f million' % parameters)
+        return parameters
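The dual-softmax design above predicts each 16-bit sample as two 8-bit halves (quantisation=256 per half), which is why samples are rescaled by / 127.5 - 1 before being fed back into the network. A worked sketch of the split and recombination, assuming the straightforward coarse * 256 + fine encoding (combine_signal itself lives in utils/dsp.py, outside this view):

import numpy as np

# A 16-bit unsigned sample value in [0, 65535]
sample = 40000

# Split into an 8-bit coarse (high byte) and fine (low byte) part
coarse, fine = sample // 256, sample % 256   # -> 156, 64

# Each half is what the two 256-way softmaxes classify over;
# recombining recovers the original sample exactly.
assert coarse * 256 + fine == sample

# Before re-entering the network, each half is scaled to [-1, 1]:
scaled = np.array([coarse, fine]) / 127.5 - 1.0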
models/WaveRNNModel/models/fatchord_version.py
ADDED
@@ -0,0 +1,435 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from utils.distribution import sample_from_discretized_mix_logistic
+from utils.display import *
+from utils.dsp import *
+import os
+import numpy as np
+from pathlib import Path
+from typing import Union
+
+
+class ResBlock(nn.Module):
+    def __init__(self, dims):
+        super().__init__()
+        self.conv1 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
+        self.conv2 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
+        self.batch_norm1 = nn.BatchNorm1d(dims)
+        self.batch_norm2 = nn.BatchNorm1d(dims)
+
+    def forward(self, x):
+        residual = x
+        x = self.conv1(x)
+        x = self.batch_norm1(x)
+        x = F.relu(x)
+        x = self.conv2(x)
+        x = self.batch_norm2(x)
+        return x + residual
+
+
+class MelResNet(nn.Module):
+    def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad):
+        super().__init__()
+        k_size = pad * 2 + 1
+        self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False)
+        self.batch_norm = nn.BatchNorm1d(compute_dims)
+        self.layers = nn.ModuleList()
+        for i in range(res_blocks):
+            self.layers.append(ResBlock(compute_dims))
+        self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1)
+
+    def forward(self, x):
+        x = self.conv_in(x)
+        x = self.batch_norm(x)
+        x = F.relu(x)
+        for f in self.layers: x = f(x)
+        x = self.conv_out(x)
+        return x
+
+
+class Stretch2d(nn.Module):
+    def __init__(self, x_scale, y_scale):
+        super().__init__()
+        self.x_scale = x_scale
+        self.y_scale = y_scale
+
+    def forward(self, x):
+        b, c, h, w = x.size()
+        x = x.unsqueeze(-1).unsqueeze(3)
+        x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale)
+        return x.view(b, c, h * self.y_scale, w * self.x_scale)
+
+
+class UpsampleNetwork(nn.Module):
+    def __init__(self, feat_dims, upsample_scales, compute_dims,
+                 res_blocks, res_out_dims, pad):
+        super().__init__()
+        total_scale = np.cumproduct(upsample_scales)[-1]
+        self.indent = pad * total_scale
+        self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims, pad)
+        self.resnet_stretch = Stretch2d(total_scale, 1)
+        self.up_layers = nn.ModuleList()
+        for scale in upsample_scales:
+            k_size = (1, scale * 2 + 1)
+            padding = (0, scale)
+            stretch = Stretch2d(scale, 1)
+            conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False)
+            conv.weight.data.fill_(1. / k_size[1])
+            self.up_layers.append(stretch)
+            self.up_layers.append(conv)
+
+    def forward(self, m):
+        aux = self.resnet(m).unsqueeze(1)
+        aux = self.resnet_stretch(aux)
+        aux = aux.squeeze(1)
+        m = m.unsqueeze(1)
+        for f in self.up_layers: m = f(m)
+        m = m.squeeze(1)[:, :, self.indent:-self.indent]
+        return m.transpose(1, 2), aux.transpose(1, 2)
+
+
+class WaveRNN(nn.Module):
+    def __init__(self, rnn_dims, fc_dims, bits, pad, upsample_factors,
+                 feat_dims, compute_dims, res_out_dims, res_blocks,
+                 hop_length, sample_rate, mode='RAW'):
+        super().__init__()
+        self.mode = mode
+        self.pad = pad
+        if self.mode == 'RAW':
+            self.n_classes = 2 ** bits
+        elif self.mode == 'MOL':
+            self.n_classes = 30
+        else:
+            raise RuntimeError("Unknown model mode value - ", self.mode)
+
+        # List of rnns to call `flatten_parameters()` on
+        self._to_flatten = []
+
+        self.rnn_dims = rnn_dims
+        self.aux_dims = res_out_dims // 4
+        self.hop_length = hop_length
+        self.sample_rate = sample_rate
+
+        self.upsample = UpsampleNetwork(feat_dims, upsample_factors, compute_dims, res_blocks, res_out_dims, pad)
+        self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims)
+
+        self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True)
+        self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True)
+        self._to_flatten += [self.rnn1, self.rnn2]
+
+        self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)
+        self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)
+        self.fc3 = nn.Linear(fc_dims, self.n_classes)
+
+        self.register_buffer('step', torch.zeros(1, dtype=torch.long))
+        self.num_params()
+
+        # Avoid fragmentation of RNN parameters and associated warning
+        self._flatten_parameters()
+
+    def forward(self, x, mels):
+        device = next(self.parameters()).device  # use same device as parameters
+
+        # Although we `_flatten_parameters()` on init, when using DataParallel
+        # the model gets replicated, making it no longer guaranteed that the
+        # weights are contiguous in GPU memory. Hence, we must call it again
+        self._flatten_parameters()
+
+        self.step += 1
+        bsize = x.size(0)
+        h1 = torch.zeros(1, bsize, self.rnn_dims, device=device)
+        h2 = torch.zeros(1, bsize, self.rnn_dims, device=device)
+        mels, aux = self.upsample(mels)
+
+        aux_idx = [self.aux_dims * i for i in range(5)]
+        a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
+        a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
+        a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
+        a4 = aux[:, :, aux_idx[3]:aux_idx[4]]
+
+        x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2)
+        x = self.I(x)
+        res = x
+        x, _ = self.rnn1(x, h1)
+
+        x = x + res
+        res = x
+        x = torch.cat([x, a2], dim=2)
+        x, _ = self.rnn2(x, h2)
+
+        x = x + res
+        x = torch.cat([x, a3], dim=2)
+        x = F.relu(self.fc1(x))
+
+        x = torch.cat([x, a4], dim=2)
+        x = F.relu(self.fc2(x))
+        return self.fc3(x)
+
+    def generate(self, mels, save_path: Union[str, Path], batched, target, overlap, mu_law):
+        self.eval()
+
+        device = next(self.parameters()).device  # use same device as parameters
+
+        mu_law = mu_law if self.mode == 'RAW' else False
+
+        output = []
+        start = time.time()
+        rnn1 = self.get_gru_cell(self.rnn1)
+        rnn2 = self.get_gru_cell(self.rnn2)
+
+        with torch.no_grad():
+
+            mels = torch.as_tensor(mels, device=device)
+            wave_len = (mels.size(-1) - 1) * self.hop_length
+            mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
+            mels, aux = self.upsample(mels.transpose(1, 2))
+
+            if batched:
+                mels = self.fold_with_overlap(mels, target, overlap)
+                aux = self.fold_with_overlap(aux, target, overlap)
+
+            b_size, seq_len, _ = mels.size()
+
+            h1 = torch.zeros(b_size, self.rnn_dims, device=device)
+            h2 = torch.zeros(b_size, self.rnn_dims, device=device)
+            x = torch.zeros(b_size, 1, device=device)
+
+            d = self.aux_dims
+            aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]
+
+            for i in range(seq_len):
+
+                m_t = mels[:, i, :]
+
+                a1_t, a2_t, a3_t, a4_t = \
+                    (a[:, i, :] for a in aux_split)
+
+                x = torch.cat([x, m_t, a1_t], dim=1)
+                x = self.I(x)
+                h1 = rnn1(x, h1)
+
+                x = x + h1
+                inp = torch.cat([x, a2_t], dim=1)
+                h2 = rnn2(inp, h2)
+
+                x = x + h2
+                x = torch.cat([x, a3_t], dim=1)
+                x = F.relu(self.fc1(x))
+
+                x = torch.cat([x, a4_t], dim=1)
+                x = F.relu(self.fc2(x))
+
+                logits = self.fc3(x)
+
+                if self.mode == 'MOL':
+                    sample = sample_from_discretized_mix_logistic(logits.unsqueeze(0).transpose(1, 2))
+                    output.append(sample.view(-1))
+                    # x = torch.FloatTensor([[sample]]).cuda()
+                    x = sample.transpose(0, 1)
+
+                elif self.mode == 'RAW':
+                    posterior = F.softmax(logits, dim=1)
+                    distrib = torch.distributions.Categorical(posterior)
+
+                    sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
+                    output.append(sample)
+                    x = sample.unsqueeze(-1)
+                else:
+                    raise RuntimeError("Unknown model mode value - ", self.mode)
+
+                if i % 100 == 0: self.gen_display(i, seq_len, b_size, start)
+
+        output = torch.stack(output).transpose(0, 1)
+        output = output.cpu().numpy()
+        output = output.astype(np.float64)
+
+        if mu_law:
+            output = decode_mu_law(output, self.n_classes, False)
+
+        if batched:
+            output = self.xfade_and_unfold(output, target, overlap)
+        else:
+            output = output[0]
+
+        # Fade-out at the end to avoid signal cutting out suddenly
+        fade_out = np.linspace(1, 0, 20 * self.hop_length)
|
| 257 |
+
output = output[:wave_len]
|
| 258 |
+
output[-20 * self.hop_length:] *= fade_out
|
| 259 |
+
|
| 260 |
+
save_wav(output, save_path)
|
| 261 |
+
|
| 262 |
+
self.train()
|
| 263 |
+
|
| 264 |
+
return output
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def gen_display(self, i, seq_len, b_size, start):
|
| 268 |
+
gen_rate = (i + 1) / (time.time() - start) * b_size / 1000
|
| 269 |
+
pbar = progbar(i, seq_len)
|
| 270 |
+
msg = f'| {pbar} {i*b_size}/{seq_len*b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | '
|
| 271 |
+
stream(msg)
|
| 272 |
+
|
| 273 |
+
def get_gru_cell(self, gru):
|
| 274 |
+
gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size)
|
| 275 |
+
gru_cell.weight_hh.data = gru.weight_hh_l0.data
|
| 276 |
+
gru_cell.weight_ih.data = gru.weight_ih_l0.data
|
| 277 |
+
gru_cell.bias_hh.data = gru.bias_hh_l0.data
|
| 278 |
+
gru_cell.bias_ih.data = gru.bias_ih_l0.data
|
| 279 |
+
return gru_cell
|
| 280 |
+
|
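# Aside: get_gru_cell() below copies the layer-0 weights of the batch-first
# nn.GRU into an nn.GRUCell so generate() can step one sample at a time. A
# quick sanity check (dimensions illustrative) that the unrolled cell
# reproduces the full GRU:
import torch
import torch.nn as nn

gru = nn.GRU(8, 16, batch_first=True)
cell = nn.GRUCell(8, 16)
cell.weight_ih.data = gru.weight_ih_l0.data
cell.weight_hh.data = gru.weight_hh_l0.data
cell.bias_ih.data = gru.bias_ih_l0.data
cell.bias_hh.data = gru.bias_hh_l0.data

x = torch.randn(2, 5, 8)                        # (batch, time, features)
out, _ = gru(x)
h, steps = torch.zeros(2, 16), []
for t in range(5):
    h = cell(x[:, t], h)
    steps.append(h)
print(torch.allclose(out, torch.stack(steps, dim=1), atol=1e-6))  # True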
| 281 |
+
def pad_tensor(self, x, pad, side='both'):
|
| 282 |
+
# NB - this is just a quick method I need right now
|
| 283 |
+
# i.e., it won't generalise to other shapes/dims
|
| 284 |
+
b, t, c = x.size()
|
| 285 |
+
total = t + 2 * pad if side == 'both' else t + pad
|
| 286 |
+
padded = torch.zeros(b, total, c, device=x.device)
|
| 287 |
+
if side == 'before' or side == 'both':
|
| 288 |
+
padded[:, pad:pad + t, :] = x
|
| 289 |
+
elif side == 'after':
|
| 290 |
+
padded[:, :t, :] = x
|
| 291 |
+
return padded
|
| 292 |
+
|
| 293 |
+
def fold_with_overlap(self, x, target, overlap):
|
| 294 |
+
|
| 295 |
+
''' Fold the tensor with overlap for quick batched inference.
|
| 296 |
+
Overlap will be used for crossfading in xfade_and_unfold()
|
| 297 |
+
|
| 298 |
+
Args:
|
| 299 |
+
x (tensor) : Upsampled conditioning features.
|
| 300 |
+
shape=(1, timesteps, features)
|
| 301 |
+
target (int) : Target timesteps for each index of batch
|
| 302 |
+
overlap (int) : Timesteps for both xfade and rnn warmup
|
| 303 |
+
|
| 304 |
+
Return:
|
| 305 |
+
(tensor) : shape=(num_folds, target + 2 * overlap, features)
|
| 306 |
+
|
| 307 |
+
Details:
|
| 308 |
+
x = [[h1, h2, ... hn]]
|
| 309 |
+
|
| 310 |
+
Where each h is a vector of conditioning features
|
| 311 |
+
|
| 312 |
+
Eg: target=2, overlap=1 with x.size(1)=10
|
| 313 |
+
|
| 314 |
+
folded = [[h1, h2, h3, h4],
|
| 315 |
+
[h4, h5, h6, h7],
|
| 316 |
+
[h7, h8, h9, h10]]
|
| 317 |
+
'''
|
| 318 |
+
|
| 319 |
+
_, total_len, features = x.size()
|
| 320 |
+
|
| 321 |
+
# Calculate variables needed
|
| 322 |
+
num_folds = (total_len - overlap) // (target + overlap)
|
| 323 |
+
extended_len = num_folds * (overlap + target) + overlap
|
| 324 |
+
remaining = total_len - extended_len
|
| 325 |
+
|
| 326 |
+
# Pad if some time steps poking out
|
| 327 |
+
if remaining != 0:
|
| 328 |
+
num_folds += 1
|
| 329 |
+
padding = target + 2 * overlap - remaining
|
| 330 |
+
x = self.pad_tensor(x, padding, side='after')
|
| 331 |
+
|
| 332 |
+
folded = torch.zeros(num_folds, target + 2 * overlap, features, device=x.device)
|
| 333 |
+
|
| 334 |
+
# Get the values for the folded tensor
|
| 335 |
+
for i in range(num_folds):
|
| 336 |
+
start = i * (target + overlap)
|
| 337 |
+
end = start + target + 2 * overlap
|
| 338 |
+
folded[i] = x[:, start:end, :]
|
| 339 |
+
|
| 340 |
+
return folded
|
| 341 |
+
|
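# Aside: walking through the docstring example (target=2, overlap=1,
# x.size(1)=10) with plain arithmetic -- adjacent folds share exactly
# `overlap` frames:
total_len, target, overlap = 10, 2, 1
num_folds = (total_len - overlap) // (target + overlap)        # 3
assert num_folds * (overlap + target) + overlap == total_len   # nothing to pad
for i in range(num_folds):
    start = i * (target + overlap)
    print(list(range(start, start + target + 2 * overlap)))
# [0, 1, 2, 3] / [3, 4, 5, 6] / [6, 7, 8, 9], i.e. h1..h4, h4..h7, h7..h10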
| 342 |
+
def xfade_and_unfold(self, y, target, overlap):
|
| 343 |
+
|
| 344 |
+
''' Applies a crossfade and unfolds into a 1d array.
|
| 345 |
+
|
| 346 |
+
Args:
|
| 347 |
+
y (ndarry) : Batched sequences of audio samples
|
| 348 |
+
shape=(num_folds, target + 2 * overlap)
|
| 349 |
+
dtype=np.float64
|
| 350 |
+
overlap (int) : Timesteps for both xfade and rnn warmup
|
| 351 |
+
|
| 352 |
+
Return:
|
| 353 |
+
(ndarry) : audio samples in a 1d array
|
| 354 |
+
shape=(total_len)
|
| 355 |
+
dtype=np.float64
|
| 356 |
+
|
| 357 |
+
Details:
|
| 358 |
+
y = [[seq1],
|
| 359 |
+
[seq2],
|
| 360 |
+
[seq3]]
|
| 361 |
+
|
| 362 |
+
Apply a gain envelope at both ends of the sequences
|
| 363 |
+
|
| 364 |
+
y = [[seq1_in, seq1_target, seq1_out],
|
| 365 |
+
[seq2_in, seq2_target, seq2_out],
|
| 366 |
+
[seq3_in, seq3_target, seq3_out]]
|
| 367 |
+
|
| 368 |
+
Stagger and add up the groups of samples:
|
| 369 |
+
|
| 370 |
+
[seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
|
| 371 |
+
|
| 372 |
+
'''
|
| 373 |
+
|
| 374 |
+
num_folds, length = y.shape
|
| 375 |
+
target = length - 2 * overlap
|
| 376 |
+
total_len = num_folds * (target + overlap) + overlap
|
| 377 |
+
|
| 378 |
+
# Need some silence for the rnn warmup
|
| 379 |
+
silence_len = overlap // 2
|
| 380 |
+
fade_len = overlap - silence_len
|
| 381 |
+
silence = np.zeros((silence_len), dtype=np.float64)
|
| 382 |
+
linear = np.ones((silence_len), dtype=np.float64)
|
| 383 |
+
|
| 384 |
+
# Equal power crossfade
|
| 385 |
+
t = np.linspace(-1, 1, fade_len, dtype=np.float64)
|
| 386 |
+
fade_in = np.sqrt(0.5 * (1 + t))
|
| 387 |
+
fade_out = np.sqrt(0.5 * (1 - t))
|
| 388 |
+
|
| 389 |
+
# Concat the silence to the fades
|
| 390 |
+
fade_in = np.concatenate([silence, fade_in])
|
| 391 |
+
fade_out = np.concatenate([linear, fade_out])
|
| 392 |
+
|
| 393 |
+
# Apply the gain to the overlap samples
|
| 394 |
+
y[:, :overlap] *= fade_in
|
| 395 |
+
y[:, -overlap:] *= fade_out
|
| 396 |
+
|
| 397 |
+
unfolded = np.zeros((total_len), dtype=np.float64)
|
| 398 |
+
|
| 399 |
+
# Loop to add up all the samples
|
| 400 |
+
for i in range(num_folds):
|
| 401 |
+
start = i * (target + overlap)
|
| 402 |
+
end = start + target + 2 * overlap
|
| 403 |
+
unfolded[start:end] += y[i]
|
| 404 |
+
|
| 405 |
+
return unfolded
|
| 406 |
+
|
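# Aside: the fades above form an equal-power crossfade -- at every overlapping
# sample fade_in**2 + fade_out**2 == 1, so the summed signal keeps roughly
# constant energy across each seam. A quick check of the identity:
import numpy as np

t = np.linspace(-1, 1, 8, dtype=np.float64)
fade_in = np.sqrt(0.5 * (1 + t))
fade_out = np.sqrt(0.5 * (1 - t))
print(np.allclose(fade_in ** 2 + fade_out ** 2, 1.0))  # True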
| 407 |
+
def get_step(self):
|
| 408 |
+
return self.step.data.item()
|
| 409 |
+
|
| 410 |
+
def log(self, path, msg):
|
| 411 |
+
with open(path, 'a') as f:
|
| 412 |
+
print(msg, file=f)
|
| 413 |
+
|
| 414 |
+
def load(self, path: Union[str, Path]):
|
| 415 |
+
# Use device of model params as location for loaded state
|
| 416 |
+
device = next(self.parameters()).device
|
| 417 |
+
self.load_state_dict(torch.load(path, map_location=device), strict=False)
|
| 418 |
+
|
| 419 |
+
def save(self, path: Union[str, Path]):
|
| 420 |
+
# No optimizer argument because saving a model should not include data
|
| 421 |
+
# only relevant in the training process - it should only be properties
|
| 422 |
+
# of the model itself. Let caller take care of saving optimizer state.
|
| 423 |
+
torch.save(self.state_dict(), path)
|
| 424 |
+
|
| 425 |
+
def num_params(self, print_out=True):
|
| 426 |
+
parameters = filter(lambda p: p.requires_grad, self.parameters())
|
| 427 |
+
parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
|
| 428 |
+
if print_out:
|
| 429 |
+
print('Trainable Parameters: %.3fM' % parameters)
|
| 430 |
+
return parameters
|
| 431 |
+
|
| 432 |
+
def _flatten_parameters(self):
|
| 433 |
+
"""Calls `flatten_parameters` on all the rnns used by the WaveRNN. Used
|
| 434 |
+
to improve efficiency and avoid PyTorch yelling at us."""
|
| 435 |
+
[m.flatten_parameters() for m in self._to_flatten]
|
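For orientation, a minimal instantiation of the WaveRNN class above. The dimensions below are illustrative placeholders rather than the project's hparams (quick_start.py, further down in this commit, shows the configuration used with the pretrained weights); note that the product of `upsample_factors` has to equal `hop_length` for the upsampled conditioning to line up with the audio:

    import torch

    model = WaveRNN(rnn_dims=512, fc_dims=512, bits=9, pad=2,
                    upsample_factors=(5, 5, 11),   # 5 * 5 * 11 == hop_length
                    feat_dims=80, compute_dims=128, res_out_dims=128,
                    res_blocks=10, hop_length=275, sample_rate=22050, mode='RAW')

    mel = torch.randn(1, 80, 40)                   # (batch, n_mels, frames)
    wav = model.generate(mel, 'sample.wav', batched=True,
                         target=11_000, overlap=550, mu_law=True)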
models/WaveRNNModel/models/tacotron.py
ADDED
|
@@ -0,0 +1,469 @@
|
| 1 |
+
import os
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Union
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class HighwayNetwork(nn.Module):
|
| 11 |
+
def __init__(self, size):
|
| 12 |
+
super().__init__()
|
| 13 |
+
self.W1 = nn.Linear(size, size)
|
| 14 |
+
self.W2 = nn.Linear(size, size)
|
| 15 |
+
self.W1.bias.data.fill_(0.)
|
| 16 |
+
|
| 17 |
+
def forward(self, x):
|
| 18 |
+
x1 = self.W1(x)
|
| 19 |
+
x2 = self.W2(x)
|
| 20 |
+
g = torch.sigmoid(x2)
|
| 21 |
+
y = g * F.relu(x1) + (1. - g) * x
|
| 22 |
+
return y
|
| 23 |
+
|
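# Aside: the highway layer above computes y = g * relu(W1 x) + (1 - g) * x
# with gate g = sigmoid(W2 x), so each unit mixes a transformed copy of the
# input with the input itself. A toy check that a gate forced shut passes the
# input straight through (weights set by hand purely for illustration):
import torch

hw = HighwayNetwork(4)
hw.W2.weight.data.zero_()
hw.W2.bias.data.fill_(-20.)                 # g ~ 0  ->  pure pass-through
x = torch.randn(2, 4)
print(torch.allclose(hw(x), x, atol=1e-6))  # True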
| 24 |
+
|
| 25 |
+
class Encoder(nn.Module):
|
| 26 |
+
def __init__(self, embed_dims, num_chars, cbhg_channels, K, num_highways, dropout):
|
| 27 |
+
super().__init__()
|
| 28 |
+
self.embedding = nn.Embedding(num_chars, embed_dims)
|
| 29 |
+
self.pre_net = PreNet(embed_dims)
|
| 30 |
+
self.cbhg = CBHG(K=K, in_channels=cbhg_channels, channels=cbhg_channels,
|
| 31 |
+
proj_channels=[cbhg_channels, cbhg_channels],
|
| 32 |
+
num_highways=num_highways)
|
| 33 |
+
|
| 34 |
+
def forward(self, x):
|
| 35 |
+
x = self.embedding(x)
|
| 36 |
+
x = self.pre_net(x)
|
| 37 |
+
x.transpose_(1, 2)
|
| 38 |
+
x = self.cbhg(x)
|
| 39 |
+
return x
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class BatchNormConv(nn.Module):
|
| 43 |
+
def __init__(self, in_channels, out_channels, kernel, relu=True):
|
| 44 |
+
super().__init__()
|
| 45 |
+
self.conv = nn.Conv1d(in_channels, out_channels, kernel, stride=1, padding=kernel // 2, bias=False)
|
| 46 |
+
self.bnorm = nn.BatchNorm1d(out_channels)
|
| 47 |
+
self.relu = relu
|
| 48 |
+
|
| 49 |
+
def forward(self, x):
|
| 50 |
+
x = self.conv(x)
|
| 51 |
+
x = F.relu(x) if self.relu else x
|
| 52 |
+
return self.bnorm(x)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class CBHG(nn.Module):
|
| 56 |
+
def __init__(self, K, in_channels, channels, proj_channels, num_highways):
|
| 57 |
+
super().__init__()
|
| 58 |
+
|
| 59 |
+
# List of all rnns to call `flatten_parameters()` on
|
| 60 |
+
self._to_flatten = []
|
| 61 |
+
|
| 62 |
+
self.bank_kernels = [i for i in range(1, K + 1)]
|
| 63 |
+
self.conv1d_bank = nn.ModuleList()
|
| 64 |
+
for k in self.bank_kernels:
|
| 65 |
+
conv = BatchNormConv(in_channels, channels, k)
|
| 66 |
+
self.conv1d_bank.append(conv)
|
| 67 |
+
|
| 68 |
+
self.maxpool = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
|
| 69 |
+
|
| 70 |
+
self.conv_project1 = BatchNormConv(len(self.bank_kernels) * channels, proj_channels[0], 3)
|
| 71 |
+
self.conv_project2 = BatchNormConv(proj_channels[0], proj_channels[1], 3, relu=False)
|
| 72 |
+
|
| 73 |
+
# Fix the highway input if necessary
|
| 74 |
+
if proj_channels[-1] != channels:
|
| 75 |
+
self.highway_mismatch = True
|
| 76 |
+
self.pre_highway = nn.Linear(proj_channels[-1], channels, bias=False)
|
| 77 |
+
else:
|
| 78 |
+
self.highway_mismatch = False
|
| 79 |
+
|
| 80 |
+
self.highways = nn.ModuleList()
|
| 81 |
+
for i in range(num_highways):
|
| 82 |
+
hn = HighwayNetwork(channels)
|
| 83 |
+
self.highways.append(hn)
|
| 84 |
+
|
| 85 |
+
self.rnn = nn.GRU(channels, channels, batch_first=True, bidirectional=True)
|
| 86 |
+
self._to_flatten.append(self.rnn)
|
| 87 |
+
|
| 88 |
+
# Avoid fragmentation of RNN parameters and associated warning
|
| 89 |
+
self._flatten_parameters()
|
| 90 |
+
|
| 91 |
+
def forward(self, x):
|
| 92 |
+
# Although we `_flatten_parameters()` on init, when using DataParallel
|
| 93 |
+
# the model gets replicated, making it no longer guaranteed that the
|
| 94 |
+
# weights are contiguous in GPU memory. Hence, we must call it again
|
| 95 |
+
self._flatten_parameters()
|
| 96 |
+
|
| 97 |
+
# Save these for later
|
| 98 |
+
residual = x
|
| 99 |
+
seq_len = x.size(-1)
|
| 100 |
+
conv_bank = []
|
| 101 |
+
|
| 102 |
+
# Convolution Bank
|
| 103 |
+
for conv in self.conv1d_bank:
|
| 104 |
+
c = conv(x) # Convolution
|
| 105 |
+
conv_bank.append(c[:, :, :seq_len])
|
| 106 |
+
|
| 107 |
+
# Stack along the channel axis
|
| 108 |
+
conv_bank = torch.cat(conv_bank, dim=1)
|
| 109 |
+
|
| 110 |
+
# dump the last padding to fit residual
|
| 111 |
+
x = self.maxpool(conv_bank)[:, :, :seq_len]
|
| 112 |
+
|
| 113 |
+
# Conv1d projections
|
| 114 |
+
x = self.conv_project1(x)
|
| 115 |
+
x = self.conv_project2(x)
|
| 116 |
+
|
| 117 |
+
# Residual Connect
|
| 118 |
+
x = x + residual
|
| 119 |
+
|
| 120 |
+
# Through the highways
|
| 121 |
+
x = x.transpose(1, 2)
|
| 122 |
+
if self.highway_mismatch:
|
| 123 |
+
x = self.pre_highway(x)
|
| 124 |
+
for h in self.highways: x = h(x)
|
| 125 |
+
|
| 126 |
+
# And then the RNN
|
| 127 |
+
x, _ = self.rnn(x)
|
| 128 |
+
return x
|
| 129 |
+
|
| 130 |
+
def _flatten_parameters(self):
|
| 131 |
+
"""Calls `flatten_parameters` on all the rnns used by the WaveRNN. Used
|
| 132 |
+
to improve efficiency and avoid PyTorch yelling at us."""
|
| 133 |
+
[m.flatten_parameters() for m in self._to_flatten]
|
| 134 |
+
|
| 135 |
+
class PreNet(nn.Module):
|
| 136 |
+
def __init__(self, in_dims, fc1_dims=256, fc2_dims=128, dropout=0.5):
|
| 137 |
+
super().__init__()
|
| 138 |
+
self.fc1 = nn.Linear(in_dims, fc1_dims)
|
| 139 |
+
self.fc2 = nn.Linear(fc1_dims, fc2_dims)
|
| 140 |
+
self.p = dropout
|
| 141 |
+
|
| 142 |
+
def forward(self, x):
|
| 143 |
+
x = self.fc1(x)
|
| 144 |
+
x = F.relu(x)
|
| 145 |
+
x = F.dropout(x, self.p, training=self.training)
|
| 146 |
+
x = self.fc2(x)
|
| 147 |
+
x = F.relu(x)
|
| 148 |
+
x = F.dropout(x, self.p, training=self.training)
|
| 149 |
+
return x
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
class Attention(nn.Module):
|
| 153 |
+
def __init__(self, attn_dims):
|
| 154 |
+
super().__init__()
|
| 155 |
+
self.W = nn.Linear(attn_dims, attn_dims, bias=False)
|
| 156 |
+
self.v = nn.Linear(attn_dims, 1, bias=False)
|
| 157 |
+
|
| 158 |
+
def forward(self, encoder_seq_proj, query, t):
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
# Transform the query vector
|
| 162 |
+
query_proj = self.W(query).unsqueeze(1)
|
| 163 |
+
|
| 164 |
+
# Compute the scores
|
| 165 |
+
u = self.v(torch.tanh(encoder_seq_proj + query_proj))
|
| 166 |
+
scores = F.softmax(u, dim=1)
|
| 167 |
+
|
| 168 |
+
return scores.transpose(1, 2)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
class LSA(nn.Module):
|
| 172 |
+
def __init__(self, attn_dim, kernel_size=31, filters=32):
|
| 173 |
+
super().__init__()
|
| 174 |
+
self.conv = nn.Conv1d(2, filters, padding=(kernel_size - 1) // 2, kernel_size=kernel_size, bias=False)
|
| 175 |
+
self.L = nn.Linear(filters, attn_dim, bias=True)
|
| 176 |
+
self.W = nn.Linear(attn_dim, attn_dim, bias=True)
|
| 177 |
+
self.v = nn.Linear(attn_dim, 1, bias=False)
|
| 178 |
+
self.cumulative = None
|
| 179 |
+
self.attention = None
|
| 180 |
+
|
| 181 |
+
def init_attention(self, encoder_seq_proj):
|
| 182 |
+
device = next(self.parameters()).device # use same device as parameters
|
| 183 |
+
b, t, c = encoder_seq_proj.size()
|
| 184 |
+
self.cumulative = torch.zeros(b, t, device=device)
|
| 185 |
+
self.attention = torch.zeros(b, t, device=device)
|
| 186 |
+
|
| 187 |
+
def forward(self, encoder_seq_proj, query, t):
|
| 188 |
+
|
| 189 |
+
if t == 0: self.init_attention(encoder_seq_proj)
|
| 190 |
+
|
| 191 |
+
processed_query = self.W(query).unsqueeze(1)
|
| 192 |
+
|
| 193 |
+
location = torch.cat([self.cumulative.unsqueeze(1), self.attention.unsqueeze(1)], dim=1)
|
| 194 |
+
processed_loc = self.L(self.conv(location).transpose(1, 2))
|
| 195 |
+
|
| 196 |
+
u = self.v(torch.tanh(processed_query + encoder_seq_proj + processed_loc))
|
| 197 |
+
u = u.squeeze(-1)
|
| 198 |
+
|
| 199 |
+
# Smooth Attention
|
| 200 |
+
scores = torch.sigmoid(u) / torch.sigmoid(u).sum(dim=1, keepdim=True)
|
| 201 |
+
# scores = F.softmax(u, dim=1)
|
| 202 |
+
self.attention = scores
|
| 203 |
+
self.cumulative += self.attention
|
| 204 |
+
|
| 205 |
+
return scores.unsqueeze(-1).transpose(1, 2)
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
class Decoder(nn.Module):
|
| 209 |
+
# Class variable because its value doesn't change between instances,
|
| 210 |
+
# yet it ought to be scoped to the class because it's a property of a Decoder
|
| 211 |
+
max_r = 20
|
| 212 |
+
def __init__(self, n_mels, decoder_dims, lstm_dims):
|
| 213 |
+
super().__init__()
|
| 214 |
+
self.register_buffer('r', torch.tensor(1, dtype=torch.int))
|
| 215 |
+
self.n_mels = n_mels
|
| 216 |
+
self.prenet = PreNet(n_mels)
|
| 217 |
+
self.attn_net = LSA(decoder_dims)
|
| 218 |
+
self.attn_rnn = nn.GRUCell(decoder_dims + decoder_dims // 2, decoder_dims)
|
| 219 |
+
self.rnn_input = nn.Linear(2 * decoder_dims, lstm_dims)
|
| 220 |
+
self.res_rnn1 = nn.LSTMCell(lstm_dims, lstm_dims)
|
| 221 |
+
self.res_rnn2 = nn.LSTMCell(lstm_dims, lstm_dims)
|
| 222 |
+
self.mel_proj = nn.Linear(lstm_dims, n_mels * self.max_r, bias=False)
|
| 223 |
+
|
| 224 |
+
def zoneout(self, prev, current, p=0.1):
|
| 225 |
+
device = next(self.parameters()).device # Use same device as parameters
|
| 226 |
+
mask = torch.zeros(prev.size(), device=device).bernoulli_(p)
|
| 227 |
+
return prev * mask + current * (1 - mask)
|
| 228 |
+
|
| 229 |
+
def forward(self, encoder_seq, encoder_seq_proj, prenet_in,
|
| 230 |
+
hidden_states, cell_states, context_vec, t):
|
| 231 |
+
|
| 232 |
+
# Need this for reshaping mels
|
| 233 |
+
batch_size = encoder_seq.size(0)
|
| 234 |
+
|
| 235 |
+
# Unpack the hidden and cell states
|
| 236 |
+
attn_hidden, rnn1_hidden, rnn2_hidden = hidden_states
|
| 237 |
+
rnn1_cell, rnn2_cell = cell_states
|
| 238 |
+
|
| 239 |
+
# PreNet for the Attention RNN
|
| 240 |
+
prenet_out = self.prenet(prenet_in)
|
| 241 |
+
|
| 242 |
+
# Compute the Attention RNN hidden state
|
| 243 |
+
attn_rnn_in = torch.cat([context_vec, prenet_out], dim=-1)
|
| 244 |
+
attn_hidden = self.attn_rnn(attn_rnn_in.squeeze(1), attn_hidden)
|
| 245 |
+
|
| 246 |
+
# Compute the attention scores
|
| 247 |
+
scores = self.attn_net(encoder_seq_proj, attn_hidden, t)
|
| 248 |
+
|
| 249 |
+
# Dot product to create the context vector
|
| 250 |
+
context_vec = scores @ encoder_seq
|
| 251 |
+
context_vec = context_vec.squeeze(1)
|
| 252 |
+
|
| 253 |
+
# Concat Attention RNN output w. Context Vector & project
|
| 254 |
+
x = torch.cat([context_vec, attn_hidden], dim=1)
|
| 255 |
+
x = self.rnn_input(x)
|
| 256 |
+
|
| 257 |
+
# Compute first Residual RNN
|
| 258 |
+
rnn1_hidden_next, rnn1_cell = self.res_rnn1(x, (rnn1_hidden, rnn1_cell))
|
| 259 |
+
if self.training:
|
| 260 |
+
rnn1_hidden = self.zoneout(rnn1_hidden, rnn1_hidden_next)
|
| 261 |
+
else:
|
| 262 |
+
rnn1_hidden = rnn1_hidden_next
|
| 263 |
+
x = x + rnn1_hidden
|
| 264 |
+
|
| 265 |
+
# Compute second Residual RNN
|
| 266 |
+
rnn2_hidden_next, rnn2_cell = self.res_rnn2(x, (rnn2_hidden, rnn2_cell))
|
| 267 |
+
if self.training:
|
| 268 |
+
rnn2_hidden = self.zoneout(rnn2_hidden, rnn2_hidden_next)
|
| 269 |
+
else:
|
| 270 |
+
rnn2_hidden = rnn2_hidden_next
|
| 271 |
+
x = x + rnn2_hidden
|
| 272 |
+
|
| 273 |
+
# Project Mels
|
| 274 |
+
mels = self.mel_proj(x)
|
| 275 |
+
mels = mels.view(batch_size, self.n_mels, self.max_r)[:, :, :self.r]
|
| 276 |
+
hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
|
| 277 |
+
cell_states = (rnn1_cell, rnn2_cell)
|
| 278 |
+
|
| 279 |
+
return mels, scores, hidden_states, cell_states, context_vec
|
| 280 |
+
|
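# Aside: zoneout, used on the two residual LSTMs above, keeps each hidden unit
# at its previous value with probability p instead of updating it -- a
# dropout-like regulariser for recurrent state. The same masking on toy
# tensors (shapes illustrative):
import torch

prev = torch.zeros(1, 6)
current = torch.ones(1, 6)
mask = torch.zeros(prev.size()).bernoulli_(0.1)   # 1 -> keep previous value
print(prev * mask + current * (1 - mask))         # ~10% of units stay at 0.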
| 281 |
+
|
| 282 |
+
class Tacotron(nn.Module):
|
| 283 |
+
def __init__(self, embed_dims, num_chars, encoder_dims, decoder_dims, n_mels, fft_bins, postnet_dims,
|
| 284 |
+
encoder_K, lstm_dims, postnet_K, num_highways, dropout, stop_threshold):
|
| 285 |
+
super().__init__()
|
| 286 |
+
self.n_mels = n_mels
|
| 287 |
+
self.lstm_dims = lstm_dims
|
| 288 |
+
self.decoder_dims = decoder_dims
|
| 289 |
+
self.encoder = Encoder(embed_dims, num_chars, encoder_dims,
|
| 290 |
+
encoder_K, num_highways, dropout)
|
| 291 |
+
self.encoder_proj = nn.Linear(decoder_dims, decoder_dims, bias=False)
|
| 292 |
+
self.decoder = Decoder(n_mels, decoder_dims, lstm_dims)
|
| 293 |
+
self.postnet = CBHG(postnet_K, n_mels, postnet_dims, [256, 80], num_highways)
|
| 294 |
+
self.post_proj = nn.Linear(postnet_dims * 2, fft_bins, bias=False)
|
| 295 |
+
|
| 296 |
+
self.init_model()
|
| 297 |
+
self.num_params()
|
| 298 |
+
|
| 299 |
+
self.register_buffer('step', torch.zeros(1, dtype=torch.long))
|
| 300 |
+
self.register_buffer('stop_threshold', torch.tensor(stop_threshold, dtype=torch.float32))
|
| 301 |
+
|
| 302 |
+
@property
|
| 303 |
+
def r(self):
|
| 304 |
+
return self.decoder.r.item()
|
| 305 |
+
|
| 306 |
+
@r.setter
|
| 307 |
+
def r(self, value):
|
| 308 |
+
self.decoder.r = self.decoder.r.new_tensor(value, requires_grad=False)
|
| 309 |
+
|
| 310 |
+
def forward(self, x, m, generate_gta=False):
|
| 311 |
+
device = next(self.parameters()).device # use same device as parameters
|
| 312 |
+
|
| 313 |
+
self.step += 1
|
| 314 |
+
|
| 315 |
+
if generate_gta:
|
| 316 |
+
self.eval()
|
| 317 |
+
else:
|
| 318 |
+
self.train()
|
| 319 |
+
|
| 320 |
+
batch_size, _, steps = m.size()
|
| 321 |
+
|
| 322 |
+
# Initialise all hidden states and pack into tuple
|
| 323 |
+
attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
|
| 324 |
+
rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 325 |
+
rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 326 |
+
hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
|
| 327 |
+
|
| 328 |
+
# Initialise all lstm cell states and pack into tuple
|
| 329 |
+
rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 330 |
+
rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 331 |
+
cell_states = (rnn1_cell, rnn2_cell)
|
| 332 |
+
|
| 333 |
+
# <GO> Frame for start of decoder loop
|
| 334 |
+
go_frame = torch.zeros(batch_size, self.n_mels, device=device)
|
| 335 |
+
|
| 336 |
+
# Need an initial context vector
|
| 337 |
+
context_vec = torch.zeros(batch_size, self.decoder_dims, device=device)
|
| 338 |
+
|
| 339 |
+
# Project the encoder outputs to avoid
|
| 340 |
+
# unnecessary matmuls in the decoder loop
|
| 341 |
+
encoder_seq = self.encoder(x)
|
| 342 |
+
encoder_seq_proj = self.encoder_proj(encoder_seq)
|
| 343 |
+
|
| 344 |
+
# Need a couple of lists for outputs
|
| 345 |
+
mel_outputs, attn_scores = [], []
|
| 346 |
+
|
| 347 |
+
# Run the decoder loop
|
| 348 |
+
for t in range(0, steps, self.r):
|
| 349 |
+
prenet_in = m[:, :, t - 1] if t > 0 else go_frame
|
| 350 |
+
mel_frames, scores, hidden_states, cell_states, context_vec = \
|
| 351 |
+
self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
|
| 352 |
+
hidden_states, cell_states, context_vec, t)
|
| 353 |
+
mel_outputs.append(mel_frames)
|
| 354 |
+
attn_scores.append(scores)
|
| 355 |
+
|
| 356 |
+
# Concat the mel outputs into sequence
|
| 357 |
+
mel_outputs = torch.cat(mel_outputs, dim=2)
|
| 358 |
+
|
| 359 |
+
# Post-Process for Linear Spectrograms
|
| 360 |
+
postnet_out = self.postnet(mel_outputs)
|
| 361 |
+
linear = self.post_proj(postnet_out)
|
| 362 |
+
linear = linear.transpose(1, 2)
|
| 363 |
+
|
| 364 |
+
# For easy visualisation
|
| 365 |
+
attn_scores = torch.cat(attn_scores, 1)
|
| 366 |
+
# attn_scores = attn_scores.cpu().data.numpy()
|
| 367 |
+
|
| 368 |
+
return mel_outputs, linear, attn_scores
|
| 369 |
+
|
| 370 |
+
def generate(self, x, steps=2000):
|
| 371 |
+
self.eval()
|
| 372 |
+
device = next(self.parameters()).device # use same device as parameters
|
| 373 |
+
|
| 374 |
+
batch_size = 1
|
| 375 |
+
x = torch.as_tensor(x, dtype=torch.long, device=device).unsqueeze(0)
|
| 376 |
+
|
| 377 |
+
# Need to initialise all hidden states and pack into tuple for tidiness
|
| 378 |
+
attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
|
| 379 |
+
rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 380 |
+
rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 381 |
+
hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
|
| 382 |
+
|
| 383 |
+
# Need to initialise all lstm cell states and pack into tuple for tidiness
|
| 384 |
+
rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 385 |
+
rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 386 |
+
cell_states = (rnn1_cell, rnn2_cell)
|
| 387 |
+
|
| 388 |
+
# Need a <GO> Frame for start of decoder loop
|
| 389 |
+
go_frame = torch.zeros(batch_size, self.n_mels, device=device)
|
| 390 |
+
|
| 391 |
+
# Need an initial context vector
|
| 392 |
+
context_vec = torch.zeros(batch_size, self.decoder_dims, device=device)
|
| 393 |
+
|
| 394 |
+
# Project the encoder outputs to avoid
|
| 395 |
+
# unnecessary matmuls in the decoder loop
|
| 396 |
+
encoder_seq = self.encoder(x)
|
| 397 |
+
encoder_seq_proj = self.encoder_proj(encoder_seq)
|
| 398 |
+
|
| 399 |
+
# Need a couple of lists for outputs
|
| 400 |
+
mel_outputs, attn_scores = [], []
|
| 401 |
+
|
| 402 |
+
# Run the decoder loop
|
| 403 |
+
for t in range(0, steps, self.r):
|
| 404 |
+
prenet_in = mel_outputs[-1][:, :, -1] if t > 0 else go_frame
|
| 405 |
+
mel_frames, scores, hidden_states, cell_states, context_vec = \
|
| 406 |
+
self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
|
| 407 |
+
hidden_states, cell_states, context_vec, t)
|
| 408 |
+
mel_outputs.append(mel_frames)
|
| 409 |
+
attn_scores.append(scores)
|
| 410 |
+
# Stop the loop if silent frames present
|
| 411 |
+
if (mel_frames < self.stop_threshold).all() and t > 10: break
|
| 412 |
+
|
| 413 |
+
# Concat the mel outputs into sequence
|
| 414 |
+
mel_outputs = torch.cat(mel_outputs, dim=2)
|
| 415 |
+
|
| 416 |
+
# Post-Process for Linear Spectrograms
|
| 417 |
+
postnet_out = self.postnet(mel_outputs)
|
| 418 |
+
linear = self.post_proj(postnet_out)
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
linear = linear.transpose(1, 2)[0].cpu().data.numpy()
|
| 422 |
+
mel_outputs = mel_outputs[0].cpu().data.numpy()
|
| 423 |
+
|
| 424 |
+
# For easy visualisation
|
| 425 |
+
attn_scores = torch.cat(attn_scores, 1)
|
| 426 |
+
attn_scores = attn_scores.cpu().data.numpy()[0]
|
| 427 |
+
|
| 428 |
+
self.train()
|
| 429 |
+
|
| 430 |
+
return mel_outputs, linear, attn_scores
|
| 431 |
+
|
| 432 |
+
def init_model(self):
|
| 433 |
+
for p in self.parameters():
|
| 434 |
+
if p.dim() > 1: nn.init.xavier_uniform_(p)
|
| 435 |
+
|
| 436 |
+
def get_step(self):
|
| 437 |
+
return self.step.data.item()
|
| 438 |
+
|
| 439 |
+
def reset_step(self):
|
| 440 |
+
# assignment to parameters or buffers is overloaded, updates internal dict entry
|
| 441 |
+
self.step = self.step.data.new_tensor(1)
|
| 442 |
+
|
| 443 |
+
def log(self, path, msg):
|
| 444 |
+
with open(path, 'a') as f:
|
| 445 |
+
print(msg, file=f)
|
| 446 |
+
|
| 447 |
+
def load(self, path: Union[str, Path]):
|
| 448 |
+
# Use device of model params as location for loaded state
|
| 449 |
+
device = next(self.parameters()).device
|
| 450 |
+
state_dict = torch.load(path, map_location=device)
|
| 451 |
+
|
| 452 |
+
# Backwards compatibility with old saved models
|
| 453 |
+
if 'r' in state_dict and 'decoder.r' not in state_dict:
|
| 454 |
+
self.r = state_dict['r']
|
| 455 |
+
|
| 456 |
+
self.load_state_dict(state_dict, strict=False)
|
| 457 |
+
|
| 458 |
+
def save(self, path: Union[str, Path]):
|
| 459 |
+
# No optimizer argument because saving a model should not include data
|
| 460 |
+
# only relevant in the training process - it should only be properties
|
| 461 |
+
# of the model itself. Let caller take care of saving optimizer state.
|
| 462 |
+
torch.save(self.state_dict(), path)
|
| 463 |
+
|
| 464 |
+
def num_params(self, print_out=True):
|
| 465 |
+
parameters = filter(lambda p: p.requires_grad, self.parameters())
|
| 466 |
+
parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
|
| 467 |
+
if print_out:
|
| 468 |
+
print('Trainable Parameters: %.3fM' % parameters)
|
| 469 |
+
return parameters
|
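One detail of the decoder above worth spelling out is the reduction factor r: `mel_proj` always produces `max_r` frames' worth of output per decoder step, the slice `[:, :, :self.r]` keeps the first r of them, and the decoding loops advance with `range(0, steps, self.r)`, so each iteration emits r mel frames. A toy illustration of the reshape-and-slice (the values here are placeholders, with r chosen arbitrarily):

    import torch

    batch_size, n_mels, max_r, r = 2, 80, 20, 2
    proj = torch.randn(batch_size, n_mels * max_r)        # one decoder step
    frames = proj.view(batch_size, n_mels, max_r)[:, :, :r]
    print(frames.shape)                                   # torch.Size([2, 80, 2])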
models/WaveRNNModel/notebooks/NB1 - Fit a Sine Wave.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/WaveRNNModel/notebooks/NB2 - Fit a Short Sample.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/WaveRNNModel/notebooks/NB3 - Fit a 30min Sample.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/WaveRNNModel/notebooks/NB4a - Alternative Model (Preprocessing).ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/WaveRNNModel/notebooks/NB4b - Alternative Model (Training).ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/WaveRNNModel/notebooks/Pruning - Scratchpad.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/WaveRNNModel/notebooks/__init__.py
ADDED
|
File without changes
|
models/WaveRNNModel/notebooks/models/wavernn.py
ADDED
|
@@ -0,0 +1,172 @@
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
import time         # used by generate() for the speed readout
import numpy as np  # used by num_params()
# NB: generate() also expects stream() (notebooks/utils/display.py) and
# combine_signal() (notebooks/utils/dsp.py) to be in scope.
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class WaveRNN(nn.Module) :
|
| 7 |
+
def __init__(self, hidden_size=896, quantisation=256) :
|
| 8 |
+
super(WaveRNN, self).__init__()
|
| 9 |
+
|
| 10 |
+
self.hidden_size = hidden_size
|
| 11 |
+
self.split_size = hidden_size // 2
|
| 12 |
+
|
| 13 |
+
# The main matmul
|
| 14 |
+
self.R = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
|
| 15 |
+
|
| 16 |
+
# Output fc layers
|
| 17 |
+
self.O1 = nn.Linear(self.split_size, self.split_size)
|
| 18 |
+
self.O2 = nn.Linear(self.split_size, quantisation)
|
| 19 |
+
self.O3 = nn.Linear(self.split_size, self.split_size)
|
| 20 |
+
self.O4 = nn.Linear(self.split_size, quantisation)
|
| 21 |
+
|
| 22 |
+
# Input fc layers
|
| 23 |
+
self.I_coarse = nn.Linear(2, 3 * self.split_size, bias=False)
|
| 24 |
+
self.I_fine = nn.Linear(3, 3 * self.split_size, bias=False)
|
| 25 |
+
|
| 26 |
+
# biases for the gates
|
| 27 |
+
self.bias_u = nn.Parameter(torch.zeros(self.hidden_size))
|
| 28 |
+
self.bias_r = nn.Parameter(torch.zeros(self.hidden_size))
|
| 29 |
+
self.bias_e = nn.Parameter(torch.zeros(self.hidden_size))
|
| 30 |
+
|
| 31 |
+
# display num params
|
| 32 |
+
self.num_params()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def forward(self, prev_y, prev_hidden, current_coarse) :
|
| 36 |
+
|
| 37 |
+
# Main matmul - the projection is split 3 ways
|
| 38 |
+
R_hidden = self.R(prev_hidden)
|
| 39 |
+
R_u, R_r, R_e, = torch.split(R_hidden, self.hidden_size, dim=1)
|
| 40 |
+
|
| 41 |
+
# Project the prev input
|
| 42 |
+
coarse_input_proj = self.I_coarse(prev_y)
|
| 43 |
+
I_coarse_u, I_coarse_r, I_coarse_e = \
|
| 44 |
+
torch.split(coarse_input_proj, self.split_size, dim=1)
|
| 45 |
+
|
| 46 |
+
# Project the prev input and current coarse sample
|
| 47 |
+
fine_input = torch.cat([prev_y, current_coarse], dim=1)
|
| 48 |
+
fine_input_proj = self.I_fine(fine_input)
|
| 49 |
+
I_fine_u, I_fine_r, I_fine_e = \
|
| 50 |
+
torch.split(fine_input_proj, self.split_size, dim=1)
|
| 51 |
+
|
| 52 |
+
# concatenate for the gates
|
| 53 |
+
I_u = torch.cat([I_coarse_u, I_fine_u], dim=1)
|
| 54 |
+
I_r = torch.cat([I_coarse_r, I_fine_r], dim=1)
|
| 55 |
+
I_e = torch.cat([I_coarse_e, I_fine_e], dim=1)
|
| 56 |
+
|
| 57 |
+
# Compute all gates for coarse and fine
|
| 58 |
+
u = torch.sigmoid(R_u + I_u + self.bias_u)
|
| 59 |
+
r = torch.sigmoid(R_r + I_r + self.bias_r)
|
| 60 |
+
e = torch.tanh(r * R_e + I_e + self.bias_e)
|
| 61 |
+
hidden = u * prev_hidden + (1. - u) * e
|
| 62 |
+
|
| 63 |
+
# Split the hidden state
|
| 64 |
+
hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1)
|
| 65 |
+
|
| 66 |
+
# Compute outputs
|
| 67 |
+
out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
|
| 68 |
+
out_fine = self.O4(F.relu(self.O3(hidden_fine)))
|
| 69 |
+
|
| 70 |
+
return out_coarse, out_fine, hidden
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def generate(self, seq_len) :
|
| 74 |
+
|
| 75 |
+
with torch.no_grad() :
|
| 76 |
+
|
| 77 |
+
# First split up the biases for the gates
|
| 78 |
+
b_coarse_u, b_fine_u = torch.split(self.bias_u, self.split_size)
|
| 79 |
+
b_coarse_r, b_fine_r = torch.split(self.bias_r, self.split_size)
|
| 80 |
+
b_coarse_e, b_fine_e = torch.split(self.bias_e, self.split_size)
|
| 81 |
+
|
| 82 |
+
# Lists for the two output seqs
|
| 83 |
+
c_outputs, f_outputs = [], []
|
| 84 |
+
|
| 85 |
+
# Some initial inputs
|
| 86 |
+
out_coarse = torch.LongTensor([0]).cuda()
|
| 87 |
+
out_fine = torch.LongTensor([0]).cuda()
|
| 88 |
+
|
| 89 |
+
# We'll need a hidden state
|
| 90 |
+
hidden = self.init_hidden()
|
| 91 |
+
|
| 92 |
+
# Need a clock for display
|
| 93 |
+
start = time.time()
|
| 94 |
+
|
| 95 |
+
# Loop for generation
|
| 96 |
+
for i in range(seq_len) :
|
| 97 |
+
|
| 98 |
+
# Split into two hidden states
|
| 99 |
+
hidden_coarse, hidden_fine = \
|
| 100 |
+
torch.split(hidden, self.split_size, dim=1)
|
| 101 |
+
|
| 102 |
+
# Scale and concat previous predictions
|
| 103 |
+
out_coarse = out_coarse.unsqueeze(0).float() / 127.5 - 1.
|
| 104 |
+
out_fine = out_fine.unsqueeze(0).float() / 127.5 - 1.
|
| 105 |
+
prev_outputs = torch.cat([out_coarse, out_fine], dim=1)
|
| 106 |
+
|
| 107 |
+
# Project input
|
| 108 |
+
coarse_input_proj = self.I_coarse(prev_outputs)
|
| 109 |
+
I_coarse_u, I_coarse_r, I_coarse_e = \
|
| 110 |
+
torch.split(coarse_input_proj, self.split_size, dim=1)
|
| 111 |
+
|
| 112 |
+
# Project hidden state and split 6 ways
|
| 113 |
+
R_hidden = self.R(hidden)
|
| 114 |
+
R_coarse_u , R_fine_u, \
|
| 115 |
+
R_coarse_r, R_fine_r, \
|
| 116 |
+
R_coarse_e, R_fine_e = torch.split(R_hidden, self.split_size, dim=1)
|
| 117 |
+
|
| 118 |
+
# Compute the coarse gates
|
| 119 |
+
u = torch.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u)
|
| 120 |
+
r = torch.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r)
|
| 121 |
+
e = torch.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e)
|
| 122 |
+
hidden_coarse = u * hidden_coarse + (1. - u) * e
|
| 123 |
+
|
| 124 |
+
# Compute the coarse output
|
| 125 |
+
out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
|
| 126 |
+
posterior = F.softmax(out_coarse, dim=1)
|
| 127 |
+
distrib = torch.distributions.Categorical(posterior)
|
| 128 |
+
out_coarse = distrib.sample()
|
| 129 |
+
c_outputs.append(out_coarse)
|
| 130 |
+
|
| 131 |
+
# Project the [prev outputs and predicted coarse sample]
|
| 132 |
+
coarse_pred = out_coarse.float() / 127.5 - 1.
|
| 133 |
+
fine_input = torch.cat([prev_outputs, coarse_pred.unsqueeze(0)], dim=1)
|
| 134 |
+
fine_input_proj = self.I_fine(fine_input)
|
| 135 |
+
I_fine_u, I_fine_r, I_fine_e = \
|
| 136 |
+
torch.split(fine_input_proj, self.split_size, dim=1)
|
| 137 |
+
|
| 138 |
+
# Compute the fine gates
|
| 139 |
+
u = torch.sigmoid(R_fine_u + I_fine_u + b_fine_u)
|
| 140 |
+
r = torch.sigmoid(R_fine_r + I_fine_r + b_fine_r)
|
| 141 |
+
e = torch.tanh(r * R_fine_e + I_fine_e + b_fine_e)
|
| 142 |
+
hidden_fine = u * hidden_fine + (1. - u) * e
|
| 143 |
+
|
| 144 |
+
# Compute the fine output
|
| 145 |
+
out_fine = self.O4(F.relu(self.O3(hidden_fine)))
|
| 146 |
+
posterior = F.softmax(out_fine, dim=1)
|
| 147 |
+
distrib = torch.distributions.Categorical(posterior)
|
| 148 |
+
out_fine = distrib.sample()
|
| 149 |
+
f_outputs.append(out_fine)
|
| 150 |
+
|
| 151 |
+
# Put the hidden state back together
|
| 152 |
+
hidden = torch.cat([hidden_coarse, hidden_fine], dim=1)
|
| 153 |
+
|
| 154 |
+
# Display progress
|
| 155 |
+
speed = (i + 1) / (time.time() - start)
|
| 156 |
+
stream('Gen: %i/%i -- Speed: %i', (i + 1, seq_len, speed))
|
| 157 |
+
|
| 158 |
+
coarse = torch.stack(c_outputs).squeeze(1).cpu().data.numpy()
|
| 159 |
+
fine = torch.stack(f_outputs).squeeze(1).cpu().data.numpy()
|
| 160 |
+
output = combine_signal(coarse, fine)
|
| 161 |
+
|
| 162 |
+
return output, coarse, fine
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def init_hidden(self, batch_size=1) :
|
| 166 |
+
return torch.zeros(batch_size, self.hidden_size).cuda()
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def num_params(self) :
|
| 170 |
+
parameters = filter(lambda p: p.requires_grad, self.parameters())
|
| 171 |
+
parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
|
| 172 |
+
print('Trainable Parameters: %.3f million' % parameters)
|
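A minimal smoke test of the forward pass above, using dummy inputs (the shapes follow the layer definitions: `prev_y` carries the previous coarse and fine samples scaled to [-1, 1], and `current_coarse` conditions the fine half of the hidden state):

    import torch

    model = WaveRNN()                        # hidden_size=896, quantisation=256
    prev_y = torch.zeros(1, 2)               # previous (coarse, fine), scaled
    current_coarse = torch.zeros(1, 1)       # current coarse sample, scaled
    hidden = torch.zeros(1, 896)
    out_coarse, out_fine, hidden = model(prev_y, hidden, current_coarse)
    print(out_coarse.shape, out_fine.shape)  # two (1, 256) logit tensors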
models/WaveRNNModel/notebooks/outputs/nb1/model_output.wav
ADDED
|
Binary file (80 kB). View file
|
|
|
models/WaveRNNModel/notebooks/outputs/nb2/3k_steps.wav
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eeadf150c5e62325421d700609eec88c106ba86330a011d6b54542a8764a0728
|
| 3 |
+
size 220544
|
models/WaveRNNModel/notebooks/outputs/nb3/12k_steps.wav
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b18a69000c2f2852d1b9f5104a7f9bc3421882c5ff2847f7185b081c9feab0b8
|
| 3 |
+
size 882044
|
models/WaveRNNModel/notebooks/utils/__init__.py
ADDED
|
File without changes
|
models/WaveRNNModel/notebooks/utils/display.py
ADDED
|
@@ -0,0 +1,40 @@
|
| 1 |
+
import matplotlib.pyplot as plt
|
| 2 |
+
import time, sys, math
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
def stream(string, variables) :
|
| 6 |
+
sys.stdout.write(f'\r{string}' % variables)
|
| 7 |
+
|
| 8 |
+
def num_params(model) :
|
| 9 |
+
parameters = filter(lambda p: p.requires_grad, model.parameters())
|
| 10 |
+
parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
|
| 11 |
+
print('Trainable Parameters: %.3f million' % parameters)
|
| 12 |
+
|
| 13 |
+
def time_since(started) :
|
| 14 |
+
elapsed = time.time() - started
|
| 15 |
+
m = int(elapsed // 60)
|
| 16 |
+
s = int(elapsed % 60)
|
| 17 |
+
if m >= 60 :
|
| 18 |
+
h = int(m // 60)
|
| 19 |
+
m = m % 60
|
| 20 |
+
return f'{h}h {m}m {s}s'
|
| 21 |
+
else :
|
| 22 |
+
return f'{m}m {s}s'
|
| 23 |
+
|
| 24 |
+
def plot(array) :
|
| 25 |
+
fig = plt.figure(figsize=(30, 5))
|
| 26 |
+
ax = fig.add_subplot(111)
|
| 27 |
+
ax.xaxis.label.set_color('grey')
|
| 28 |
+
ax.yaxis.label.set_color('grey')
|
| 29 |
+
ax.xaxis.label.set_fontsize(23)
|
| 30 |
+
ax.yaxis.label.set_fontsize(23)
|
| 31 |
+
ax.tick_params(axis='x', colors='grey', labelsize=23)
|
| 32 |
+
ax.tick_params(axis='y', colors='grey', labelsize=23)
|
| 33 |
+
plt.plot(array)
|
| 34 |
+
|
| 35 |
+
def plot_spec(M) :
|
| 36 |
+
M = np.flip(M, axis=0)
|
| 37 |
+
plt.figure(figsize=(18,4))
|
| 38 |
+
plt.imshow(M, interpolation='nearest', aspect='auto')
|
| 39 |
+
plt.show()
|
| 40 |
+
|
models/WaveRNNModel/notebooks/utils/dsp.py
ADDED
|
@@ -0,0 +1,70 @@
|
| 1 |
+
import numpy as np
|
| 2 |
+
import librosa, math
|
| 3 |
+
|
| 4 |
+
sample_rate = 22050
|
| 5 |
+
n_fft = 2048
|
| 6 |
+
fft_bins = n_fft // 2 + 1
|
| 7 |
+
num_mels = 80
|
| 8 |
+
hop_length = int(sample_rate * 0.0125) # 12.5ms
|
| 9 |
+
win_length = int(sample_rate * 0.05) # 50ms
|
| 10 |
+
fmin = 40
|
| 11 |
+
min_level_db = -100
|
| 12 |
+
ref_level_db = 20
|
| 13 |
+
|
| 14 |
+
def load_wav(filename, encode=True) :
|
| 15 |
+
x = librosa.load(filename, sr=sample_rate)[0]
|
| 16 |
+
if encode : x = encode_16bits(x)
|
| 17 |
+
return x
|
| 18 |
+
|
| 19 |
+
def save_wav(y, filename) :
|
| 20 |
+
if y.dtype != 'int16' :
|
| 21 |
+
y = encode_16bits(y)
|
| 22 |
+
librosa.output.write_wav(filename, y.astype(np.int16), sample_rate)
|
| 23 |
+
|
| 24 |
+
def split_signal(x) :
|
| 25 |
+
unsigned = x + 2**15
|
| 26 |
+
coarse = unsigned // 256
|
| 27 |
+
fine = unsigned % 256
|
| 28 |
+
return coarse, fine
|
| 29 |
+
|
| 30 |
+
def combine_signal(coarse, fine) :
|
| 31 |
+
return coarse * 256 + fine - 2**15
|
| 32 |
+
|
| 33 |
+
def encode_16bits(x) :
|
| 34 |
+
return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)
|
| 35 |
+
|
| 36 |
+
mel_basis = None
|
| 37 |
+
|
| 38 |
+
def linear_to_mel(spectrogram):
|
| 39 |
+
global mel_basis
|
| 40 |
+
if mel_basis is None:
|
| 41 |
+
mel_basis = build_mel_basis()
|
| 42 |
+
return np.dot(mel_basis, spectrogram)
|
| 43 |
+
|
| 44 |
+
def build_mel_basis():
|
| 45 |
+
return librosa.filters.mel(sample_rate, n_fft, n_mels=num_mels, fmin=fmin)
|
| 46 |
+
|
| 47 |
+
def normalize(S):
|
| 48 |
+
return np.clip((S - min_level_db) / -min_level_db, 0, 1)
|
| 49 |
+
|
| 50 |
+
def denormalize(S):
|
| 51 |
+
return (np.clip(S, 0, 1) * -min_level_db) + min_level_db
|
| 52 |
+
|
| 53 |
+
def amp_to_db(x):
|
| 54 |
+
return 20 * np.log10(np.maximum(1e-5, x))
|
| 55 |
+
|
| 56 |
+
def db_to_amp(x):
|
| 57 |
+
return np.power(10.0, x * 0.05)
|
| 58 |
+
|
| 59 |
+
def spectrogram(y):
|
| 60 |
+
D = stft(y)
|
| 61 |
+
S = amp_to_db(np.abs(D)) - ref_level_db
|
| 62 |
+
return normalize(S)
|
| 63 |
+
|
| 64 |
+
def melspectrogram(y):
|
| 65 |
+
D = stft(y)
|
| 66 |
+
S = amp_to_db(linear_to_mel(np.abs(D)))
|
| 67 |
+
return normalize(S)
|
| 68 |
+
|
| 69 |
+
def stft(y):
|
| 70 |
+
return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
|
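The `split_signal`/`combine_signal` pair above implements the coarse/fine decomposition used by the notebook WaveRNN: a 16-bit sample, shifted into unsigned range, is split into its high and low 8-bit halves, and the operation round-trips exactly:

    import numpy as np

    x = np.array([-32768, -1, 0, 12345, 32767])             # 16-bit samples
    coarse, fine = split_signal(x)                          # high / low bytes
    print(np.array_equal(combine_signal(coarse, fine), x))  # True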
models/WaveRNNModel/preprocess.py
ADDED
|
@@ -0,0 +1,103 @@
|
| 1 |
+
import glob
|
| 2 |
+
from utils.display import *
|
| 3 |
+
from utils.dsp import *
|
| 4 |
+
from utils import hparams as hp
|
| 5 |
+
from multiprocessing import Pool, cpu_count
|
| 6 |
+
from utils.paths import Paths
|
| 7 |
+
import pickle
|
| 8 |
+
import argparse
|
| 9 |
+
from utils.text.recipes import ljspeech
|
| 10 |
+
from utils.files import get_files
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Helper functions for argument types
|
| 15 |
+
def valid_n_workers(num):
|
| 16 |
+
n = int(num)
|
| 17 |
+
if n < 1:
|
| 18 |
+
raise argparse.ArgumentTypeError('%r must be an integer greater than 0' % num)
|
| 19 |
+
return n
|
| 20 |
+
|
| 21 |
+
parser = argparse.ArgumentParser(description='Preprocessing for WaveRNN and Tacotron')
|
| 22 |
+
parser.add_argument('--path', '-p', help='directly point to dataset path (overrides hparams.wav_path)')
|
| 23 |
+
parser.add_argument('--extension', '-e', metavar='EXT', default='.wav', help='file extension to search for in dataset folder')
|
| 24 |
+
parser.add_argument('--num_workers', '-w', metavar='N', type=valid_n_workers, default=cpu_count()-1, help='The number of worker threads to use for preprocessing')
|
| 25 |
+
parser.add_argument('--hp_file', metavar='FILE', default='hparams.py', help='The file to use for the hyperparameters')
|
| 26 |
+
|
| 27 |
+
args = parser.parse_args()
|
| 28 |
+
hp.configure(args.hp_file) # Load hparams from file
|
| 29 |
+
if args.path is None:
|
| 30 |
+
args.path = hp.wav_path
|
| 31 |
+
|
| 32 |
+
extension = args.extension
|
| 33 |
+
path = args.path
|
| 34 |
+
|
| 35 |
+
wav_files = get_files(path, extension)
|
| 36 |
+
paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
|
| 37 |
+
|
| 38 |
+
print(f'\n{len(wav_files)} {extension[1:]} files found in "{path}"\n')
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def convert_file(path: Path):
|
| 43 |
+
y = load_wav(path)
|
| 44 |
+
peak = np.abs(y).max()
|
| 45 |
+
if hp.peak_norm or peak > 1.0:
|
| 46 |
+
y /= peak
|
| 47 |
+
mel = melspectrogram(y)
|
| 48 |
+
if hp.voc_mode == 'RAW':
|
| 49 |
+
quant = encode_mu_law(y, mu=2**hp.bits) if hp.mu_law else float_2_label(y, bits=hp.bits)
|
| 50 |
+
elif hp.voc_mode == 'MOL':
|
| 51 |
+
quant = float_2_label(y, bits=16)
|
| 52 |
+
|
| 53 |
+
return mel.astype(np.float32), quant.astype(np.int64)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def process_wav(path: Path):
|
| 57 |
+
wav_id = path.stem
|
| 58 |
+
m, x = convert_file(path)
|
| 59 |
+
#print("paths.mel:::",paths.mel)
|
| 60 |
+
np.save(paths.mel/f'{wav_id}.npy', m, allow_pickle=False)
|
| 61 |
+
np.save(paths.quant/f'{wav_id}.npy', x, allow_pickle=False)
|
| 62 |
+
return wav_id, m.shape[-1]
|
| 63 |
+
|
| 64 |
+
if __name__ == '__main__':
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
if len(wav_files) == 0:
|
| 68 |
+
|
| 69 |
+
print('Please point wav_path in hparams.py to your dataset,')
|
| 70 |
+
print('or use the --path option.\n')
|
| 71 |
+
|
| 72 |
+
else:
|
| 73 |
+
|
| 74 |
+
if not hp.ignore_tts:
|
| 75 |
+
|
| 76 |
+
text_dict = ljspeech(path)
|
| 77 |
+
|
| 78 |
+
with open(paths.data/'text_dict.pkl', 'wb') as f:
|
| 79 |
+
pickle.dump(text_dict, f)
|
| 80 |
+
|
| 81 |
+
n_workers = max(1, args.num_workers)
|
| 82 |
+
|
| 83 |
+
simple_table([
|
| 84 |
+
('Sample Rate', hp.sample_rate),
|
| 85 |
+
('Bit Depth', hp.bits),
|
| 86 |
+
('Mu Law', hp.mu_law),
|
| 87 |
+
('Hop Length', hp.hop_length),
|
| 88 |
+
('CPU Usage', f'{n_workers}/{cpu_count()}')
|
| 89 |
+
])
|
| 90 |
+
|
| 91 |
+
pool = Pool(processes=n_workers)
|
| 92 |
+
dataset = []
|
| 93 |
+
print("test22222")
|
| 94 |
+
for i, (item_id, length) in enumerate(pool.imap_unordered(process_wav, wav_files), 1):
|
| 95 |
+
dataset += [(item_id, length)]
|
| 96 |
+
bar = progbar(i, len(wav_files))
|
| 97 |
+
message = f'{bar} {i}/{len(wav_files)} '
|
| 98 |
+
stream(message)
|
| 99 |
+
|
| 100 |
+
with open(paths.data/'dataset.pkl', 'wb') as f:
|
| 101 |
+
pickle.dump(dataset, f)
|
| 102 |
+
|
| 103 |
+
print('\n\nCompleted. Ready to run "python train_tacotron.py" or "python train_wavernn.py". \n')
|
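`convert_file` above quantises the audio either with mu-law companding (`encode_mu_law`, with mu = 2**bits) or with plain linear labels (`float_2_label`); those helpers live in the repository's utils and are not shown in this diff. For orientation only, a common mu-law formulation is sketched below -- this is an assumption about the general technique, not the project's exact code:

    import numpy as np

    def mu_law_encode(x, mu):                 # hypothetical stand-in helper
        mu = mu - 1
        fx = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
        return np.floor((fx + 1) / 2 * mu + 0.5).astype(np.int64)

    def mu_law_decode(y, mu):                 # hypothetical stand-in helper
        mu = mu - 1
        fx = 2 * (y / mu) - 1
        return np.sign(fx) / mu * ((1 + mu) ** np.abs(fx) - 1)

    x = np.linspace(-1, 1, 5)
    labels = mu_law_encode(x, mu=2 ** 9)      # 9-bit labels in [0, 510]
    print(labels, np.round(mu_law_decode(labels, mu=2 ** 9), 3))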
models/WaveRNNModel/quick_start.py
ADDED
|
@@ -0,0 +1,122 @@
|
| 1 |
+
import torch
|
| 2 |
+
from models.fatchord_version import WaveRNN
|
| 3 |
+
from utils import hparams as hp
|
| 4 |
+
from utils.text.symbols import symbols
|
| 5 |
+
from models.tacotron import Tacotron
|
| 6 |
+
import argparse
|
| 7 |
+
from utils.text import text_to_sequence
|
| 8 |
+
from utils.display import save_attention, simple_table
|
| 9 |
+
import zipfile, os
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
os.makedirs('quick_start/tts_weights/', exist_ok=True)
|
| 13 |
+
os.makedirs('quick_start/voc_weights/', exist_ok=True)
|
| 14 |
+
|
| 15 |
+
zip_ref = zipfile.ZipFile('pretrained/ljspeech.wavernn.mol.800k.zip', 'r')
|
| 16 |
+
zip_ref.extractall('quick_start/voc_weights/')
|
| 17 |
+
zip_ref.close()
|
| 18 |
+
|
| 19 |
+
zip_ref = zipfile.ZipFile('pretrained/ljspeech.tacotron.r2.180k.zip', 'r')
|
| 20 |
+
zip_ref.extractall('quick_start/tts_weights/')
|
| 21 |
+
zip_ref.close()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
if __name__ == "__main__":
|
| 25 |
+
|
| 26 |
+
# Parse Arguments
|
| 27 |
+
parser = argparse.ArgumentParser(description='TTS Generator')
|
| 28 |
+
parser.add_argument('--input_text', '-i', type=str, help='[string] Type in something here and TTS will generate it!')
|
| 29 |
+
parser.add_argument('--batched', '-b', dest='batched', action='store_true', help='Fast Batched Generation (lower quality)')
|
| 30 |
+
parser.add_argument('--unbatched', '-u', dest='batched', action='store_false', help='Slower Unbatched Generation (better quality)')
|
| 31 |
+
parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment')
|
| 32 |
+
parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
|
| 33 |
+
help='The file to use for the hyperparameters')
|
| 34 |
+
# NB: argparse defaults must be registered before parse_args() to take effect
parser.set_defaults(batched=True)
parser.set_defaults(input_text=None)
args = parser.parse_args()
|
| 35 |
+
|
| 36 |
+
hp.configure(args.hp_file) # Load hparams from file
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
batched = args.batched
|
| 42 |
+
input_text = args.input_text
|
| 43 |
+
|
| 44 |
+
if not args.force_cpu and torch.cuda.is_available():
|
| 45 |
+
device = torch.device('cuda')
|
| 46 |
+
else:
|
| 47 |
+
device = torch.device('cpu')
|
| 48 |
+
print('Using device:', device)
|
| 49 |
+
|
| 50 |
+
print('\nInitialising WaveRNN Model...\n')
|
| 51 |
+
|
| 52 |
+
# Instantiate WaveRNN Model
|
| 53 |
+
voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
|
| 54 |
+
fc_dims=hp.voc_fc_dims,
|
| 55 |
+
bits=hp.bits,
|
| 56 |
+
pad=hp.voc_pad,
|
| 57 |
+
upsample_factors=hp.voc_upsample_factors,
|
| 58 |
+
feat_dims=hp.num_mels,
|
| 59 |
+
compute_dims=hp.voc_compute_dims,
|
| 60 |
+
res_out_dims=hp.voc_res_out_dims,
|
| 61 |
+
res_blocks=hp.voc_res_blocks,
|
| 62 |
+
hop_length=hp.hop_length,
|
| 63 |
+
sample_rate=hp.sample_rate,
|
| 64 |
+
mode='MOL').to(device)
|
| 65 |
+
|
| 66 |
+
voc_model.load('quick_start/voc_weights/latest_weights.pyt')
|
| 67 |
+
|
| 68 |
+
print('\nInitialising Tacotron Model...\n')
|
| 69 |
+
|
| 70 |
+
# Instantiate Tacotron Model
|
| 71 |
+
tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
|
| 72 |
+
num_chars=len(symbols),
|
| 73 |
+
encoder_dims=hp.tts_encoder_dims,
|
| 74 |
+
decoder_dims=hp.tts_decoder_dims,
|
| 75 |
+
n_mels=hp.num_mels,
|
| 76 |
+
fft_bins=hp.num_mels,
|
| 77 |
+
postnet_dims=hp.tts_postnet_dims,
|
| 78 |
+
encoder_K=hp.tts_encoder_K,
|
| 79 |
+
lstm_dims=hp.tts_lstm_dims,
|
| 80 |
+
postnet_K=hp.tts_postnet_K,
|
| 81 |
+
num_highways=hp.tts_num_highways,
|
| 82 |
+
dropout=hp.tts_dropout,
|
| 83 |
+
stop_threshold=hp.tts_stop_threshold).to(device)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
tts_model.load('quick_start/tts_weights/latest_weights.pyt')
|
| 87 |
+
|
| 88 |
+
if input_text:
|
| 89 |
+
inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
|
| 90 |
+
else:
|
| 91 |
+
with open('sentences.txt') as f:
|
| 92 |
+
inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]
|
| 93 |
+
|
| 94 |
+
voc_k = voc_model.get_step() // 1000
|
| 95 |
+
tts_k = tts_model.get_step() // 1000
|
| 96 |
+
|
| 97 |
+
r = tts_model.r
|
| 98 |
+
|
| 99 |
+
simple_table([('WaveRNN', str(voc_k) + 'k'),
|
| 100 |
+
(f'Tacotron(r={r})', str(tts_k) + 'k'),
|
| 101 |
+
('Generation Mode', 'Batched' if batched else 'Unbatched'),
|
| 102 |
+
('Target Samples', 11_000 if batched else 'N/A'),
|
| 103 |
+
('Overlap Samples', 550 if batched else 'N/A')])
|
| 104 |
+
|
| 105 |
+
for i, x in enumerate(inputs, 1):
|
| 106 |
+
|
| 107 |
+
print(f'\n| Generating {i}/{len(inputs)}')
|
| 108 |
+
_, m, attention = tts_model.generate(x)
|
| 109 |
+
|
| 110 |
+
if input_text:
|
| 111 |
+
save_path = f'quick_start/__input_{input_text[:10]}_{tts_k}k.wav'
|
| 112 |
+
else:
|
| 113 |
+
save_path = f'quick_start/{i}_batched{str(batched)}_{tts_k}k.wav'
|
| 114 |
+
|
| 115 |
+
# save_attention(attention, save_path)
|
| 116 |
+
|
| 117 |
+
m = torch.tensor(m).unsqueeze(0)
|
| 118 |
+
m = (m + 4) / 8
|
| 119 |
+
|
| 120 |
+
voc_model.generate(m, save_path, batched, 11_000, 550, hp.mu_law)
|
| 121 |
+
|
| 122 |
+
print('\n\nDone.\n')
|
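A note on the m = (m + 4) / 8 step above: the pipeline stores mels in [0, 1], Tacotron trains on them rescaled to [-4, 4] (see collate_tts in utils/dataset.py), so the vocoder input must be mapped back before voc_model.generate. A minimal sketch of that convention, assuming only that on-disk mels are normalised to [0, 1]:

import numpy as np

def mel_to_tacotron_range(m: np.ndarray) -> np.ndarray:
    # [0, 1] -> [-4, 4], as done in collate_tts
    return (m * 8.0) - 4.0

def mel_to_vocoder_range(m: np.ndarray) -> np.ndarray:
    # [-4, 4] -> [0, 1], as done in quick_start.py and create_gta_features
    return (m + 4.0) / 8.0

m = np.random.rand(80, 100)  # stand-in mel, (n_mels, frames)
assert np.allclose(mel_to_vocoder_range(mel_to_tacotron_range(m)), m)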
models/WaveRNNModel/quick_start/tts_weights/latest_weights.pyt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c4c491d6ad43f1b9b9e1f393c7d8437592da4b26412838a7bc20f446b76d2f0
size 44433225
models/WaveRNNModel/quick_start/voc_weights/latest_weights.pyt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99e2a93453a8d531ba851b2e5c488105816f7c60ec50153c56436ff9bda8a26a
size 16985706
models/WaveRNNModel/requirements.txt
ADDED
@@ -0,0 +1,6 @@
numpy==1.22.0
librosa==0.6.3
matplotlib
unidecode
inflect
nltk
models/WaveRNNModel/sentences.txt
ADDED
@@ -0,0 +1,6 @@
However, no attempt has been made yet to formulate a relativistic generalisation of action-angle variables for geodesic motion in Kerr spacetime and to calculate the dynamical frequencies of arbitrary bound non-plunging orbits.

The investigation of bound geodesic orbits in Kerr spacetime presented in this article clearly illustrates that the properties of these orbits in the regime of strong gravity are profoundly different from Keplerian orbits in the Newtonian regime.

The observation of as few as ten EMRIs can provide a measurement of the slope of the black-hole mass function to better precision than is currently known.
models/WaveRNNModel/source.txt
ADDED
@@ -0,0 +1 @@
https://github.com/Adwardgyhjs/RNNoiseAndWaveRNN
models/WaveRNNModel/train_tacotron.py
ADDED
@@ -0,0 +1,203 @@
import torch
from torch import optim
import torch.nn.functional as F
from utils import hparams as hp
from utils.display import *
from utils.dataset import get_tts_datasets
from utils.text.symbols import symbols
from utils.paths import Paths
from models.tacotron import Tacotron
import argparse
from utils import data_parallel_workaround
import os
from pathlib import Path
import time
import numpy as np
import sys
from utils.checkpoints import save_checkpoint, restore_checkpoint


def np_now(x: torch.Tensor): return x.detach().cpu().numpy()


def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train', '-f', action='store_true', help='Forces the model to train past total steps')
    parser.add_argument('--force_gta', '-g', action='store_true', help='Force the model to create GTA features')
    parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even in a CUDA-capable environment')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py', help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_train = args.force_train
    force_gta = args.force_gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        for session in hp.tts_schedule:
            _, _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError('`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Tacotron Model
    print('\nInitialising Tacotron Model...\n')
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout,
                     stop_threshold=hp.tts_stop_threshold).to(device)

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True)

    if not force_gta:
        for i, session in enumerate(hp.tts_schedule):
            current_step = model.get_step()

            r, lr, max_step, batch_size = session

            training_steps = max_step - current_step

            # Do we need to change to the next session?
            if current_step >= max_step:
                # Are there no further sessions than the current one?
                if i == len(hp.tts_schedule) - 1:
                    # There are no more sessions. Check if we force training.
                    if force_train:
                        # Don't finish the loop - train forever
                        training_steps = 999_999_999
                    else:
                        # We have completed training. Breaking is same as continue
                        break
                else:
                    # There is a following session, go to it
                    continue

            model.r = r

            simple_table([(f'Steps with r={r}', str(training_steps//1000) + 'k Steps'),
                          ('Batch Size', batch_size),
                          ('Learning Rate', lr),
                          ('Outputs/Step (r)', model.r)])

            train_set, attn_example = get_tts_datasets(paths.data, batch_size, r)
            tts_train_loop(paths, model, optimizer, train_set, lr, training_steps, attn_example)

    print('Training Complete.')
    print('To continue training increase tts_total_steps in hparams.py or use --force_train\n')


    print('Creating Ground Truth Aligned Dataset...\n')

    train_set, attn_example = get_tts_datasets(paths.data, 8, model.r)
    create_gta_features(model, train_set, paths.gta)

    print('\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n')


def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example):
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups: g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs+1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(model, x, m)
            else:
                m1_hat, m2_hat, attention = model(x, m)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            loss = m1_loss + m2_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                if torch.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()
            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]), paths.tts_attention/f'{step}')
                save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot/f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')


def create_gta_features(model: Tacotron, train_set, save_path: Path):
    device = next(model.parameters()).device  # use same device as model parameters

    iters = len(train_set)

    for i, (x, mels, ids, mel_lens) in enumerate(train_set, 1):

        x, mels = x.to(device), mels.to(device)

        with torch.no_grad(): _, gta, _ = model(x, mels)

        gta = gta.cpu().numpy()

        for j, item_id in enumerate(ids):
            mel = gta[j][:, :mel_lens[j]]
            mel = (mel + 4) / 8
            np.save(save_path/f'{item_id}.npy', mel, allow_pickle=False)

        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)


if __name__ == "__main__":
    main()
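hp.tts_schedule drives the session loop in main(): each entry is an (r, lr, max_step, batch_size) tuple, and training resumes inside whichever entry the current step falls into. A standalone sketch of that selection logic, with hypothetical schedule values (the real ones live in hparams.py):

tts_schedule = [(7, 1e-3, 10_000, 32),    # hypothetical values
                (5, 3e-4, 100_000, 32),
                (2, 1e-4, 180_000, 16)]

def active_session(schedule, current_step):
    for r, lr, max_step, batch_size in schedule:
        if current_step < max_step:
            return r, lr, max_step - current_step, batch_size
    return None  # finished; main() then breaks (or trains on with --force_train)

print(active_session(tts_schedule, 50_000))  # -> (5, 0.0003, 50000, 32)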
models/WaveRNNModel/train_wavernn.py
ADDED
@@ -0,0 +1,164 @@
import time
import numpy as np
import torch
from torch import optim
import torch.nn.functional as F
from utils.display import stream, simple_table
from utils.dataset import get_vocoder_datasets
from utils.distribution import discretized_mix_logistic_loss
from utils import hparams as hp
from models.fatchord_version import WaveRNN
from gen_wavernn import gen_testset
from utils.paths import Paths
import argparse
from utils import data_parallel_workaround
from utils.checkpoints import save_checkpoint, restore_checkpoint


def main():

    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train WaveRNN Vocoder')
    parser.add_argument('--lr', '-l', type=float, help='[float] override hparams.py learning rate')
    parser.add_argument('--batch_size', '-b', type=int, help='[int] override hparams.py batch size')
    parser.add_argument('--force_train', '-f', action='store_true', help='Forces the model to train past total steps')
    parser.add_argument('--gta', '-g', action='store_true', help='train wavernn on GTA features')
    parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even in a CUDA-capable environment')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py', help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # load hparams from file
    if args.lr is None:
        args.lr = hp.voc_lr
    if args.batch_size is None:
        args.batch_size = hp.voc_batch_size

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    batch_size = args.batch_size
    force_train = args.force_train
    train_gta = args.gta
    lr = args.lr

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        if batch_size % torch.cuda.device_count() != 0:
            raise ValueError('`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Model...\n')

    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode=hp.voc_mode).to(device)

    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    optimizer = optim.Adam(voc_model.parameters())
    restore_checkpoint('voc', paths, voc_model, optimizer, create_if_missing=True)

    train_set, test_set = get_vocoder_datasets(paths.data, batch_size, train_gta)

    total_steps = 10_000_000 if force_train else hp.voc_total_steps

    simple_table([('Remaining', str((total_steps - voc_model.get_step())//1000) + 'k Steps'),
                  ('Batch Size', batch_size),
                  ('LR', lr),
                  ('Sequence Len', hp.voc_seq_len),
                  ('GTA Train', train_gta)])

    loss_func = F.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss

    voc_train_loop(paths, voc_model, loss_func, optimizer, train_set, test_set, lr, total_steps)

    print('Training Complete.')
    print('To continue training increase voc_total_steps in hparams.py or use --force_train')


def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer, train_set, test_set, lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    for g in optimizer.param_groups: g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.voc_clip_grad_norm)
                if torch.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint, hp.voc_gen_batched,
                            hp.voc_target, hp.voc_overlap, paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')


if __name__ == "__main__":
    main()
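The np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length assertion above ties the conditioning network to the audio: the upsampling layers must stretch one mel frame into exactly one hop of samples. A quick check with assumed hparams values (treat the numbers as an assumption, not the repository's actual configuration):

import numpy as np

voc_upsample_factors = (5, 5, 11)  # assumed hparams values
hop_length = 275                   # 5 * 5 * 11

assert np.cumprod(voc_upsample_factors)[-1] == hop_length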
models/WaveRNNModel/utils/__init__.py
ADDED
@@ -0,0 +1,106 @@
# Make it explicit that we do it the Python 3 way
from __future__ import absolute_import, division, print_function, unicode_literals
from builtins import *
import sys
import torch
import re

from importlib.util import spec_from_file_location, module_from_spec
from pathlib import Path
from typing import Union

# Credit: Ryuichi Yamamoto (https://github.com/r9y9/wavenet_vocoder/blob/1717f145c8f8c0f3f85ccdf346b5209fa2e1c920/train.py#L599)
# Modified by: Ryan Butler (https://github.com/TheButlah)
# workaround for https://github.com/pytorch/pytorch/issues/15716
# the idea is to return outputs and replicas explicitly, so that PyTorch
# does not release the nodes (this is a pytorch bug though)

_output_ref = None
_replicas_ref = None

def data_parallel_workaround(model, *input):
    global _output_ref
    global _replicas_ref
    device_ids = list(range(torch.cuda.device_count()))
    output_device = device_ids[0]
    replicas = torch.nn.parallel.replicate(model, device_ids)
    # input.shape = (num_args, batch, ...)
    inputs = torch.nn.parallel.scatter(input, device_ids)
    # inputs.shape = (num_gpus, num_args, batch/num_gpus, ...)
    replicas = replicas[:len(inputs)]
    outputs = torch.nn.parallel.parallel_apply(replicas, inputs)
    y_hat = torch.nn.parallel.gather(outputs, output_device)
    _output_ref = outputs
    _replicas_ref = replicas
    return y_hat


###### Deal with hparams import that has to be configured at runtime ######
class __HParams:
    """Manages the hyperparams pseudo-module"""
    def __init__(self, path: Union[str, Path]=None):
        """Constructs the hyperparameters from a path to a python module. If
        `path` is None, will raise an AttributeError whenever its attributes
        are accessed. Otherwise, configures self based on `path`."""
        if path is None:
            self._configured = False
        else:
            self.configure(path)

    def __getattr__(self, item):
        # Only invoked when normal attribute lookup has already failed
        if not self.is_configured():
            raise AttributeError("HParams not configured yet. Call self.configure()")
        else:
            raise AttributeError(item)

    def configure(self, path: Union[str, Path]):
        """Configures hparams by copying over attributes from a module with the
        given path. Raises an exception if already configured."""
        if self.is_configured():
            raise RuntimeError("Cannot reconfigure hparams!")

        ###### Check for proper path ######
        if not isinstance(path, Path):
            path = Path(path).expanduser()
        if not path.exists():
            raise FileNotFoundError(f"Could not find hparams file {path}")
        elif path.suffix != ".py":
            raise ValueError("`path` must be a python file")

        ###### Load in attributes from module ######
        m = _import_from_file("hparams", path)

        reg = re.compile(r"^__.+__$")  # Matches magic methods
        for name, value in m.__dict__.items():
            if reg.match(name):
                # Skip builtins
                continue
            if name in self.__dict__:
                # Cannot overwrite already existing attributes
                raise AttributeError(
                    f"module at `path` cannot contain attribute {name} as it "
                    "overwrites an attribute of the same name in utils.hparams")
            # Fair game to copy over the attribute
            self.__setattr__(name, value)

        self._configured = True

    def is_configured(self):
        return self._configured

hparams = __HParams()


def _import_from_file(name, path: Path):
    """Programmatically returns a module object from a filepath"""
    if not Path(path).exists():
        raise FileNotFoundError('"%s" doesn\'t exist!' % path)
    spec = spec_from_file_location(name, path)
    if spec is None:
        raise ValueError('could not load module from "%s"' % path)
    m = module_from_spec(spec)
    spec.loader.exec_module(m)
    return m
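Usage of the hparams pseudo-module above is two-step: configure() copies every top-level name out of a hyperparameter file exactly once, after which those names are plain attributes. A minimal sketch, assuming an hparams.py on the path that defines sample_rate:

from utils import hparams as hp

hp.configure('hparams.py')   # one-time; a second call raises RuntimeError
print(hp.sample_rate)        # any top-level name from hparams.py is now an attribute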
models/WaveRNNModel/utils/checkpoints.py
ADDED
@@ -0,0 +1,128 @@
import torch
from utils.paths import Paths
from models.tacotron import Tacotron


def get_checkpoint_paths(checkpoint_type: str, paths: Paths):
    """
    Returns the correct checkpointing paths
    depending on whether model is Vocoder or TTS

    Args:
        checkpoint_type: Either 'voc' or 'tts'
        paths: Paths object
    """
    # Compare with `==`, not `is`: identity of string literals is an
    # implementation detail and not guaranteed
    if checkpoint_type == 'tts':
        weights_path = paths.tts_latest_weights
        optim_path = paths.tts_latest_optim
        checkpoint_path = paths.tts_checkpoints
    elif checkpoint_type == 'voc':
        weights_path = paths.voc_latest_weights
        optim_path = paths.voc_latest_optim
        checkpoint_path = paths.voc_checkpoints
    else:
        raise NotImplementedError

    return weights_path, optim_path, checkpoint_path


def save_checkpoint(checkpoint_type: str, paths: Paths, model, optimizer, *,
                    name=None, is_silent=False):
    """Saves the training session to disk.

    Args:
        paths: Provides information about the different paths to use.
        model: A `Tacotron` or `WaveRNN` model to save the parameters and buffers from.
        optimizer: An optimizer to save the state of (momentum, etc).
        name: If provided, will save a checkpoint with the given name. Note
            that regardless of whether this is provided or not, this function
            will always update the files specified in `paths` that give the
            location of the latest weights and optimizer state. Saving
            a named checkpoint happens in addition to this update.
    """
    def helper(path_dict, is_named):
        s = 'named' if is_named else 'latest'
        num_exist = sum(p.exists() for p in path_dict.values())

        if num_exist not in (0, 2):
            # Checkpoint broken
            raise FileNotFoundError(
                f'We expected either both or no files in the {s} checkpoint to '
                'exist, but instead we got exactly one!')

        if num_exist == 0:
            if not is_silent: print(f'Creating {s} checkpoint...')
            for p in path_dict.values():
                p.parent.mkdir(parents=True, exist_ok=True)
        else:
            if not is_silent: print(f'Saving to existing {s} checkpoint...')

        if not is_silent: print(f'Saving {s} weights: {path_dict["w"]}')
        model.save(path_dict['w'])
        if not is_silent: print(f'Saving {s} optimizer state: {path_dict["o"]}')
        torch.save(optimizer.state_dict(), path_dict['o'])

    weights_path, optim_path, checkpoint_path = \
        get_checkpoint_paths(checkpoint_type, paths)

    latest_paths = {'w': weights_path, 'o': optim_path}
    helper(latest_paths, False)

    if name:
        named_paths = {
            'w': checkpoint_path/f'{name}_weights.pyt',
            'o': checkpoint_path/f'{name}_optim.pyt',
        }
        helper(named_paths, True)


def restore_checkpoint(checkpoint_type: str, paths: Paths, model, optimizer, *,
                       name=None, create_if_missing=False):
    """Restores from a training session saved to disk.

    NOTE: The optimizer's state is placed on the same device as its model
    parameters. Therefore, be sure you have done `model.to(device)` before
    calling this method.

    Args:
        paths: Provides information about the different paths to use.
        model: A `Tacotron` or `WaveRNN` model to load the parameters and buffers into.
        optimizer: An optimizer to load the state of (momentum, etc).
        name: If provided, will restore from a checkpoint with the given name.
            Otherwise, will restore from the latest weights and optimizer state
            as specified in `paths`.
        create_if_missing: If `True`, will create the checkpoint if it doesn't
            yet exist, as well as update the files specified in `paths` that
            give the location of the current latest weights and optimizer state.
            If `False` and the checkpoint doesn't exist, will raise a
            `FileNotFoundError`.
    """

    weights_path, optim_path, checkpoint_path = \
        get_checkpoint_paths(checkpoint_type, paths)

    if name:
        path_dict = {
            'w': checkpoint_path/f'{name}_weights.pyt',
            'o': checkpoint_path/f'{name}_optim.pyt',
        }
        s = 'named'
    else:
        path_dict = {
            'w': weights_path,
            'o': optim_path
        }
        s = 'latest'

    num_exist = sum(p.exists() for p in path_dict.values())
    if num_exist == 2:
        # Checkpoint exists
        print(f'Restoring from {s} checkpoint...')
        print(f'Loading {s} weights: {path_dict["w"]}')
        model.load(path_dict['w'])
        print(f'Loading {s} optimizer state: {path_dict["o"]}')
        optimizer.load_state_dict(torch.load(path_dict['o']))
    elif create_if_missing:
        save_checkpoint(checkpoint_type, paths, model, optimizer, name=name, is_silent=False)
    else:
        raise FileNotFoundError(f'The {s} checkpoint could not be found!')
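The helper above treats a checkpoint as an atomic pair, a weights file plus an optimizer file, and exactly one of the two existing means corruption. A standalone sketch of that invariant check (hypothetical paths):

from pathlib import Path

path_dict = {'w': Path('checkpoints/latest_weights.pyt'),   # hypothetical paths
             'o': Path('checkpoints/latest_optim.pyt')}

num_exist = sum(p.exists() for p in path_dict.values())
if num_exist not in (0, 2):
    raise FileNotFoundError('Expected both or neither checkpoint file to exist')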
models/WaveRNNModel/utils/dataset.py
ADDED
@@ -0,0 +1,232 @@
import pickle
import random
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
from utils.dsp import *
from utils import hparams as hp
from utils.text import text_to_sequence
from utils.paths import Paths
from pathlib import Path

from functools import partial

###################################################################################
# WaveRNN/Vocoder Dataset #########################################################
###################################################################################


class VocoderDataset(Dataset):
    def __init__(self, path: Path, dataset_ids, train_gta=False):
        self.metadata = dataset_ids
        self.mel_path = path/'gta' if train_gta else path/'mel'
        self.quant_path = path/'quant'

    def __getitem__(self, index):
        item_id = self.metadata[index]
        m = np.load(self.mel_path/f'{item_id}.npy')
        x = np.load(self.quant_path/f'{item_id}.npy')
        return m, x

    def __len__(self):
        return len(self.metadata)


def get_vocoder_datasets(path: Path, batch_size, train_gta):

    with open(path/'dataset.pkl', 'rb') as f:
        dataset = pickle.load(f)

    dataset_ids = [x[0] for x in dataset]

    random.seed(1234)
    random.shuffle(dataset_ids)

    test_ids = dataset_ids[-hp.voc_test_samples:]
    train_ids = dataset_ids[:-hp.voc_test_samples]

    train_dataset = VocoderDataset(path, train_ids, train_gta)
    test_dataset = VocoderDataset(path, test_ids, train_gta)

    train_set = DataLoader(train_dataset,
                           collate_fn=collate_vocoder,
                           batch_size=batch_size,
                           num_workers=2,
                           shuffle=True,
                           pin_memory=True)

    test_set = DataLoader(test_dataset,
                          batch_size=1,
                          num_workers=1,
                          shuffle=False,
                          pin_memory=True)

    return train_set, test_set


def collate_vocoder(batch):
    if not hp.is_configured():
        # DataLoader worker processes may start with unconfigured hparams;
        # reconfigure from this machine-specific hard-coded path
        print("hparams not configured")
        hp.configure("E:\\智能语音处理系统\\Noise-suppression-and-speech-recognition-systems-master\\WaveRNNModel\\hparams.py")
    mel_win = hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad
    max_offsets = [x[0].shape[-1] - 2 - (mel_win + 2 * hp.voc_pad) for x in batch]
    mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
    sig_offsets = [(offset + hp.voc_pad) * hp.hop_length for offset in mel_offsets]

    mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] for i, x in enumerate(batch)]

    labels = [x[1][sig_offsets[i]:sig_offsets[i] + hp.voc_seq_len + 1] for i, x in enumerate(batch)]

    mels = np.stack(mels).astype(np.float32)
    labels = np.stack(labels).astype(np.int64)

    mels = torch.tensor(mels)
    labels = torch.tensor(labels).long()

    x = labels[:, :hp.voc_seq_len]
    y = labels[:, 1:]

    bits = 16 if hp.voc_mode == 'MOL' else hp.bits

    x = label_2_float(x.float(), bits)

    if hp.voc_mode == 'MOL':
        y = label_2_float(y.float(), bits)

    return x, y, mels


###################################################################################
# Tacotron/TTS Dataset ############################################################
###################################################################################


def get_tts_datasets(path: Path, batch_size, r):
    with open(path/'dataset.pkl', 'rb') as f:
        dataset = pickle.load(f)

    dataset_ids = []
    mel_lengths = []
    for (item_id, length) in dataset:
        if length <= hp.tts_max_mel_len:
            dataset_ids += [item_id]
            mel_lengths += [length]

    with open(path/'text_dict.pkl', 'rb') as f:
        text_dict = pickle.load(f)

    train_dataset = TTSDataset(path, dataset_ids, text_dict)

    sampler = None

    if hp.tts_bin_lengths:
        sampler = BinnedLengthSampler(mel_lengths, batch_size, batch_size * 3)

    train_set = DataLoader(train_dataset,
                           collate_fn=partial(collate_tts, r=r),
                           batch_size=batch_size,
                           sampler=sampler,
                           num_workers=1,
                           pin_memory=True)

    longest = mel_lengths.index(max(mel_lengths))

    # Used to evaluate attention during training process
    attn_example = dataset_ids[longest]

    return train_set, attn_example


class TTSDataset(Dataset):
    def __init__(self, path: Path, dataset_ids, text_dict):
        self.path = path
        self.metadata = dataset_ids
        self.text_dict = text_dict

    def __getitem__(self, index):
        item_id = self.metadata[index]
        if not hp.is_configured():
            # Same worker-process workaround as in collate_vocoder
            print("hparams not configured")
            hp.configure("E:\\智能语音处理系统\\Noise-suppression-and-speech-recognition-systems-master\\WaveRNNModel\\hparams.py")
        x = text_to_sequence(self.text_dict[item_id], hp.tts_cleaner_names)
        mel = np.load(self.path/'mel'/f'{item_id}.npy')
        mel_len = mel.shape[-1]
        return x, mel, item_id, mel_len

    def __len__(self):
        return len(self.metadata)


def pad1d(x, max_len):
    return np.pad(x, (0, max_len - len(x)), mode='constant')


def pad2d(x, max_len):
    return np.pad(x, ((0, 0), (0, max_len - x.shape[-1])), mode='constant')


def collate_tts(batch, r):

    x_lens = [len(x[0]) for x in batch]
    max_x_len = max(x_lens)

    chars = [pad1d(x[0], max_x_len) for x in batch]
    chars = np.stack(chars)

    spec_lens = [x[1].shape[-1] for x in batch]
    max_spec_len = max(spec_lens) + 1
    if max_spec_len % r != 0:
        max_spec_len += r - max_spec_len % r

    mel = [pad2d(x[1], max_spec_len) for x in batch]
    mel = np.stack(mel)

    ids = [x[2] for x in batch]
    mel_lens = [x[3] for x in batch]

    chars = torch.tensor(chars).long()
    mel = torch.tensor(mel)

    # scale spectrograms to -4 <--> 4
    mel = (mel * 8.) - 4.
    return chars, mel, ids, mel_lens


class BinnedLengthSampler(Sampler):
    def __init__(self, lengths, batch_size, bin_size):
        _, self.idx = torch.sort(torch.tensor(lengths).long())
        self.batch_size = batch_size
        self.bin_size = bin_size
        assert self.bin_size % self.batch_size == 0

    def __iter__(self):
        # Need to change to numpy since there's a bug in random.shuffle(tensor)
        # TODO: Post an issue on pytorch repo
        idx = self.idx.numpy()
        bins = []

        for i in range(len(idx) // self.bin_size):
            this_bin = idx[i * self.bin_size:(i + 1) * self.bin_size]
            random.shuffle(this_bin)
            bins += [this_bin]

        random.shuffle(bins)
        binned_idx = np.stack(bins).reshape(-1)

        if len(binned_idx) < len(idx):
            last_bin = idx[len(binned_idx):]
            random.shuffle(last_bin)
            binned_idx = np.concatenate([binned_idx, last_bin])

        return iter(torch.tensor(binned_idx).long())

    def __len__(self):
        return len(self.idx)
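The offset arithmetic in collate_vocoder keeps mel frames and waveform samples aligned: a random frame offset is drawn, padded by voc_pad context frames, and multiplied by hop_length to land on the matching sample. A self-contained sketch with assumed hparams values:

hop_length, voc_pad = 275, 2            # assumed hparams values
voc_seq_len = hop_length * 5            # samples per training snippet

mel_win = voc_seq_len // hop_length + 2 * voc_pad   # frames fetched per item
mel_offset = 3                                      # np.random.randint(...) in the real code
sig_offset = (mel_offset + voc_pad) * hop_length    # matching sample offset

# the pad frames are context only; the core frames cover exactly voc_seq_len samples
assert (mel_win - 2 * voc_pad) * hop_length == voc_seq_len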
models/WaveRNNModel/utils/display.py
ADDED
@@ -0,0 +1,121 @@
import matplotlib as mpl
mpl.use('agg')  # Use non-interactive backend by default
import matplotlib.pyplot as plt
import time
import numpy as np
import sys


def progbar(i, n, size=16):
    done = (i * size) // n
    bar = ''
    for cell in range(size):
        bar += '█' if cell <= done else '░'
    return bar


def stream(message):
    sys.stdout.write(f"\r{message}")


def simple_table(item_tuples):

    border_pattern = '+---------------------------------------'
    whitespace = ' ' * 40  # pool of spaces used for cell padding

    headings, cells = [], []

    for item in item_tuples:

        heading, cell = str(item[0]), str(item[1])

        pad_head = len(heading) < len(cell)

        pad = abs(len(heading) - len(cell))
        pad = whitespace[:pad]

        pad_left = pad[:len(pad)//2]
        pad_right = pad[len(pad)//2:]

        if pad_head:
            heading = pad_left + heading + pad_right
        else:
            cell = pad_left + cell + pad_right

        headings += [heading]
        cells += [cell]

    border, head, body = '', '', ''

    for i in range(len(item_tuples)):

        temp_head = f'| {headings[i]} '
        temp_body = f'| {cells[i]} '

        border += border_pattern[:len(temp_head)]
        head += temp_head
        body += temp_body

        if i == len(item_tuples) - 1:
            head += '|'
            body += '|'
            border += '+'

    print(border)
    print(head)
    print(border)
    print(body)
    print(border)
    print(' ')


def time_since(started):
    elapsed = time.time() - started
    m = int(elapsed // 60)
    s = int(elapsed % 60)
    if m >= 60:
        h = int(m // 60)
        m = m % 60
        return f'{h}h {m}m {s}s'
    else:
        return f'{m}m {s}s'


def save_attention(attn, path):
    fig = plt.figure(figsize=(12, 6))
    plt.imshow(attn.T, interpolation='nearest', aspect='auto')
    fig.savefig(path.parent/f'{path.stem}.png', bbox_inches='tight')
    plt.close(fig)


def save_spectrogram(M, path, length=None):
    M = np.flip(M, axis=0)
    if length: M = M[:, :length]
    fig = plt.figure(figsize=(12, 6))
    plt.imshow(M, interpolation='nearest', aspect='auto')
    fig.savefig(f'{path}.png', bbox_inches='tight')
    plt.close(fig)


def plot(array):
    mpl.interactive(True)
    fig = plt.figure(figsize=(30, 5))
    ax = fig.add_subplot(111)
    ax.xaxis.label.set_color('grey')
    ax.yaxis.label.set_color('grey')
    ax.xaxis.label.set_fontsize(23)
    ax.yaxis.label.set_fontsize(23)
    ax.tick_params(axis='x', colors='grey', labelsize=23)
    ax.tick_params(axis='y', colors='grey', labelsize=23)
    plt.plot(array)
    mpl.interactive(False)


def plot_spec(M):
    mpl.interactive(True)
    M = np.flip(M, axis=0)
    plt.figure(figsize=(18, 4))
    plt.imshow(M, interpolation='nearest', aspect='auto')
    plt.show()
    mpl.interactive(False)
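progbar and stream combine into an in-place console progress bar (the \r in stream rewinds the cursor each write). A quick usage sketch, assuming the module above is importable as utils.display:

from utils.display import progbar, stream

for i in range(1, 101):
    stream(f'{progbar(i, 100)} {i}/100 ')
print()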
models/WaveRNNModel/utils/distribution.py
ADDED
@@ -0,0 +1,132 @@
import numpy as np
import torch
import torch.nn.functional as F


def log_sum_exp(x):
    """ numerically stable log_sum_exp implementation that prevents overflow """
    # TF ordering
    axis = len(x.size()) - 1
    m, _ = torch.max(x, dim=axis)
    m2, _ = torch.max(x, dim=axis, keepdim=True)
    return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis))


# Adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
def discretized_mix_logistic_loss(y_hat, y, num_classes=65536,
                                  log_scale_min=None, reduce=True):
    if log_scale_min is None:
        log_scale_min = float(np.log(1e-14))
    y_hat = y_hat.permute(0, 2, 1)
    assert y_hat.dim() == 3
    assert y_hat.size(1) % 3 == 0
    nr_mix = y_hat.size(1) // 3

    # (B x T x C)
    y_hat = y_hat.transpose(1, 2)

    # unpack parameters. (B, T, num_mixtures) x 3
    logit_probs = y_hat[:, :, :nr_mix]
    means = y_hat[:, :, nr_mix:2 * nr_mix]
    log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min)

    # B x T x 1 -> B x T x num_mixtures
    y = y.expand_as(means)

    centered_y = y - means
    inv_stdv = torch.exp(-log_scales)
    plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1))
    cdf_plus = torch.sigmoid(plus_in)
    min_in = inv_stdv * (centered_y - 1. / (num_classes - 1))
    cdf_min = torch.sigmoid(min_in)

    # log probability for edge case of 0 (before scaling)
    # equivalent: torch.log(F.sigmoid(plus_in))
    log_cdf_plus = plus_in - F.softplus(plus_in)

    # log probability for edge case of 255 (before scaling)
    # equivalent: (1 - F.sigmoid(min_in)).log()
    log_one_minus_cdf_min = -F.softplus(min_in)

    # probability for all other cases
    cdf_delta = cdf_plus - cdf_min

    mid_in = inv_stdv * centered_y
    # log probability in the center of the bin, to be used in extreme cases
    # (not actually used in our code)
    log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in)

    # tf equivalent
    """
    log_probs = tf.where(x < -0.999, log_cdf_plus,
                         tf.where(x > 0.999, log_one_minus_cdf_min,
                                  tf.where(cdf_delta > 1e-5,
                                           tf.log(tf.maximum(cdf_delta, 1e-12)),
                                           log_pdf_mid - np.log(127.5))))
    """
    # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
    # for num_classes=65536 case? 1e-7? not sure..
    inner_inner_cond = (cdf_delta > 1e-5).float()

    inner_inner_out = inner_inner_cond * \
        torch.log(torch.clamp(cdf_delta, min=1e-12)) + \
        (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
    inner_cond = (y > 0.999).float()
    inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out
    cond = (y < -0.999).float()
    log_probs = cond * log_cdf_plus + (1. - cond) * inner_out

    log_probs = log_probs + F.log_softmax(logit_probs, -1)

    if reduce:
        return -torch.mean(log_sum_exp(log_probs))
    else:
        return -log_sum_exp(log_probs).unsqueeze(-1)


def sample_from_discretized_mix_logistic(y, log_scale_min=None):
    """
    Sample from discretized mixture of logistic distributions
    Args:
        y (Tensor): B x C x T
        log_scale_min (float): Log scale minimum value
    Returns:
        Tensor: sample in range of [-1, 1].
    """
    if log_scale_min is None:
        log_scale_min = float(np.log(1e-14))
    assert y.size(1) % 3 == 0
    nr_mix = y.size(1) // 3

    # B x T x C
    y = y.transpose(1, 2)
    logit_probs = y[:, :, :nr_mix]

    # sample mixture indicator from softmax (Gumbel-max trick)
    temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
    temp = logit_probs.data - torch.log(- torch.log(temp))
    _, argmax = temp.max(dim=-1)

    # (B, T) -> (B, T, nr_mix)
    one_hot = F.one_hot(argmax, nr_mix).float()
    # select logistic parameters
    means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1)
    log_scales = torch.clamp(torch.sum(
        y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min)
    # sample from logistic & clip to interval
    # we don't actually round to the nearest 8bit value when sampling
    u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5)
    x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u))

    x = torch.clamp(torch.clamp(x, min=-1.), max=1.)

    return x

'''
def to_one_hot(tensor, n, fill_with=1.):
    # we perform one-hot encoding with respect to the last axis
    one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_()
    if tensor.is_cuda:
        one_hot = one_hot.cuda()
    one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
    return one_hot'''
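A smoke test for discretized_mix_logistic_loss above: the network output carries 3 * nr_mix channels (mixture logits, means, log-scales) per timestep, and the target is a float waveform in [-1, 1] with a trailing singleton channel, as prepared in voc_train_loop. A minimal sketch with random tensors:

import torch
from utils.distribution import discretized_mix_logistic_loss

B, T, nr_mix = 2, 100, 10
y_hat = torch.randn(B, T, 3 * nr_mix)   # (B, T, C), as WaveRNN emits in MOL mode
y = torch.rand(B, T, 1) * 2 - 1         # target in [-1, 1], unsqueezed as in voc_train_loop
print(discretized_mix_logistic_loss(y_hat, y).item())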