Upload folder using huggingface_hub
Browse files- .gitignore +8 -0
- LICENSE +28 -0
- README.md +35 -0
- acoustic_forced_alignment/.gitignore +4 -0
- acoustic_forced_alignment/README.md +215 -0
- acoustic_forced_alignment/align_tg_words.py +86 -0
- acoustic_forced_alignment/assets/2001000001.lab +1 -0
- acoustic_forced_alignment/assets/2001000001.wav +0 -0
- acoustic_forced_alignment/build_dataset.py +72 -0
- acoustic_forced_alignment/check_tg.py +30 -0
- acoustic_forced_alignment/combine_tg.py +113 -0
- acoustic_forced_alignment/dictionaries/opencpop-extension.txt +601 -0
- acoustic_forced_alignment/distribution.py +14 -0
- acoustic_forced_alignment/enhance_tg.py +214 -0
- acoustic_forced_alignment/reformat_wavs.py +43 -0
- acoustic_forced_alignment/requirements.txt +11 -0
- acoustic_forced_alignment/select_test_set.py +104 -0
- acoustic_forced_alignment/slice_tg.py +99 -0
- acoustic_forced_alignment/summary_pitch.py +70 -0
- acoustic_forced_alignment/validate_labels.py +89 -0
- acoustic_forced_alignment/validate_lengths.py +47 -0
- midi-recognition/README.md +10 -0
- midi-recognition/extract_midi.py +107 -0
- midi-recognition/merge_wavs.py +70 -0
- variance-temp-solution/.gitignore +12 -0
- variance-temp-solution/README.md +178 -0
- variance-temp-solution/add_ph_num.py +80 -0
- variance-temp-solution/assets/.gitkeep +0 -0
- variance-temp-solution/assets/rmvpe/model.pt +3 -0
- variance-temp-solution/convert_ds.py +293 -0
- variance-temp-solution/convert_txt.py +33 -0
- variance-temp-solution/correct_cents.py +171 -0
- variance-temp-solution/eliminate_short.py +91 -0
- variance-temp-solution/estimate_midi.py +88 -0
- variance-temp-solution/get_pitch.py +92 -0
- variance-temp-solution/requirements.txt +5 -0
- variance-temp-solution/rmvpe/__init__.py +1 -0
- variance-temp-solution/rmvpe/constants.py +9 -0
- variance-temp-solution/rmvpe/deepunet.py +173 -0
- variance-temp-solution/rmvpe/inference.py +49 -0
- variance-temp-solution/rmvpe/model.py +32 -0
- variance-temp-solution/rmvpe/seq.py +10 -0
- variance-temp-solution/rmvpe/spec.py +68 -0
- variance-temp-solution/rmvpe/utils.py +43 -0
.gitignore
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.idea
|
| 2 |
+
.vscode
|
| 3 |
+
*.pyc
|
| 4 |
+
__pycache__/
|
| 5 |
+
local_tools/
|
| 6 |
+
/venv/
|
| 7 |
+
|
| 8 |
+
.ipynb_checkpoints/
|
LICENSE
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
BSD 3-Clause License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2023, Team OpenVPI
|
| 4 |
+
|
| 5 |
+
Redistribution and use in source and binary forms, with or without
|
| 6 |
+
modification, are permitted provided that the following conditions are met:
|
| 7 |
+
|
| 8 |
+
1. Redistributions of source code must retain the above copyright notice, this
|
| 9 |
+
list of conditions and the following disclaimer.
|
| 10 |
+
|
| 11 |
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
| 12 |
+
this list of conditions and the following disclaimer in the documentation
|
| 13 |
+
and/or other materials provided with the distribution.
|
| 14 |
+
|
| 15 |
+
3. Neither the name of the copyright holder nor the names of its
|
| 16 |
+
contributors may be used to endorse or promote products derived from
|
| 17 |
+
this software without specific prior written permission.
|
| 18 |
+
|
| 19 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
| 20 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 21 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
| 22 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
| 23 |
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
| 24 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
| 25 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
| 26 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
| 27 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 28 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
README.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MakeDiffSinger
|
| 2 |
+
Pipelines and tools to build your own DiffSinger dataset.
|
| 3 |
+
|
| 4 |
+
For the recommended standard dataset making pipelines, see:
|
| 5 |
+
|
| 6 |
+
- acoustic-forced-alignment: make dataset from scratch with MFA for acoustic model training
|
| 7 |
+
- variance-temp-solution: temporary solution to extend acoustic datasets into variance datasets
|
| 8 |
+
|
| 9 |
+
For other useful pipelines and tools for making a dataset, welcome to raise issues or submit PRs.
|
| 10 |
+
|
| 11 |
+
## DiffSinger dataset structure
|
| 12 |
+
|
| 13 |
+
- dataset1/
|
| 14 |
+
- raw/
|
| 15 |
+
- wavs/
|
| 16 |
+
- recording1.wav
|
| 17 |
+
- recording2.wav
|
| 18 |
+
- ...
|
| 19 |
+
- transcriptions.csv
|
| 20 |
+
- dataset2/
|
| 21 |
+
- raw/
|
| 22 |
+
- wavs/
|
| 23 |
+
- ...
|
| 24 |
+
- transcriptions.csv
|
| 25 |
+
- ...
|
| 26 |
+
|
| 27 |
+
## Essential tools to process and label your datasets
|
| 28 |
+
|
| 29 |
+
Dataset tools now have their own repository: [dataset-tools](https://github.com/openvpi/dataset-tools).
|
| 30 |
+
|
| 31 |
+
There are mainly 3 components:
|
| 32 |
+
|
| 33 |
+
- AudioSlicer: Slice your recordings into short segments
|
| 34 |
+
- MinLabel: Label *.lab files containing word transcriptions for acoustic model training.
|
| 35 |
+
- SlurCutter: Edit MIDI sequence in *.ds files for variance model training.
|
acoustic_forced_alignment/.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
assets/mfa-*/
|
| 2 |
+
assets/*.zip
|
| 3 |
+
segments/
|
| 4 |
+
textgrids/
|
acoustic_forced_alignment/README.md
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Making Datasets from Scratch (Forced Alignment)
|
| 2 |
+
|
| 3 |
+
This pipeline will guide you to build your dataset from raw recordings with MFA (Montreal Forced Aligner).
|
| 4 |
+
|
| 5 |
+
## 0. Requirements
|
| 6 |
+
|
| 7 |
+
This pipeline will require your dictionary having its corresponding MFA pretrained model. You can see currently supported dictionaries and download their MFA models in the table below:
|
| 8 |
+
|
| 9 |
+
| dictionary name | dictionary file | MFA model |
|
| 10 |
+
|:------------------:|:----------------------:|:--------------------------------------------------------------------------------------------:|
|
| 11 |
+
| Opencpop extension | opencpop-extension.txt | [link](https://huggingface.co/datasets/fox7005/tool/resolve/main/mfa-opencpop-extension.zip) |
|
| 12 |
+
|
| 13 |
+
Your recordings must meet the following conditions:
|
| 14 |
+
|
| 15 |
+
1. They must be in one single folder. Files in sub-folders will be ignored.
|
| 16 |
+
2. They must be in WAV format.
|
| 17 |
+
3. They must have a sampling rate higher than 32 kHz.
|
| 18 |
+
4. They should be clean, unaccompanied voices with no significant noise or reverb.
|
| 19 |
+
5. They should contain only voices from one single human.
|
| 20 |
+
|
| 21 |
+
<font color="red">**NOTICE:**</font> Before you train a model, you must obtain permission from the copyright holder of the dataset and make sure the provider is fully aware that you will train a model from their data, that you will or will not distribute the synthesized voices and model weights, and the potential risks of this kind of activity.
|
| 22 |
+
|
| 23 |
+
## 1. Clone repo and install dependencies
|
| 24 |
+
|
| 25 |
+
```bash
|
| 26 |
+
git clone https://github.com/openvpi/MakeDiffSinger.git
|
| 27 |
+
cd MakeDiffSinger/acoustic-forced-alignment
|
| 28 |
+
conda create -n mfa python=3.8 --yes # you must use a Conda environment!
|
| 29 |
+
conda activate mfa
|
| 30 |
+
conda install -c conda-forge montreal-forced-aligner==2.0.6 --yes # install MFA
|
| 31 |
+
pip install -r requirements.txt # install other requirements
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
## 2. Prepare recordings and transcriptions
|
| 35 |
+
|
| 36 |
+
### 2.1 Audio slicing
|
| 37 |
+
|
| 38 |
+
The raw data must be sliced into segments of about 5-15 seconds. We recommend using [AudioSlicer](../README.md#essential-tools-to-process-and-label-your-datasets), a simple GUI application that can automatically slice audio files via silence detection.
|
| 39 |
+
|
| 40 |
+
Run the following command to validate your segment lengths and count the total length of your sliced segments:
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
python validate_lengths.py --dir path/to/your/segments/
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### 2.2 Label the segments
|
| 47 |
+
|
| 48 |
+
All segments should have their transcriptions (or lyrics) annotated. See [assets/2001000001.wav](assets/2001000001.wav) and its corresponding label [assets/2001000001.lab](assets/2001000001.lab) as an example.
|
| 49 |
+
|
| 50 |
+
Each segment should have one annotation file with the same filename as it and `.lab` extension, and placed in the same directory. In the annotation file, you should write all syllables sung or spoken in this segment. Syllables should be split by space, and only syllables that appears in the dictionary are allowed. In addition, all phonemes in the dictionary should be covered in the annotations. Please note that `SP`, `AP` and `<PAD>` should not be included in the labels although they are in your final phoneme set.
|
| 51 |
+
|
| 52 |
+
We developed [MinLabel](../README.md#essential-tools-to-process-and-label-your-datasets), a simple yet efficient tool to help finishing this step.
|
| 53 |
+
|
| 54 |
+
Once you finish labeling, run the following command to validate your labels:
|
| 55 |
+
|
| 56 |
+
```bash
|
| 57 |
+
python validate_labels.py --dir path/to/your/segments/ --dictionary path/to/your/dictionary.txt
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
This will ensure:
|
| 61 |
+
|
| 62 |
+
- All recordings have their corresponding labels.
|
| 63 |
+
- There are no unrecognizable phonemes that does not appear in the dictionary.
|
| 64 |
+
- All phonemes in the dictionary are covered by the labels.
|
| 65 |
+
|
| 66 |
+
If there are failed checks, please fix them and run again.
|
| 67 |
+
|
| 68 |
+
A summary of your phoneme coverage will be generated. If there are some phonemes that have extremely few occurrences (for example, less than 20), it is highly recommended to add more recordings to cover these phonemes.
|
| 69 |
+
|
| 70 |
+
## 3. Forced Alignment
|
| 71 |
+
|
| 72 |
+
### 3.1 Reformat recordings
|
| 73 |
+
|
| 74 |
+
Given the transcriptions of each segment, we are able to align the phoneme sequence to its corresponding audio, thus obtaining position and duration information of each phoneme.
|
| 75 |
+
|
| 76 |
+
We use [Montreal Forced Aligner](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to do forced phoneme alignment.
|
| 77 |
+
|
| 78 |
+
MFA fails on some platforms if the WAVs are not in 16kHz 16bit PCM format. The following command will reformat your recordings and copy the labels to another temporary directory. You may delete those temporary files afterwards.
|
| 79 |
+
|
| 80 |
+
```bash
|
| 81 |
+
python reformat_wavs.py --src path/to/your/segments/ --dst path/to/tmp/dir/
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
NOTE: `--normalize` can be added to normalize the audio files with respect to the peak value of the whole segments. This is especially helpful on aspiration detection during TextGrid enhancement if the original segments are too quite.
|
| 85 |
+
|
| 86 |
+
### 3.2 Run MFA on the corpus
|
| 87 |
+
|
| 88 |
+
MFA will align your labels to your recordings and save the results to TextGrid files.
|
| 89 |
+
|
| 90 |
+
Download the MFA model and run the following command:
|
| 91 |
+
|
| 92 |
+
```bash
|
| 93 |
+
mfa align path/to/your/segments/ path/to/your/dictionary.txt path/to/your/model.zip path/to/your/textgrids/ --beam 100 --clean --overwrite
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
Run the following command to check if all TextGrids are successfully generated:
|
| 97 |
+
|
| 98 |
+
```bash
|
| 99 |
+
python check_tg.py --wavs path/to/your/segments/ --tg path/to/your/textgrids/
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
If the checks above fails, or the results are not good, please try another `--beam` value and run the MFA again. TextGrids generated by MFA are still raw and need further processing, so please do not edit them at this time.
|
| 103 |
+
|
| 104 |
+
### 3.3 Enhance and finish the TextGrids
|
| 105 |
+
|
| 106 |
+
MFA results might not be good on some long utterances. In this section, we:
|
| 107 |
+
|
| 108 |
+
- Try to reduce errors for long utterances
|
| 109 |
+
- Detect `AP`s and add `SP`s which have not been labeled before.
|
| 110 |
+
|
| 111 |
+
Run:
|
| 112 |
+
|
| 113 |
+
```bash
|
| 114 |
+
python enhance_tg.py --wavs path/to/your/segments/ --dictionary path/to/your/dictionary.txt --src path/to/raw/textgrids/ --dst path/to/final/textgrids/
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
NOTE: There are other useful arguments of this script. If you understand them, you can try to get better results through adjusting those parameters.
|
| 118 |
+
|
| 119 |
+
The final TextGrids can be saved for future use.
|
| 120 |
+
|
| 121 |
+
If you are interested in the word-level pitch distribution of your dataset, run the following command:
|
| 122 |
+
|
| 123 |
+
```bash
|
| 124 |
+
python summary_pitch.py --wavs path/to/your/segments/ --tg path/to/final/textgrids/
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
### 3.4 (Optional) Manual TextGrids refinement
|
| 128 |
+
|
| 129 |
+
With steps above, the TextGrids we get contains 2 tiers: the words and the phones. Manual refinement to your TextGrids may take lots of effort but will boost the performance and stability of your model.
|
| 130 |
+
|
| 131 |
+
This section is a recommended (but not required) way to refine your TextGrids manually. Before you start, an additional dependency to achieve natural sorting needs to be installed:
|
| 132 |
+
|
| 133 |
+
```bash
|
| 134 |
+
pip install natsort
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
#### 3.4.1 Combine the recordings and TextGrids
|
| 138 |
+
|
| 139 |
+
A full dataset can contain hundreds or thousands of auto-sliced recording segments and their corresponding TextGrids. The following command will combine them into long ones:
|
| 140 |
+
|
| 141 |
+
```bash
|
| 142 |
+
python combine_tg.py --wavs path/to/your/segments/ --tg path/to/your/final/textgrids/ --out path/to/your/combined/textgrids/
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
This will combine all items with same name except their suffixes and add a `sentences` tier in the combined TextGrids. The new sentences tier controls how the long combined recordings are split into short sentences. If you have other suffix pattern (default: `"_\d+"`) or want to change the bit-depth (default: PCM_16) of the combined recordings, see `python combine_tg.py --help`.
|
| 146 |
+
|
| 147 |
+
#### 3.4.2 Manual editing
|
| 148 |
+
|
| 149 |
+
TextGrids can be viewed and edited with [Praat](https://github.com/praat/praat) or [vLabeler](https://github.com/sdercolin/vlabeler) (recommended).
|
| 150 |
+
|
| 151 |
+
The editing mainly involves the sentences tier and the phones tier. When editing, please ensure the sentences tier is aligned with the words and phones tier; but it is not required to align the words tier to the phones tier. If you want to remove a sentence or not to include one area in any sentences, just leave an empty mark on that area.
|
| 152 |
+
|
| 153 |
+
#### 3.4.3 Slice the recordings and TextGrids
|
| 154 |
+
|
| 155 |
+
After manual editing is finished, the words tier can be automatically re-aligned to the phones tier. Run:
|
| 156 |
+
|
| 157 |
+
```bash
|
| 158 |
+
python align_tg_words.py --tg path/to/your/combined/textgrids --dictionary path/to/your/dictionary.txt --overwrite
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
NOTE 1: This will overwrite your TextGrid files. You can back them up before running the command, or specify another output directory with `--out` option.
|
| 162 |
+
|
| 163 |
+
NOTE 2: This script is also compatible with segmented 2-tier TextGrids.
|
| 164 |
+
|
| 165 |
+
Then the TextGrids and recordings can be sliced according to the boundaries stored in the sentences tiers. Run:
|
| 166 |
+
|
| 167 |
+
```bash
|
| 168 |
+
python slice_tg.py --wavs path/to/your/combined/textgrids/ --out path/to/your/sliced/textgrids/refined/
|
| 169 |
+
```
|
| 170 |
+
|
| 171 |
+
By default, the output segments will be re-numbered like `item_000`, `item_001`, ..., `item_XXX`. If you want to use the marks stored in the sentences tier as the filenames, or want to change the bit-depth (default: PCM_16) of the sliced recordings, or control other behaviors, see `python slice_tg.py --help`.
|
| 172 |
+
|
| 173 |
+
Now you can use these manually refined and re-sliced TextGrids and recordings for further steps.
|
| 174 |
+
|
| 175 |
+
## 4. Build the final dataset
|
| 176 |
+
|
| 177 |
+
The TextGrids need to be collected into a transcriptions.csv file as the final transcriptions. The CSV file will include the following columns:
|
| 178 |
+
|
| 179 |
+
- name: the segment name
|
| 180 |
+
- ph_seq: the phoneme sequence
|
| 181 |
+
- ph_dur: the phoneme duration
|
| 182 |
+
|
| 183 |
+
The recordings will be arranged like [this](../README.md#diffsinger-dataset-structure).
|
| 184 |
+
|
| 185 |
+
Run:
|
| 186 |
+
|
| 187 |
+
```bash
|
| 188 |
+
python build_dataset.py --wavs path/to/your/segments/ --tg path/to/final/textgrids/ --dataset path/to/your/dataset/
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
NOTE 1: This will insert random silence parts around each segments by default for better `SP` stability. If you do not need these silence parts, for example, if your TextGrids have been manually refined, please use the `--skip_silence_insertion` option.
|
| 192 |
+
|
| 193 |
+
NOTE 2: `--wav_subtype` can be used to specify the bit-depth of the saved WAV files. Options are `PCM_16` (default), `PCM_24`, `PCM_32`, `FLOAT`, and `DOUBLE`.
|
| 194 |
+
|
| 195 |
+
After doing all things above, you should put it into data/ of the DiffSinger main repository. Now, your dataset can be used to train DiffSinger acoustic models. If you want to train DiffSinger variance models, please follow instructions [here](../variance-temp-solution/README.md).
|
| 196 |
+
|
| 197 |
+
## 5. Write configuration file
|
| 198 |
+
|
| 199 |
+
Copy the template configration file from `configs/templates` in the DiffSinger repository to your data folder, or a new folder if working with multi-speaker model. Specify required fields in the configurations, check `DiffSinger/docs/ConfigurationSchemas.md` for help on the meanings of those fields.
|
| 200 |
+
|
| 201 |
+
For automatic validation set selection, you can leave the following field as empty. If the field is not empty, the script will prompt a overwrite confirmation later.
|
| 202 |
+
```yaml
|
| 203 |
+
...
|
| 204 |
+
test_prefixes:
|
| 205 |
+
...
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
+
And run:
|
| 209 |
+
```bash
|
| 210 |
+
python select_test_set.py path/to/your/config.yaml [--rel_path <PATH>]
|
| 211 |
+
```
|
| 212 |
+
|
| 213 |
+
NOTE 1: `--rel_path` is probably necessary if there are relative paths in your config file. If only absolute paths exist in it, you can omit this argument.
|
| 214 |
+
|
| 215 |
+
NOTE 2: There are other useful arguments of this script. You can use them to change the total number of validation samples.
|
acoustic_forced_alignment/align_tg_words.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
import click
|
| 4 |
+
import textgrid
|
| 5 |
+
import tqdm
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@click.command(help='Align words tiers in TextGrids to phones tiers')
|
| 9 |
+
@click.option('--tg', required=True, help='Path to TextGrids (2-tier or 3-tier format)')
|
| 10 |
+
@click.option('--dictionary', required=True, help='Path to the dictionary file')
|
| 11 |
+
@click.option(
|
| 12 |
+
'--out', required=False,
|
| 13 |
+
help='Path to save the aligned TextGrids (defaults to the input directory)'
|
| 14 |
+
)
|
| 15 |
+
@click.option('--overwrite', is_flag=True, help='Overwrite existing files')
|
| 16 |
+
def align_tg_words(tg, dictionary, out, overwrite):
|
| 17 |
+
tg_path_in = pathlib.Path(tg)
|
| 18 |
+
dict_path = pathlib.Path(dictionary)
|
| 19 |
+
tg_path_out = pathlib.Path(out) if out is not None else tg_path_in
|
| 20 |
+
tg_path_out.mkdir(parents=True, exist_ok=True)
|
| 21 |
+
|
| 22 |
+
with open(dict_path, 'r', encoding='utf8') as f:
|
| 23 |
+
rules = [ln.strip().split('\t') for ln in f.readlines()]
|
| 24 |
+
dictionary = {
|
| 25 |
+
'SP': ['SP'],
|
| 26 |
+
'AP': ['AP']
|
| 27 |
+
}
|
| 28 |
+
phoneme_set = {'SP', 'AP'}
|
| 29 |
+
for r in rules:
|
| 30 |
+
phonemes = r[1].split()
|
| 31 |
+
dictionary[r[0]] = phonemes
|
| 32 |
+
phoneme_set.update(phonemes)
|
| 33 |
+
|
| 34 |
+
for tgfile in tqdm.tqdm(tg_path_in.glob('*.TextGrid')):
|
| 35 |
+
tg = textgrid.TextGrid()
|
| 36 |
+
tg.read(tgfile)
|
| 37 |
+
old_words_tier: textgrid.IntervalTier = tg[-2]
|
| 38 |
+
if old_words_tier.name != 'words':
|
| 39 |
+
raise ValueError(
|
| 40 |
+
f'Invalid tier name or order in \'{tgfile}\'. '
|
| 41 |
+
f'The words tier should be the 1st tier of a 2-tier TextGrid, '
|
| 42 |
+
f'or the 2nd tier of a 3-tier TextGrid.'
|
| 43 |
+
)
|
| 44 |
+
phones_tier: textgrid.IntervalTier = tg[-1]
|
| 45 |
+
new_words_tier = textgrid.IntervalTier(name='words')
|
| 46 |
+
word_seq = [i.mark for i in old_words_tier]
|
| 47 |
+
word_div = []
|
| 48 |
+
ph_seq = [i.mark for i in phones_tier]
|
| 49 |
+
ph_dur = [i.duration() for i in phones_tier]
|
| 50 |
+
idx = 0
|
| 51 |
+
for i, word in enumerate(word_seq):
|
| 52 |
+
if word not in dictionary:
|
| 53 |
+
raise ValueError(f'Error invalid word in \'{tgfile}\' at {i}: {word}')
|
| 54 |
+
word_ph_seq = dictionary[word]
|
| 55 |
+
ph_num = len(word_ph_seq)
|
| 56 |
+
word_div.append(ph_num)
|
| 57 |
+
if word_ph_seq != ph_seq[idx: idx + ph_num]:
|
| 58 |
+
print(
|
| 59 |
+
f'Warning: word and phones mismatch in \'{tgfile}\' '
|
| 60 |
+
f'at word {i}, phone {idx}: {word} => {ph_seq[idx: idx + len(word_ph_seq)]}'
|
| 61 |
+
)
|
| 62 |
+
idx += ph_num
|
| 63 |
+
for i, phone in enumerate(ph_seq):
|
| 64 |
+
if phone not in phoneme_set:
|
| 65 |
+
raise ValueError(f'Error: invalid phone in \'{tgfile}\' at {i}: {phone}')
|
| 66 |
+
if sum(word_div) != len(ph_dur):
|
| 67 |
+
raise ValueError(
|
| 68 |
+
f'Error: word_div does not sum to number of phones in \'{tgfile}\'. '
|
| 69 |
+
f'Check the warnings above for more detailed mismatching positions.'
|
| 70 |
+
)
|
| 71 |
+
start = 0.
|
| 72 |
+
idx = 0
|
| 73 |
+
for j in range(len(word_seq)):
|
| 74 |
+
end = start + sum(ph_dur[idx: idx + word_div[j]])
|
| 75 |
+
new_words_tier.add(minTime=start, maxTime=end, mark=word_seq[j])
|
| 76 |
+
start = end
|
| 77 |
+
idx += word_div[j]
|
| 78 |
+
tg.tiers[-2] = new_words_tier
|
| 79 |
+
tg_file_out = tg_path_out / tgfile.name
|
| 80 |
+
if tg_file_out.exists() and not overwrite:
|
| 81 |
+
raise FileExistsError(str(tg_file_out))
|
| 82 |
+
tg.write(tg_file_out)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
if __name__ == '__main__':
|
| 86 |
+
align_tg_words()
|
acoustic_forced_alignment/assets/2001000001.lab
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
gan shou ting zai wo fa duan de zhi jian
|
acoustic_forced_alignment/assets/2001000001.wav
ADDED
|
Binary file (360 kB). View file
|
|
|
acoustic_forced_alignment/build_dataset.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import pathlib
|
| 3 |
+
import random
|
| 4 |
+
|
| 5 |
+
import click
|
| 6 |
+
import librosa
|
| 7 |
+
import numpy as np
|
| 8 |
+
import soundfile
|
| 9 |
+
import tqdm
|
| 10 |
+
from textgrid import TextGrid
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@click.command(help='Collect phoneme alignments into transcriptions.csv')
|
| 14 |
+
@click.option('--wavs', required=True, help='Path to the segments directory')
|
| 15 |
+
@click.option('--tg', required=True, help='Path to the final TextGrids directory')
|
| 16 |
+
@click.option('--dataset', required=True, help='Path to dataset directory')
|
| 17 |
+
@click.option('--skip_silence_insertion', is_flag=True, show_default=True,
|
| 18 |
+
help='Do not insert silence around segments')
|
| 19 |
+
@click.option('--wav_subtype', default="PCM_16", show_default=True,
|
| 20 |
+
help='WAV subtype')
|
| 21 |
+
def build_dataset(wavs, tg, dataset, skip_silence_insertion, wav_subtype):
|
| 22 |
+
wavs = pathlib.Path(wavs)
|
| 23 |
+
tg_dir = pathlib.Path(tg)
|
| 24 |
+
del tg
|
| 25 |
+
dataset = pathlib.Path(dataset)
|
| 26 |
+
filelist = list(wavs.glob('*.wav'))
|
| 27 |
+
|
| 28 |
+
dataset.mkdir(parents=True, exist_ok=True)
|
| 29 |
+
(dataset / 'wavs').mkdir(exist_ok=True)
|
| 30 |
+
transcriptions = []
|
| 31 |
+
samplerate = 44100
|
| 32 |
+
min_sil = int(0.1 * samplerate)
|
| 33 |
+
max_sil = int(0.5 * samplerate)
|
| 34 |
+
for wavfile in tqdm.tqdm(filelist):
|
| 35 |
+
y, _ = librosa.load(wavfile, sr=samplerate, mono=True)
|
| 36 |
+
tgfile = tg_dir / wavfile.with_suffix('.TextGrid').name
|
| 37 |
+
tg = TextGrid()
|
| 38 |
+
tg.read(str(tgfile))
|
| 39 |
+
ph_seq = [ph.mark for ph in tg[1]]
|
| 40 |
+
ph_dur = [ph.maxTime - ph.minTime for ph in tg[1]]
|
| 41 |
+
if not skip_silence_insertion:
|
| 42 |
+
if random.random() < 0.5:
|
| 43 |
+
len_sil = random.randrange(min_sil, max_sil)
|
| 44 |
+
y = np.concatenate((np.zeros((len_sil,), dtype=np.float32), y))
|
| 45 |
+
if ph_seq[0] == 'SP':
|
| 46 |
+
ph_dur[0] += len_sil / samplerate
|
| 47 |
+
else:
|
| 48 |
+
ph_seq.insert(0, 'SP')
|
| 49 |
+
ph_dur.insert(0, len_sil / samplerate)
|
| 50 |
+
if random.random() < 0.5:
|
| 51 |
+
len_sil = random.randrange(min_sil, max_sil)
|
| 52 |
+
y = np.concatenate((y, np.zeros((len_sil,), dtype=np.float32)))
|
| 53 |
+
if ph_seq[-1] == 'SP':
|
| 54 |
+
ph_dur[-1] += len_sil / samplerate
|
| 55 |
+
else:
|
| 56 |
+
ph_seq.append('SP')
|
| 57 |
+
ph_dur.append(len_sil / samplerate)
|
| 58 |
+
ph_seq = ' '.join(ph_seq)
|
| 59 |
+
ph_dur = ' '.join([str(round(d, 6)) for d in ph_dur])
|
| 60 |
+
soundfile.write(dataset / 'wavs' / wavfile.name, y, samplerate, subtype=wav_subtype)
|
| 61 |
+
transcriptions.append({'name': wavfile.stem, 'ph_seq': ph_seq, 'ph_dur': ph_dur})
|
| 62 |
+
|
| 63 |
+
with open(dataset / 'transcriptions.csv', 'w', encoding='utf8', newline='') as f:
|
| 64 |
+
writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur'])
|
| 65 |
+
writer.writeheader()
|
| 66 |
+
writer.writerows(transcriptions)
|
| 67 |
+
|
| 68 |
+
print(f'All wavs and transcriptions saved in {dataset}')
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
if __name__ == '__main__':
|
| 72 |
+
build_dataset()
|
acoustic_forced_alignment/check_tg.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
import click
|
| 4 |
+
import tqdm
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@click.command('Check if all TextGrids are generated')
|
| 8 |
+
@click.option('--wavs', required=True, help='Path to the segments directory')
|
| 9 |
+
@click.option('--tg', required=True, help='Path to the TextGrids directory')
|
| 10 |
+
def check_tg(wavs, tg):
|
| 11 |
+
wavs = pathlib.Path(wavs)
|
| 12 |
+
tg = pathlib.Path(tg)
|
| 13 |
+
missing = []
|
| 14 |
+
filelist = list(wavs.glob('*.wav'))
|
| 15 |
+
for wavfile in tqdm.tqdm(filelist):
|
| 16 |
+
tgfile = tg / wavfile.with_suffix('.TextGrid').name
|
| 17 |
+
if not tgfile.exists():
|
| 18 |
+
missing.append(tgfile)
|
| 19 |
+
if len(missing) > 0:
|
| 20 |
+
print(
|
| 21 |
+
'These TextGrids are missing! There are possible severe errors in labels of those corresponding segments. '
|
| 22 |
+
'If you do believe there are no errors, consider increase the \'--beam\' argument for MFA.')
|
| 23 |
+
for fn in missing:
|
| 24 |
+
print(f' - {fn}')
|
| 25 |
+
else:
|
| 26 |
+
print('All alignments have been successfully generated.')
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
if __name__ == '__main__':
|
| 30 |
+
check_tg()
|
acoustic_forced_alignment/combine_tg.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
import re
|
| 3 |
+
from typing import Dict, List
|
| 4 |
+
|
| 5 |
+
import click
|
| 6 |
+
import librosa
|
| 7 |
+
import natsort
|
| 8 |
+
import numpy
|
| 9 |
+
import soundfile
|
| 10 |
+
import textgrid
|
| 11 |
+
import tqdm
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def remove_suffix(string, suffix_pattern):
    """Strip a trailing substring matching *suffix_pattern* from *string*.

    :param string: the input string (here: a file stem like ``'song_01'``).
    :param suffix_pattern: a regex pattern; it is anchored at the end of
        the string with ``$`` before searching.
    :return: *string* with the matched suffix removed, or *string*
        unchanged when the pattern does not match at the end.
    """
    match = re.search(f'{suffix_pattern}$', string)
    # Also guard against a zero-length match (e.g. pattern '_\\d*' on a
    # stem with no suffix): string[:-0] would wrongly return ''.
    if not match or not match.group():
        return string
    return string[:-len(match.group())]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@click.command(help='Combine segmented 2-tier TextGrids and wavs into 3-tier TextGrids and long wavs')
@click.option(
    '--wavs', required=True,
    help='Directory containing the segmented wav files'
)
@click.option(
    '--tg', required=False,
    help='Directory containing the segmented TextGrid files (defaults to wav directory)'
)
@click.option(
    '--out', required=True,
    help='Path to output directory for combined files'
)
@click.option(
    '--suffix', required=False, default=r'_\d+',
    help='Filename suffix pattern for file combination'
)
@click.option(
    '--wav_subtype', required=False, default='PCM_16',
    help='Wav subtype (defaults to PCM_16)'
)
@click.option(
    '--overwrite', is_flag=True,
    help='Overwrite existing files'
)
def combine_tg(wavs, tg, out, suffix, wav_subtype, overwrite):
    """Merge segment-level wavs and 2-tier TextGrids back into long files.

    Segments are grouped by file stem after removing the ``--suffix``
    pattern; each group is concatenated (in natural sort order) into one
    wav plus one 3-tier TextGrid (sentences / words / phones), where the
    'sentences' tier records the original segment stems.
    """
    wav_path_in = pathlib.Path(wavs)
    # When --tg is omitted, TextGrids are expected next to the wavs.
    tg_path_in = wav_path_in if tg is None else pathlib.Path(tg)
    del tg  # the name 'tg' is reused below for the output TextGrid object
    combined_path_out = pathlib.Path(out)
    combined_path_out.mkdir(parents=True, exist_ok=True)
    # Group segment TextGrids by their common stem (suffix removed).
    filelist: Dict[str, List[pathlib.Path]] = {}
    for tg_file in tg_path_in.glob('*.TextGrid'):
        stem = remove_suffix(tg_file.stem, suffix)
        if stem not in filelist:
            filelist[stem] = [tg_file]
        else:
            filelist[stem].append(tg_file)
    for name, files in tqdm.tqdm(sorted(filelist.items(), key=lambda kv: kv[0])):
        wav_segments = []
        tg = textgrid.TextGrid()
        sentences_tier = textgrid.IntervalTier(name='sentences')
        words_tier = textgrid.IntervalTier(name='words')
        phones_tier = textgrid.IntervalTier(name='phones')
        sentence_start = 0.  # running offset (seconds) of the current segment
        sr = None  # samplerate of the first segment; all must agree
        for tg_file in natsort.natsorted(files):
            wav_file = (wav_path_in / tg_file.name).with_suffix('.wav')
            # sr=None keeps each wav's native samplerate.
            waveform, sr_ = librosa.load(wav_file, sr=None)
            if sr is None:
                sr = sr_
            else:
                assert sr_ == sr, f'Cannot combine \'{tg_file.stem}\': incompatible samplerate ({sr_} != {sr})'
            sentence_end = waveform.shape[0] / sr + sentence_start
            wav_segments.append(waveform)
            sentences_tier.add(minTime=sentence_start, maxTime=sentence_end, mark=wav_file.stem)
            sentence_tg = textgrid.TextGrid()
            sentence_tg.read(tg_file)
            # Copy tier 0 (words), shifting each interval by the running
            # offset; the last interval is snapped to sentence_end so the
            # tier exactly covers the audio (avoids float drift).
            start = sentence_start
            for j, word in enumerate(sentence_tg[0]):
                if j == len(sentence_tg[0]) - 1:
                    end = sentence_end
                else:
                    end = start + word.duration()
                words_tier.add(minTime=start, maxTime=end, mark=word.mark)
                start = end
            # Same for tier 1 (phones).
            start = sentence_start
            for j, phone in enumerate(sentence_tg[1]):
                if j == len(sentence_tg[1]) - 1:
                    end = sentence_end
                else:
                    end = start + phone.duration()
                phones_tier.add(minTime=start, maxTime=end, mark=phone.mark)
                start = end
            sentence_start = sentence_end
        tg.append(sentences_tier)
        tg.append(words_tier)
        tg.append(phones_tier)

        tg_file_out = combined_path_out / f'{name}.TextGrid'
        wav_file_out = tg_file_out.with_suffix('.wav')
        # Refuse to clobber existing outputs unless --overwrite was given.
        if wav_file_out.exists() and not overwrite:
            raise FileExistsError(str(wav_file_out))
        if tg_file_out.exists() and not overwrite:
            raise FileExistsError(str(tg_file_out))

        tg.write(tg_file_out)
        full_wav = numpy.concatenate(wav_segments)
        soundfile.write(wav_file_out, full_wav, samplerate=sr, subtype=wav_subtype)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
if __name__ == '__main__':
    # Script entry point: let click parse CLI arguments and run the command.
    combine_tg()
|
acoustic_forced_alignment/dictionaries/opencpop-extension.txt
ADDED
|
@@ -0,0 +1,601 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
a a
|
| 2 |
+
ai ai
|
| 3 |
+
an an
|
| 4 |
+
ang ang
|
| 5 |
+
ao ao
|
| 6 |
+
ba b a
|
| 7 |
+
bai b ai
|
| 8 |
+
ban b an
|
| 9 |
+
bang b ang
|
| 10 |
+
bao b ao
|
| 11 |
+
be b e
|
| 12 |
+
bei b ei
|
| 13 |
+
ben b en
|
| 14 |
+
beng b eng
|
| 15 |
+
ber b er
|
| 16 |
+
bi b i
|
| 17 |
+
bia b ia
|
| 18 |
+
bian b ian
|
| 19 |
+
biang b iang
|
| 20 |
+
biao b iao
|
| 21 |
+
bie b ie
|
| 22 |
+
bin b in
|
| 23 |
+
bing b ing
|
| 24 |
+
biong b iong
|
| 25 |
+
biu b iu
|
| 26 |
+
bo b o
|
| 27 |
+
bong b ong
|
| 28 |
+
bou b ou
|
| 29 |
+
bu b u
|
| 30 |
+
bua b ua
|
| 31 |
+
buai b uai
|
| 32 |
+
buan b uan
|
| 33 |
+
buang b uang
|
| 34 |
+
bui b ui
|
| 35 |
+
bun b un
|
| 36 |
+
bv b v
|
| 37 |
+
bve b ve
|
| 38 |
+
ca c a
|
| 39 |
+
cai c ai
|
| 40 |
+
can c an
|
| 41 |
+
cang c ang
|
| 42 |
+
cao c ao
|
| 43 |
+
ce c e
|
| 44 |
+
cei c ei
|
| 45 |
+
cen c en
|
| 46 |
+
ceng c eng
|
| 47 |
+
cer c er
|
| 48 |
+
cha ch a
|
| 49 |
+
chai ch ai
|
| 50 |
+
chan ch an
|
| 51 |
+
chang ch ang
|
| 52 |
+
chao ch ao
|
| 53 |
+
che ch e
|
| 54 |
+
chei ch ei
|
| 55 |
+
chen ch en
|
| 56 |
+
cheng ch eng
|
| 57 |
+
cher ch er
|
| 58 |
+
chi ch ir
|
| 59 |
+
chong ch ong
|
| 60 |
+
chou ch ou
|
| 61 |
+
chu ch u
|
| 62 |
+
chua ch ua
|
| 63 |
+
chuai ch uai
|
| 64 |
+
chuan ch uan
|
| 65 |
+
chuang ch uang
|
| 66 |
+
chui ch ui
|
| 67 |
+
chun ch un
|
| 68 |
+
chuo ch uo
|
| 69 |
+
chv ch v
|
| 70 |
+
chyi ch i
|
| 71 |
+
ci c i0
|
| 72 |
+
cong c ong
|
| 73 |
+
cou c ou
|
| 74 |
+
cu c u
|
| 75 |
+
cua c ua
|
| 76 |
+
cuai c uai
|
| 77 |
+
cuan c uan
|
| 78 |
+
cuang c uang
|
| 79 |
+
cui c ui
|
| 80 |
+
cun c un
|
| 81 |
+
cuo c uo
|
| 82 |
+
cv c v
|
| 83 |
+
cyi c i
|
| 84 |
+
da d a
|
| 85 |
+
dai d ai
|
| 86 |
+
dan d an
|
| 87 |
+
dang d ang
|
| 88 |
+
dao d ao
|
| 89 |
+
de d e
|
| 90 |
+
dei d ei
|
| 91 |
+
den d en
|
| 92 |
+
deng d eng
|
| 93 |
+
der d er
|
| 94 |
+
di d i
|
| 95 |
+
dia d ia
|
| 96 |
+
dian d ian
|
| 97 |
+
diang d iang
|
| 98 |
+
diao d iao
|
| 99 |
+
die d ie
|
| 100 |
+
din d in
|
| 101 |
+
ding d ing
|
| 102 |
+
diong d iong
|
| 103 |
+
diu d iu
|
| 104 |
+
dong d ong
|
| 105 |
+
dou d ou
|
| 106 |
+
du d u
|
| 107 |
+
dua d ua
|
| 108 |
+
duai d uai
|
| 109 |
+
duan d uan
|
| 110 |
+
duang d uang
|
| 111 |
+
dui d ui
|
| 112 |
+
dun d un
|
| 113 |
+
duo d uo
|
| 114 |
+
dv d v
|
| 115 |
+
dve d ve
|
| 116 |
+
e e
|
| 117 |
+
ei ei
|
| 118 |
+
en en
|
| 119 |
+
eng eng
|
| 120 |
+
er er
|
| 121 |
+
fa f a
|
| 122 |
+
fai f ai
|
| 123 |
+
fan f an
|
| 124 |
+
fang f ang
|
| 125 |
+
fao f ao
|
| 126 |
+
fe f e
|
| 127 |
+
fei f ei
|
| 128 |
+
fen f en
|
| 129 |
+
feng f eng
|
| 130 |
+
fer f er
|
| 131 |
+
fi f i
|
| 132 |
+
fia f ia
|
| 133 |
+
fian f ian
|
| 134 |
+
fiang f iang
|
| 135 |
+
fiao f iao
|
| 136 |
+
fie f ie
|
| 137 |
+
fin f in
|
| 138 |
+
fing f ing
|
| 139 |
+
fiong f iong
|
| 140 |
+
fiu f iu
|
| 141 |
+
fo f o
|
| 142 |
+
fong f ong
|
| 143 |
+
fou f ou
|
| 144 |
+
fu f u
|
| 145 |
+
fua f ua
|
| 146 |
+
fuai f uai
|
| 147 |
+
fuan f uan
|
| 148 |
+
fuang f uang
|
| 149 |
+
fui f ui
|
| 150 |
+
fun f un
|
| 151 |
+
fv f v
|
| 152 |
+
fve f ve
|
| 153 |
+
ga g a
|
| 154 |
+
gai g ai
|
| 155 |
+
gan g an
|
| 156 |
+
gang g ang
|
| 157 |
+
gao g ao
|
| 158 |
+
ge g e
|
| 159 |
+
gei g ei
|
| 160 |
+
gen g en
|
| 161 |
+
geng g eng
|
| 162 |
+
ger g er
|
| 163 |
+
gi g i
|
| 164 |
+
gia g ia
|
| 165 |
+
gian g ian
|
| 166 |
+
giang g iang
|
| 167 |
+
giao g iao
|
| 168 |
+
gie g ie
|
| 169 |
+
gin g in
|
| 170 |
+
ging g ing
|
| 171 |
+
giong g iong
|
| 172 |
+
giu g iu
|
| 173 |
+
gong g ong
|
| 174 |
+
gou g ou
|
| 175 |
+
gu g u
|
| 176 |
+
gua g ua
|
| 177 |
+
guai g uai
|
| 178 |
+
guan g uan
|
| 179 |
+
guang g uang
|
| 180 |
+
gui g ui
|
| 181 |
+
gun g un
|
| 182 |
+
guo g uo
|
| 183 |
+
gv g v
|
| 184 |
+
gve g ve
|
| 185 |
+
ha h a
|
| 186 |
+
hai h ai
|
| 187 |
+
han h an
|
| 188 |
+
hang h ang
|
| 189 |
+
hao h ao
|
| 190 |
+
he h e
|
| 191 |
+
hei h ei
|
| 192 |
+
hen h en
|
| 193 |
+
heng h eng
|
| 194 |
+
her h er
|
| 195 |
+
hi h i
|
| 196 |
+
hia h ia
|
| 197 |
+
hian h ian
|
| 198 |
+
hiang h iang
|
| 199 |
+
hiao h iao
|
| 200 |
+
hie h ie
|
| 201 |
+
hin h in
|
| 202 |
+
hing h ing
|
| 203 |
+
hiong h iong
|
| 204 |
+
hiu h iu
|
| 205 |
+
hong h ong
|
| 206 |
+
hou h ou
|
| 207 |
+
hu h u
|
| 208 |
+
hua h ua
|
| 209 |
+
huai h uai
|
| 210 |
+
huan h uan
|
| 211 |
+
huang h uang
|
| 212 |
+
hui h ui
|
| 213 |
+
hun h un
|
| 214 |
+
huo h uo
|
| 215 |
+
hv h v
|
| 216 |
+
hve h ve
|
| 217 |
+
ji j i
|
| 218 |
+
jia j ia
|
| 219 |
+
jian j ian
|
| 220 |
+
jiang j iang
|
| 221 |
+
jiao j iao
|
| 222 |
+
jie j ie
|
| 223 |
+
jin j in
|
| 224 |
+
jing j ing
|
| 225 |
+
jiong j iong
|
| 226 |
+
jiu j iu
|
| 227 |
+
ju j v
|
| 228 |
+
juan j van
|
| 229 |
+
jue j ve
|
| 230 |
+
jun j vn
|
| 231 |
+
ka k a
|
| 232 |
+
kai k ai
|
| 233 |
+
kan k an
|
| 234 |
+
kang k ang
|
| 235 |
+
kao k ao
|
| 236 |
+
ke k e
|
| 237 |
+
kei k ei
|
| 238 |
+
ken k en
|
| 239 |
+
keng k eng
|
| 240 |
+
ker k er
|
| 241 |
+
ki k i
|
| 242 |
+
kia k ia
|
| 243 |
+
kian k ian
|
| 244 |
+
kiang k iang
|
| 245 |
+
kiao k iao
|
| 246 |
+
kie k ie
|
| 247 |
+
kin k in
|
| 248 |
+
king k ing
|
| 249 |
+
kiong k iong
|
| 250 |
+
kiu k iu
|
| 251 |
+
kong k ong
|
| 252 |
+
kou k ou
|
| 253 |
+
ku k u
|
| 254 |
+
kua k ua
|
| 255 |
+
kuai k uai
|
| 256 |
+
kuan k uan
|
| 257 |
+
kuang k uang
|
| 258 |
+
kui k ui
|
| 259 |
+
kun k un
|
| 260 |
+
kuo k uo
|
| 261 |
+
kv k v
|
| 262 |
+
kve k ve
|
| 263 |
+
la l a
|
| 264 |
+
lai l ai
|
| 265 |
+
lan l an
|
| 266 |
+
lang l ang
|
| 267 |
+
lao l ao
|
| 268 |
+
le l e
|
| 269 |
+
lei l ei
|
| 270 |
+
len l en
|
| 271 |
+
leng l eng
|
| 272 |
+
ler l er
|
| 273 |
+
li l i
|
| 274 |
+
lia l ia
|
| 275 |
+
lian l ian
|
| 276 |
+
liang l iang
|
| 277 |
+
liao l iao
|
| 278 |
+
lie l ie
|
| 279 |
+
lin l in
|
| 280 |
+
ling l ing
|
| 281 |
+
liong l iong
|
| 282 |
+
liu l iu
|
| 283 |
+
lo l o
|
| 284 |
+
long l ong
|
| 285 |
+
lou l ou
|
| 286 |
+
lu l u
|
| 287 |
+
lua l ua
|
| 288 |
+
luai l uai
|
| 289 |
+
luan l uan
|
| 290 |
+
luang l uang
|
| 291 |
+
lui l ui
|
| 292 |
+
lun l un
|
| 293 |
+
luo l uo
|
| 294 |
+
lv l v
|
| 295 |
+
lve l ve
|
| 296 |
+
ma m a
|
| 297 |
+
mai m ai
|
| 298 |
+
man m an
|
| 299 |
+
mang m ang
|
| 300 |
+
mao m ao
|
| 301 |
+
me m e
|
| 302 |
+
mei m ei
|
| 303 |
+
men m en
|
| 304 |
+
meng m eng
|
| 305 |
+
mer m er
|
| 306 |
+
mi m i
|
| 307 |
+
mia m ia
|
| 308 |
+
mian m ian
|
| 309 |
+
miang m iang
|
| 310 |
+
miao m iao
|
| 311 |
+
mie m ie
|
| 312 |
+
min m in
|
| 313 |
+
ming m ing
|
| 314 |
+
miong m iong
|
| 315 |
+
miu m iu
|
| 316 |
+
mo m o
|
| 317 |
+
mong m ong
|
| 318 |
+
mou m ou
|
| 319 |
+
mu m u
|
| 320 |
+
mua m ua
|
| 321 |
+
muai m uai
|
| 322 |
+
muan m uan
|
| 323 |
+
muang m uang
|
| 324 |
+
mui m ui
|
| 325 |
+
mun m un
|
| 326 |
+
mv m v
|
| 327 |
+
mve m ve
|
| 328 |
+
na n a
|
| 329 |
+
nai n ai
|
| 330 |
+
nan n an
|
| 331 |
+
nang n ang
|
| 332 |
+
nao n ao
|
| 333 |
+
ne n e
|
| 334 |
+
nei n ei
|
| 335 |
+
nen n en
|
| 336 |
+
neng n eng
|
| 337 |
+
ner n er
|
| 338 |
+
ni n i
|
| 339 |
+
nia n ia
|
| 340 |
+
nian n ian
|
| 341 |
+
niang n iang
|
| 342 |
+
niao n iao
|
| 343 |
+
nie n ie
|
| 344 |
+
nin n in
|
| 345 |
+
ning n ing
|
| 346 |
+
niong n iong
|
| 347 |
+
niu n iu
|
| 348 |
+
nong n ong
|
| 349 |
+
nou n ou
|
| 350 |
+
nu n u
|
| 351 |
+
nua n ua
|
| 352 |
+
nuai n uai
|
| 353 |
+
nuan n uan
|
| 354 |
+
nuang n uang
|
| 355 |
+
nui n ui
|
| 356 |
+
nun n un
|
| 357 |
+
nuo n uo
|
| 358 |
+
nv n v
|
| 359 |
+
nve n ve
|
| 360 |
+
o o
|
| 361 |
+
ong ong
|
| 362 |
+
ou ou
|
| 363 |
+
pa p a
|
| 364 |
+
pai p ai
|
| 365 |
+
pan p an
|
| 366 |
+
pang p ang
|
| 367 |
+
pao p ao
|
| 368 |
+
pe p e
|
| 369 |
+
pei p ei
|
| 370 |
+
pen p en
|
| 371 |
+
peng p eng
|
| 372 |
+
per p er
|
| 373 |
+
pi p i
|
| 374 |
+
pia p ia
|
| 375 |
+
pian p ian
|
| 376 |
+
piang p iang
|
| 377 |
+
piao p iao
|
| 378 |
+
pie p ie
|
| 379 |
+
pin p in
|
| 380 |
+
ping p ing
|
| 381 |
+
piong p iong
|
| 382 |
+
piu p iu
|
| 383 |
+
po p o
|
| 384 |
+
pong p ong
|
| 385 |
+
pou p ou
|
| 386 |
+
pu p u
|
| 387 |
+
pua p ua
|
| 388 |
+
puai p uai
|
| 389 |
+
puan p uan
|
| 390 |
+
puang p uang
|
| 391 |
+
pui p ui
|
| 392 |
+
pun p un
|
| 393 |
+
pv p v
|
| 394 |
+
pve p ve
|
| 395 |
+
qi q i
|
| 396 |
+
qia q ia
|
| 397 |
+
qian q ian
|
| 398 |
+
qiang q iang
|
| 399 |
+
qiao q iao
|
| 400 |
+
qie q ie
|
| 401 |
+
qin q in
|
| 402 |
+
qing q ing
|
| 403 |
+
qiong q iong
|
| 404 |
+
qiu q iu
|
| 405 |
+
qu q v
|
| 406 |
+
quan q van
|
| 407 |
+
que q ve
|
| 408 |
+
qun q vn
|
| 409 |
+
ra r a
|
| 410 |
+
rai r ai
|
| 411 |
+
ran r an
|
| 412 |
+
rang r ang
|
| 413 |
+
rao r ao
|
| 414 |
+
re r e
|
| 415 |
+
rei r ei
|
| 416 |
+
ren r en
|
| 417 |
+
reng r eng
|
| 418 |
+
rer r er
|
| 419 |
+
ri r ir
|
| 420 |
+
rong r ong
|
| 421 |
+
rou r ou
|
| 422 |
+
ru r u
|
| 423 |
+
rua r ua
|
| 424 |
+
ruai r uai
|
| 425 |
+
ruan r uan
|
| 426 |
+
ruang r uang
|
| 427 |
+
rui r ui
|
| 428 |
+
run r un
|
| 429 |
+
ruo r uo
|
| 430 |
+
rv r v
|
| 431 |
+
ryi r i
|
| 432 |
+
sa s a
|
| 433 |
+
sai s ai
|
| 434 |
+
san s an
|
| 435 |
+
sang s ang
|
| 436 |
+
sao s ao
|
| 437 |
+
se s e
|
| 438 |
+
sei s ei
|
| 439 |
+
sen s en
|
| 440 |
+
seng s eng
|
| 441 |
+
ser s er
|
| 442 |
+
sha sh a
|
| 443 |
+
shai sh ai
|
| 444 |
+
shan sh an
|
| 445 |
+
shang sh ang
|
| 446 |
+
shao sh ao
|
| 447 |
+
she sh e
|
| 448 |
+
shei sh ei
|
| 449 |
+
shen sh en
|
| 450 |
+
sheng sh eng
|
| 451 |
+
sher sh er
|
| 452 |
+
shi sh ir
|
| 453 |
+
shong sh ong
|
| 454 |
+
shou sh ou
|
| 455 |
+
shu sh u
|
| 456 |
+
shua sh ua
|
| 457 |
+
shuai sh uai
|
| 458 |
+
shuan sh uan
|
| 459 |
+
shuang sh uang
|
| 460 |
+
shui sh ui
|
| 461 |
+
shun sh un
|
| 462 |
+
shuo sh uo
|
| 463 |
+
shv sh v
|
| 464 |
+
shyi sh i
|
| 465 |
+
si s i0
|
| 466 |
+
song s ong
|
| 467 |
+
sou s ou
|
| 468 |
+
su s u
|
| 469 |
+
sua s ua
|
| 470 |
+
suai s uai
|
| 471 |
+
suan s uan
|
| 472 |
+
suang s uang
|
| 473 |
+
sui s ui
|
| 474 |
+
sun s un
|
| 475 |
+
suo s uo
|
| 476 |
+
sv s v
|
| 477 |
+
syi s i
|
| 478 |
+
ta t a
|
| 479 |
+
tai t ai
|
| 480 |
+
tan t an
|
| 481 |
+
tang t ang
|
| 482 |
+
tao t ao
|
| 483 |
+
te t e
|
| 484 |
+
tei t ei
|
| 485 |
+
ten t en
|
| 486 |
+
teng t eng
|
| 487 |
+
ter t er
|
| 488 |
+
ti t i
|
| 489 |
+
tia t ia
|
| 490 |
+
tian t ian
|
| 491 |
+
tiang t iang
|
| 492 |
+
tiao t iao
|
| 493 |
+
tie t ie
|
| 494 |
+
tin t in
|
| 495 |
+
ting t ing
|
| 496 |
+
tiong t iong
|
| 497 |
+
tong t ong
|
| 498 |
+
tou t ou
|
| 499 |
+
tu t u
|
| 500 |
+
tua t ua
|
| 501 |
+
tuai t uai
|
| 502 |
+
tuan t uan
|
| 503 |
+
tuang t uang
|
| 504 |
+
tui t ui
|
| 505 |
+
tun t un
|
| 506 |
+
tuo t uo
|
| 507 |
+
tv t v
|
| 508 |
+
tve t ve
|
| 509 |
+
wa w a
|
| 510 |
+
wai w ai
|
| 511 |
+
wan w an
|
| 512 |
+
wang w ang
|
| 513 |
+
wao w ao
|
| 514 |
+
we w e
|
| 515 |
+
wei w ei
|
| 516 |
+
wen w en
|
| 517 |
+
weng w eng
|
| 518 |
+
wer w er
|
| 519 |
+
wi w i
|
| 520 |
+
wo w o
|
| 521 |
+
wong w ong
|
| 522 |
+
wou w ou
|
| 523 |
+
wu w u
|
| 524 |
+
xi x i
|
| 525 |
+
xia x ia
|
| 526 |
+
xian x ian
|
| 527 |
+
xiang x iang
|
| 528 |
+
xiao x iao
|
| 529 |
+
xie x ie
|
| 530 |
+
xin x in
|
| 531 |
+
xing x ing
|
| 532 |
+
xiong x iong
|
| 533 |
+
xiu x iu
|
| 534 |
+
xu x v
|
| 535 |
+
xuan x van
|
| 536 |
+
xue x ve
|
| 537 |
+
xun x vn
|
| 538 |
+
ya y a
|
| 539 |
+
yai y ai
|
| 540 |
+
yan y En
|
| 541 |
+
yang y ang
|
| 542 |
+
yao y ao
|
| 543 |
+
ye y E
|
| 544 |
+
yei y ei
|
| 545 |
+
yi y i
|
| 546 |
+
yin y in
|
| 547 |
+
ying y ing
|
| 548 |
+
yo y o
|
| 549 |
+
yong y ong
|
| 550 |
+
you y ou
|
| 551 |
+
yu y v
|
| 552 |
+
yuan y van
|
| 553 |
+
yue y ve
|
| 554 |
+
yun y vn
|
| 555 |
+
ywu y u
|
| 556 |
+
za z a
|
| 557 |
+
zai z ai
|
| 558 |
+
zan z an
|
| 559 |
+
zang z ang
|
| 560 |
+
zao z ao
|
| 561 |
+
ze z e
|
| 562 |
+
zei z ei
|
| 563 |
+
zen z en
|
| 564 |
+
zeng z eng
|
| 565 |
+
zer z er
|
| 566 |
+
zha zh a
|
| 567 |
+
zhai zh ai
|
| 568 |
+
zhan zh an
|
| 569 |
+
zhang zh ang
|
| 570 |
+
zhao zh ao
|
| 571 |
+
zhe zh e
|
| 572 |
+
zhei zh ei
|
| 573 |
+
zhen zh en
|
| 574 |
+
zheng zh eng
|
| 575 |
+
zher zh er
|
| 576 |
+
zhi zh ir
|
| 577 |
+
zhong zh ong
|
| 578 |
+
zhou zh ou
|
| 579 |
+
zhu zh u
|
| 580 |
+
zhua zh ua
|
| 581 |
+
zhuai zh uai
|
| 582 |
+
zhuan zh uan
|
| 583 |
+
zhuang zh uang
|
| 584 |
+
zhui zh ui
|
| 585 |
+
zhun zh un
|
| 586 |
+
zhuo zh uo
|
| 587 |
+
zhv zh v
|
| 588 |
+
zhyi zh i
|
| 589 |
+
zi z i0
|
| 590 |
+
zong z ong
|
| 591 |
+
zou z ou
|
| 592 |
+
zu z u
|
| 593 |
+
zua z ua
|
| 594 |
+
zuai z uai
|
| 595 |
+
zuan z uan
|
| 596 |
+
zuang z uang
|
| 597 |
+
zui z ui
|
| 598 |
+
zun z un
|
| 599 |
+
zuo z uo
|
| 600 |
+
zv z v
|
| 601 |
+
zyi z i
|
acoustic_forced_alignment/distribution.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import matplotlib.pyplot as plt
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def draw_distribution(title, x_label, y_label, items: list, values: list, zoom=0.8):
    """Render a labeled bar chart of *values* over categorical *items*.

    The figure width scales with the number of items (``len(items) * zoom``
    inches, height fixed at 10), and each bar is annotated with its value.
    The chart is drawn on the current matplotlib figure; nothing is returned.

    :param title: chart title.
    :param x_label: label for the x axis.
    :param y_label: label for the y axis.
    :param items: category names, one per bar.
    :param values: bar heights, parallel to *items*.
    :param zoom: horizontal inches allotted per item.
    """
    plt.figure(figsize=(int(len(items) * zoom), 10))
    plt.bar(x=items, height=values)
    plt.tick_params(labelsize=15)
    plt.xlim(-1, len(items))
    # Annotate every bar with its numeric value, centered just above the top.
    for item, value in zip(items, values):
        plt.text(item, value, value, ha='center', va='bottom', fontsize=15)
    plt.grid()
    plt.title(title, fontsize=30)
    plt.xlabel(x_label, fontsize=20)
    plt.ylabel(y_label, fontsize=20)
|
acoustic_forced_alignment/enhance_tg.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
import click
|
| 4 |
+
import librosa
|
| 5 |
+
import numpy as np
|
| 6 |
+
import parselmouth as pm
|
| 7 |
+
import textgrid as tg
|
| 8 |
+
import tqdm
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@click.command(help='Enhance and finish the TextGrids')
@click.option('--wavs', required=True, help='Path to the segments directory')
@click.option('--dictionary', required=True, help='Path to the dictionary file')
@click.option('--src', required=True, help='Path to the raw TextGrids directory')
@click.option('--dst', required=True, help='Path to the final TextGrids directory')
@click.option('--f0_min', type=float, default=40., show_default=True, help='Minimum value of pitch')
@click.option('--f0_max', type=float, default=1100., show_default=True, help='Maximum value of pitch')
@click.option('--br_len', type=float, default=0.1, show_default=True,
              help='Minimum length of breath in seconds')
@click.option('--br_db', type=float, default=-60., show_default=True,
              help='Threshold of RMS in dB for detecting breath')
@click.option('--br_centroid', type=float, default=2000., show_default=True,
              help='Threshold of spectral centroid in Hz for detecting breath')
@click.option('--time_step', type=float, default=0.005, show_default=True,
              help='Time step for feature extraction')
@click.option('--min_space', type=float, default=0.04, show_default=True,
              help='Minimum length of space in seconds')
@click.option('--voicing_thresh_vowel', type=float, default=0.45, show_default=True,
              help='Threshold of voicing for fixing long utterances')
@click.option('--voicing_thresh_breath', type=float, default=0.6, show_default=True,
              help='Threshold of voicing for detecting breath')
@click.option('--br_win_sz', type=float, default=0.05, show_default=True,
              help='Size of sliding window in seconds for detecting breath')
def enhance_tg(
        wavs, dictionary, src, dst,
        f0_min, f0_max, br_len, br_db, br_centroid,
        time_step, min_space, voicing_thresh_vowel, voicing_thresh_breath, br_win_sz
):
    """Post-process MFA TextGrids: extend voiced tails, mark breaths (AP) and spaces (SP).

    For each wav in --wavs with a matching TextGrid in --src, three in-place
    passes run over the parallel word/phone tiers before writing to --dst:
    1) extend the previous word/phone into a following gap while it is still
       voiced; 2) split long gaps into aspiration ('AP') and unlabeled parts
       using voicing, RMS and spectral-centroid thresholds; 3) label
       remaining gaps >= --min_space as 'SP' and merge away shorter ones.
    Both tiers are kept index-aligned throughout (i indexes words, j phones).
    """
    wavs = pathlib.Path(wavs)
    dict_path = pathlib.Path(dictionary)
    src = pathlib.Path(src)
    dst = pathlib.Path(dst)
    dst.mkdir(parents=True, exist_ok=True)

    # Dictionary file format: one '<word>\t<phoneme> <phoneme> ...' per line.
    with open(dict_path, 'r', encoding='utf8') as f:
        rules = [ln.strip().split('\t') for ln in f.readlines()]
    dictionary = {}  # word mark -> list of phonemes (drives the j stride below)
    phoneme_set = set()
    for r in rules:
        phonemes = r[1].split()
        dictionary[r[0]] = phonemes
        phoneme_set.update(phonemes)

    filelist = list(wavs.glob('*.wav'))
    for wavfile in tqdm.tqdm(filelist):
        tgfile = src / wavfile.with_suffix('.TextGrid').name
        textgrid = tg.TextGrid()
        textgrid.read(str(tgfile))
        words = textgrid[0]   # tier 0: word intervals
        phones = textgrid[1]  # tier 1: phone intervals
        sound = pm.Sound(str(wavfile))
        # Two F0 tracks with different voicing thresholds: a stricter one
        # for breath detection, a looser one for extending voiced vowels.
        f0_voicing_breath = sound.to_pitch_ac(
            time_step=time_step,
            voicing_threshold=voicing_thresh_breath,
            pitch_floor=f0_min,
            pitch_ceiling=f0_max,
        ).selected_array['frequency']
        f0_voicing_vowel = sound.to_pitch_ac(
            time_step=time_step,
            voicing_threshold=voicing_thresh_vowel,
            pitch_floor=f0_min,
            pitch_ceiling=f0_max,
        ).selected_array['frequency']
        y, sr = librosa.load(wavfile, sr=24000, mono=True)
        hop_size = int(time_step * sr)  # align centroid frames to time_step
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=2048, hop_length=hop_size).squeeze(0)

        # Fix long utterances
        # Pass 1: while a gap after a word still has voiced F0, keep moving
        # the boundary right in time_step increments (tail was cut short).
        i = j = 0
        while i < len(words):
            word = words[i]
            phone = phones[j]
            if word.mark is not None and word.mark != '':
                # Labeled word: skip it and its phonemes on the phone tier.
                i += 1
                j += len(dictionary[word.mark])
                continue
            if i == 0:
                # A gap at the very start has no previous word to extend.
                i += 1
                j += 1
                continue
            prev_word = words[i - 1]
            prev_phone = phones[j - 1]
            # Extend length of long utterances
            while word.minTime < word.maxTime - time_step:
                pos = min(f0_voicing_vowel.shape[0] - 1, int(word.minTime / time_step))
                if f0_voicing_vowel[pos] < f0_min:
                    break  # unvoiced frame reached: stop extending
                prev_word.maxTime += time_step
                prev_phone.maxTime += time_step
                word.minTime += time_step
                phone.minTime += time_step
            i += 1
            j += 1

        # Detect aspiration
        # Pass 2: inside each sufficiently long gap, slide a br_win_sz
        # window; a run of windows that is unvoiced yet loud enough becomes
        # a candidate breath, confirmed by its mean spectral centroid.
        i = j = 0
        while i < len(words):
            word = words[i]
            phone = phones[j]
            if word.mark is not None and word.mark != '':
                i += 1
                j += len(dictionary[word.mark])
                continue
            if word.maxTime - word.minTime < br_len:
                # Gap too short to contain a breath of minimum length.
                i += 1
                j += 1
                continue
            ap_ranges = []   # confirmed (start, end) breath spans in this gap
            br_start = None  # start of the candidate run, None when not in one
            win_pos = word.minTime
            while win_pos + br_win_sz <= word.maxTime:
                # Window counts as breath-like if fully unvoiced...
                all_noisy = (f0_voicing_breath[
                             int(win_pos / time_step): int((win_pos + br_win_sz) / time_step)] < f0_min).all()
                # ...and its RMS (clipped to avoid log(0)) is above br_db.
                rms_db = 20 * np.log10(
                    np.clip(sound.get_rms(from_time=win_pos, to_time=win_pos + br_win_sz), a_min=1e-12, a_max=1))
                # print(win_pos, win_pos + br_win_sz, all_noisy, rms_db)
                if all_noisy and rms_db >= br_db:
                    if br_start is None:
                        br_start = win_pos
                else:
                    if br_start is not None:
                        # Candidate run ended: validate length and centroid.
                        br_end = win_pos + br_win_sz - time_step
                        if br_end - br_start >= br_len:
                            centroid = spectral_centroid[int(br_start / time_step): int(br_end / time_step)].mean()
                            if centroid >= br_centroid:
                                ap_ranges.append((br_start, br_end))
                        br_start = None
                        win_pos = br_end
                win_pos += time_step
            if br_start is not None:
                # Flush a candidate run still open at the end of the gap.
                br_end = win_pos + br_win_sz - time_step
                if br_end - br_start >= br_len:
                    centroid = spectral_centroid[int(br_start / time_step): int(br_end / time_step)].mean()
                    if centroid >= br_centroid:
                        ap_ranges.append((br_start, br_end))
            # print(ap_ranges)
            if len(ap_ranges) == 0:
                i += 1
                j += 1
                continue
            # Replace the single gap interval with an alternation of
            # unlabeled intervals and 'AP' intervals on both tiers.
            words.removeInterval(word)
            phones.removeInterval(phone)
            if word.minTime < ap_ranges[0][0]:
                words.add(minTime=word.minTime, maxTime=ap_ranges[0][0], mark=None)
                phones.add(minTime=phone.minTime, maxTime=ap_ranges[0][0], mark=None)
                i += 1
                j += 1
            for k, ap in enumerate(ap_ranges):
                if k > 0:
                    # Gap between two consecutive breaths.
                    words.add(minTime=ap_ranges[k - 1][1], maxTime=ap[0], mark=None)
                    phones.add(minTime=ap_ranges[k - 1][1], maxTime=ap[0], mark=None)
                    i += 1
                    j += 1
                words.add(minTime=ap[0], maxTime=min(word.maxTime, ap[1]), mark='AP')
                phones.add(minTime=ap[0], maxTime=min(word.maxTime, ap[1]), mark='AP')
                i += 1
                j += 1
            if ap_ranges[-1][1] < word.maxTime:
                words.add(minTime=ap_ranges[-1][1], maxTime=word.maxTime, mark=None)
                phones.add(minTime=ap_ranges[-1][1], maxTime=phone.maxTime, mark=None)
                i += 1
                j += 1

        # Remove short spaces
        # Pass 3: label remaining gaps >= min_space as 'SP'; shorter gaps
        # are absorbed into the neighboring interval(s) on both tiers.
        i = j = 0
        while i < len(words):
            word = words[i]
            phone = phones[j]
            if word.mark is not None and word.mark != '':
                i += 1
                # 'AP' occupies exactly one phone; dictionary words occupy
                # as many phones as their pronunciation has.
                j += (1 if word.mark == 'AP' else len(dictionary[word.mark]))
                continue
            if word.maxTime - word.minTime >= min_space:
                word.mark = 'SP'
                phone.mark = 'SP'
                i += 1
                j += 1
                continue
            if i == 0:
                # Short leading gap: give its span to the next interval.
                if len(words) >= 2:
                    words[i + 1].minTime = word.minTime
                    phones[j + 1].minTime = phone.minTime
                    words.removeInterval(word)
                    phones.removeInterval(phone)
                else:
                    break  # the tier is a single short gap; nothing to merge
            elif i == len(words) - 1:
                # Short trailing gap: give its span to the previous interval.
                if len(words) >= 2:
                    words[i - 1].maxTime = word.maxTime
                    phones[j - 1].maxTime = phone.maxTime
                    words.removeInterval(word)
                    phones.removeInterval(phone)
                else:
                    break
            else:
                # Short interior gap: split its span between both neighbors
                # at the midpoint. i/j are NOT advanced: after removal the
                # next interval shifts into position i/j.
                words[i - 1].maxTime = words[i + 1].minTime = (word.minTime + word.maxTime) / 2
                phones[j - 1].maxTime = phones[j + 1].minTime = (phone.minTime + phone.maxTime) / 2
                words.removeInterval(word)
                phones.removeInterval(phone)
        textgrid.write(str(dst / tgfile.name))
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
if __name__ == '__main__':
    # Script entry point: let click parse CLI arguments and run the command.
    enhance_tg()
|
acoustic_forced_alignment/reformat_wavs.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
import shutil
|
| 3 |
+
|
| 4 |
+
import click
|
| 5 |
+
import librosa
|
| 6 |
+
import numpy as np
|
| 7 |
+
import soundfile
|
| 8 |
+
import tqdm
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@click.command(help='Reformat the WAV files to 16kHz, 16bit PCM mono format and copy labels')
@click.option('--src', required=True, help='Source segments directory')
@click.option('--dst', required=True, help='Target segments directory')
@click.option(
    '--normalize',
    is_flag=True, show_default=True, default=False,
    help='Normalize the audio (peak calculated over all segments)'
)
def reformat_wavs(src, dst, normalize):
    """Resample every wav in *src* to 16 kHz mono 16-bit PCM in *dst*, copying .lab labels along."""
    src = pathlib.Path(src).resolve()
    dst = pathlib.Path(dst).resolve()
    assert src != dst, 'src and dst should not be the same path'
    assert src.is_dir() and (not dst.exists() or dst.is_dir()), 'src and dst must be directories'
    dst.mkdir(parents=True, exist_ok=True)

    sr = 16000
    wav_files = list(src.glob('*.wav'))

    # Scaling divisor: 1.0 (a no-op) unless --normalize was given, in which case
    # it is the global peak over all segments plus a small headroom margin.
    peak = 1.0
    if normalize:
        peak = 0.0
        for wav_file in tqdm.tqdm(wav_files):
            samples, _ = librosa.load(wav_file, sr=sr, mono=True)
            peak = max(peak, np.max(np.abs(samples)))
        peak += 0.01  # headroom so the scaled signal stays strictly below full scale

    for wav_file in tqdm.tqdm(wav_files):
        samples, _ = librosa.load(wav_file, sr=sr, mono=True)
        soundfile.write((dst / wav_file.name), samples / peak, sr, subtype='PCM_16')
        # Each segment is expected to have a sibling .lab transcription file.
        shutil.copy(wav_file.with_suffix('.lab'), dst)
    print('Reformatting and copying done.')


if __name__ == '__main__':
    reformat_wavs()
|
acoustic_forced_alignment/requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
biopython==1.78
|
| 2 |
+
click
|
| 3 |
+
librosa<0.10.0
|
| 4 |
+
matplotlib
|
| 5 |
+
praatio<6.0.0
|
| 6 |
+
praat-parselmouth
|
| 7 |
+
pyyaml
|
| 8 |
+
soundfile
|
| 9 |
+
sox
|
| 10 |
+
sqlalchemy==1.4.46
|
| 11 |
+
textgrid
|
acoustic_forced_alignment/select_test_set.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import random
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import click
|
| 7 |
+
import yaml
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# noinspection PyShadowingBuiltins
@click.command(help='Randomly select test samples')
@click.argument(
    'config',
    type=click.Path(file_okay=True, dir_okay=False, resolve_path=True, writable=True, path_type=Path),
    metavar="CONFIG"
)
@click.option(
    '--rel_path',
    type=click.Path(file_okay=False, dir_okay=True, resolve_path=True, path_type=Path),
    default=None,
    help='Path that is relative to the paths mentioned in the config file.'
)
@click.option(
    '--min', '_min',
    show_default=True,
    type=click.IntRange(min=1),
    default=10,
    help='Minimum number of test samples.'
)
@click.option(
    '--max', '_max',
    show_default=True,
    type=click.IntRange(min=1),
    default=20,
    help='Maximum number of test samples (note that each speaker will have at least one test sample).'
)
@click.option(
    '--per_speaker',
    show_default=True,
    type=click.IntRange(min=1),
    default=4,
    help='Expected number of test samples per speaker.'
)
def select_test_set(config, rel_path, _min, _max, per_speaker):
    """Randomly choose test prefixes per speaker and write them back into CONFIG.

    Samples are drawn evenly across speakers (each speaker gets at least one),
    with the total clamped into [min, max]. The chosen prefixes are stored in
    the config under 'test_prefixes' and 'num_valid_plots' is updated to match.
    """
    assert _min <= _max, 'min must be smaller or equal to max'
    with open(config, 'r', encoding='utf8') as f:
        hparams = yaml.safe_load(f)

    # Fix: removed a dead `spk_map = None` assignment that was immediately
    # superseded by `spk_map = {}` below before any use.
    spk_ids = hparams['spk_ids']
    speakers = hparams['speakers']
    raw_data_dirs = list(map(Path, hparams['raw_data_dir']))
    assert isinstance(speakers, list), 'Speakers must be a list'
    assert len(speakers) == len(raw_data_dirs), \
        'Number of raw data dirs must equal number of speaker names!'
    if not spk_ids:
        spk_ids = list(range(len(raw_data_dirs)))
    else:
        assert len(spk_ids) == len(raw_data_dirs), \
            'Length of explicitly given spk_ids must equal the number of raw datasets.'
        assert max(spk_ids) < hparams['num_spk'], \
            f'Index in spk_id sequence {spk_ids} is out of range. All values should be smaller than num_spk.'

    # spk_map validates that one speaker name is never mapped to two different IDs;
    # path_spk_map groups dataset dirs by speaker ID.
    spk_map = {}
    path_spk_map = defaultdict(list)
    for ds_id, (spk_name, raw_path, spk_id) in enumerate(zip(speakers, raw_data_dirs, spk_ids)):
        if spk_name in spk_map and spk_map[spk_name] != spk_id:
            raise ValueError(f'Invalid speaker ID assignment. Name \'{spk_name}\' is assigned '
                             f'with different speaker IDs: {spk_map[spk_name]} and {spk_id}.')
        spk_map[spk_name] = spk_id
        path_spk_map[spk_id].append((ds_id, rel_path / raw_path if rel_path else raw_path))

    training_cases = []
    for spk_raw_dirs in path_spk_map.values():
        training_case = []
        # training cases from the same speaker are grouped together
        for ds_id, raw_data_dir in spk_raw_dirs:
            with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    # Only count transcription rows whose wav actually exists on disk.
                    if (raw_data_dir / 'wavs' / f'{row["name"]}.wav').exists():
                        training_case.append(f'{ds_id}:{row["name"]}')
        training_cases.append(training_case)

    test_prefixes = []
    total = min(_max, max(_min, per_speaker * len(training_cases)))
    # Split `total` as evenly as possible: the first `remainder` speakers get one extra.
    quotient, remainder = total // len(training_cases), total % len(training_cases)
    if quotient == 0:
        # More speakers than the allowed total: still give every speaker one sample.
        test_counts = [1] * len(training_cases)
    else:
        test_counts = [quotient + 1] * remainder + [quotient] * (len(training_cases) - remainder)
    for i, count in enumerate(test_counts):
        # NOTE(review): raises ValueError if a speaker has fewer cases than `count` — intentional fail-fast.
        test_prefixes += sorted(random.sample(training_cases[i], count))
    if not hparams['test_prefixes'] or click.confirm('Overwrite existing test prefixes?', abort=False):
        hparams['test_prefixes'] = test_prefixes
        hparams['num_valid_plots'] = len(test_prefixes)
        with open(config, 'w', encoding='utf8') as f:
            yaml.dump(hparams, f, sort_keys=False)
        print('Test prefixes saved.')
    else:
        print('Test prefixes not saved, aborted.')


if __name__ == '__main__':
    select_test_set()
|
acoustic_forced_alignment/slice_tg.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
import click
|
| 4 |
+
import librosa
|
| 5 |
+
import soundfile
|
| 6 |
+
import textgrid
|
| 7 |
+
import tqdm
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@click.command(help='Slice 3-tier TextGrids and long recordings into segmented 2-tier TextGrids and wavs')
@click.option(
    '--wavs', required=True,
    help='Directory containing the segmented wav files'
)
@click.option(
    '--tg', required=False,
    help='Directory containing the segmented TextGrid files (defaults to wav directory)'
)
@click.option(
    '--out', required=True,
    help='Path to output directory for combined files'
)
@click.option(
    '--preserve_sentence_names', is_flag=True,
    help='Whether to use sentence marks as filenames (will be re-numbered by default)'
)
@click.option(
    '--digits', required=False, type=int, default=3,
    help='Number of suffix digits (defaults to 3, will be padded with zeros on the left)'
)
@click.option(
    '--wav_subtype', required=False, default='PCM_16',
    help='Wav subtype (defaults to PCM_16)'
)
@click.option(
    '--overwrite', is_flag=True,
    help='Overwrite existing files'
)
def slice_tg(wavs, tg, out, preserve_sentence_names, digits, wav_subtype, overwrite):
    """Cut each 3-tier (sentences/words/phones) TextGrid and its recording into
    one 2-tier (words/phones) TextGrid plus wav per non-empty sentence interval.

    Tier times in the sliced TextGrids are shifted so each sentence starts at 0.
    """
    wav_path_in = pathlib.Path(wavs)
    tg_path_in = wav_path_in if tg is None else pathlib.Path(tg)
    del tg  # `tg` is reused below as a TextGrid object; drop the raw option value
    sliced_path_out = pathlib.Path(out)
    sliced_path_out.mkdir(parents=True, exist_ok=True)
    for tg_file in tqdm.tqdm(tg_path_in.glob('*.TextGrid')):
        tg = textgrid.TextGrid()
        tg.read(tg_file)
        # Load the paired recording at its native sample rate (sr=None).
        wav, sr = librosa.load((wav_path_in / tg_file.name).with_suffix('.wav'), sr=None)
        # Expected tier layout of the input: 0=sentences, 1=words, 2=phones.
        sentences_tier = tg[0]
        words_tier = tg[1]
        phones_tier = tg[2]
        idx = 0  # running number for re-numbered output filenames
        for sentence in sentences_tier:
            # Intervals with an empty mark are gaps between sentences — skip them.
            if sentence.mark == '':
                continue
            sentence_tg = textgrid.TextGrid()
            sentence_words_tier = textgrid.IntervalTier(name='words')
            sentence_phones_tier = textgrid.IntervalTier(name='phones')
            for word in words_tier:
                # Clip the word interval to the sentence window; skip if no overlap.
                min_time = max(sentence.minTime, word.minTime)
                max_time = min(sentence.maxTime, word.maxTime)
                if min_time >= max_time:
                    continue
                # Shift times so the sentence starts at 0 in the sliced grid.
                sentence_words_tier.add(
                    minTime=min_time - sentence.minTime, maxTime=max_time - sentence.minTime, mark=word.mark
                )
            for phone in phones_tier:
                # Same clip-and-shift for the phone tier.
                min_time = max(sentence.minTime, phone.minTime)
                max_time = min(sentence.maxTime, phone.maxTime)
                if min_time >= max_time:
                    continue
                sentence_phones_tier.add(
                    minTime=min_time - sentence.minTime, maxTime=max_time - sentence.minTime, mark=phone.mark
                )
            sentence_tg.append(sentence_words_tier)
            sentence_tg.append(sentence_phones_tier)

            if preserve_sentence_names:
                # Use the sentence mark itself as the output stem.
                tg_file_out = sliced_path_out / f'{sentence.mark}.TextGrid'
                wav_file_out = tg_file_out.with_suffix('.wav')
            else:
                # Re-number: <input stem>_<zero-padded index>.
                tg_file_out = sliced_path_out / f'{tg_file.stem}_{str(idx).zfill(digits)}.TextGrid'
                wav_file_out = tg_file_out.with_suffix('.wav')
            if tg_file_out.exists() and not overwrite:
                raise FileExistsError(str(tg_file_out))
            if wav_file_out.exists() and not overwrite:
                raise FileExistsError(str(wav_file_out))

            sentence_tg.write(tg_file_out)
            # Slice the waveform for this sentence (+1 sample, clamped to the buffer end).
            sentence_wav = wav[int(sentence.minTime * sr): min(wav.shape[0], int(sentence.maxTime * sr) + 1)]
            soundfile.write(
                wav_file_out,
                sentence_wav, samplerate=sr, subtype=wav_subtype
            )
            idx += 1


if __name__ == '__main__':
    slice_tg()
|
acoustic_forced_alignment/summary_pitch.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
import click
|
| 4 |
+
import librosa
|
| 5 |
+
import matplotlib.pyplot as plt
|
| 6 |
+
import numpy as np
|
| 7 |
+
import parselmouth as pm
|
| 8 |
+
import tqdm
|
| 9 |
+
from textgrid import TextGrid
|
| 10 |
+
|
| 11 |
+
import distribution
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@click.command(help='Generate word-level pitch summary')
@click.option('--wavs', required=True, help='Path to the segments directory')
@click.option('--tg', required=True, help='Path to the TextGrids directory')
def summary_pitch(wavs, tg):
    """Estimate the dominant MIDI pitch of every word and plot a histogram of the results."""
    wavs = pathlib.Path(wavs)
    tg_dir = pathlib.Path(tg)
    del tg  # `tg` option consumed; avoid accidental reuse of the raw value

    # Histogram: integer MIDI key -> number of words whose dominant pitch is that key.
    histogram = {}
    f0_min = 40.
    f0_max = 1100.
    voicing_thresh_vowel = 0.45
    timestep = 0.01  # analysis frame hop in seconds
    for wav_file in tqdm.tqdm(list(wavs.glob('*.wav'))):
        grid = TextGrid()
        grid.read(tg_dir / wav_file.with_suffix('.TextGrid').name)
        f0 = pm.Sound(str(wav_file)).to_pitch_ac(
            time_step=timestep,
            voicing_threshold=voicing_thresh_vowel,
            pitch_floor=f0_min,
            pitch_ceiling=f0_max,
        ).selected_array['frequency']
        # Hz -> fractional MIDI note numbers; unvoiced frames (f0 == 0) map to
        # -inf and are filtered out by the `>= 0` extraction below.
        midi_curve = 12. * np.log2(f0 / 440.) + 69.
        for word in grid[0]:
            if word.mark in ['AP', 'SP']:
                continue
            if word.maxTime - word.minTime < timestep:
                continue
            frames = midi_curve[int(word.minTime / timestep): int(word.maxTime / timestep)]
            frames = np.extract(frames >= 0, frames)
            if frames.shape[0] == 0:
                continue
            # Most frequent integer MIDI key within the word wins.
            dominant = np.bincount(frames.astype(np.int64)).argmax()
            histogram[dominant] = histogram.get(dominant, 0) + 1

    # Plot a contiguous key range so gaps show up as zero-height bars.
    observed = sorted(histogram.keys())
    midi_keys = list(range(observed[0], observed[-1] + 1))
    distribution.draw_distribution(
        title='Pitch Distribution Summary',
        x_label='Pitch',
        y_label='Number of occurrences',
        items=[librosa.midi_to_note(k) for k in midi_keys],
        values=[histogram.get(k, 0) for k in midi_keys]
    )
    pitch_summary = wavs / 'pitch_distribution.jpg'
    plt.savefig(fname=pitch_summary,
                bbox_inches='tight',
                pad_inches=0.25)
    print(f'Pitch distribution summary saved to {pitch_summary}')


if __name__ == '__main__':
    summary_pitch()
|
acoustic_forced_alignment/validate_labels.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
import click
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import tqdm
|
| 6 |
+
|
| 7 |
+
import distribution
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# noinspection PyShadowingBuiltins
@click.command(help='Validate transcription labels')
@click.option('--dir', required=True, help='Path to the segments directory')
@click.option('--dictionary', required=True, help='Path to the dictionary file')
def validate_labels(dir, dictionary):
    """Check every .lab annotation against the dictionary and summarize phoneme coverage.

    Reports missing or empty annotation files and out-of-vocabulary syllables,
    lists phonemes no recording covers, then saves a phoneme occurrence chart
    into the segments directory.
    """
    # Load dictionary: each line is '<syllable>\t<space-separated phonemes>'.
    dict_path = pathlib.Path(dictionary)
    with open(dict_path, 'r', encoding='utf8') as f:
        rules = [ln.strip().split('\t') for ln in f.readlines()]
    dictionary = {}
    phoneme_set = set()
    for r in rules:
        phonemes = r[1].split()
        dictionary[r[0]] = phonemes
        phoneme_set.update(phonemes)

    # Run checks
    check_failed = False
    covered = set()
    # phoneme -> number of occurrences across all annotations
    phoneme_map = {ph: 0 for ph in sorted(phoneme_set)}

    segments_dir = pathlib.Path(dir)
    filelist = list(segments_dir.glob('*.wav'))

    for file in tqdm.tqdm(filelist):
        filename = file.stem
        annotation = file.with_suffix('.lab')
        if not annotation.exists():
            # Fix: report the actual segment name — this message previously
            # printed a hard-coded placeholder instead of `filename`.
            print(f'No annotation found for \'{filename}\'!')
            check_failed = True
            continue
        with open(annotation, 'r', encoding='utf8') as f:
            syllables = f.read().strip().split()
        if not syllables:
            print(f'Annotation file \'{annotation}\' is empty!')
            check_failed = True
        else:
            oov = []
            for s in syllables:
                if s not in dictionary:
                    oov.append(s)
                else:
                    for ph in dictionary[s]:
                        phoneme_map[ph] += 1
                    covered.update(dictionary[s])
            if oov:
                print(f'Syllable(s) {oov} not allowed in annotation file \'{annotation}\'')
                check_failed = True

    # Phoneme coverage
    uncovered = phoneme_set - covered
    if uncovered:
        print('The following phonemes are not covered!')
        print(sorted(uncovered))
        print('Please add more recordings to cover these phonemes.')
        check_failed = True

    if not check_failed:
        print('All annotations are well prepared.')

    phoneme_list = sorted(phoneme_set)
    phoneme_counts = [phoneme_map[ph] for ph in phoneme_list]
    distribution.draw_distribution(
        title='Phoneme Distribution Summary',
        x_label='Phoneme',
        y_label='Number of occurrences',
        items=phoneme_list,
        values=phoneme_counts
    )
    phoneme_summary = segments_dir / 'phoneme_distribution.jpg'
    plt.savefig(fname=phoneme_summary,
                bbox_inches='tight',
                pad_inches=0.25)
    print(f'Phoneme distribution summary saved to {phoneme_summary}')


if __name__ == '__main__':
    validate_labels()
|
acoustic_forced_alignment/validate_lengths.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import librosa
|
| 2 |
+
import tqdm
|
| 3 |
+
import os
|
| 4 |
+
import pathlib
|
| 5 |
+
|
| 6 |
+
import click
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def length(src: str):
    """Return the total duration, in hours, of all .wav files under *src*.

    A .wav file contributes its own duration; a directory is summed
    recursively; anything else contributes 0.
    """
    if os.path.isdir(src):
        children = (os.path.join(src, name) for name in os.listdir(src))
        return sum(length(child) for child in children)
    if os.path.isfile(src) and src.endswith('.wav'):
        return librosa.get_duration(filename=src) / 3600.
    return 0
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# noinspection PyShadowingBuiltins
@click.command(help='Validate segment lengths')
@click.option('--dir', required=True, help='Path to the segments directory')
def validate_lengths(dir):
    """Warn about segments shorter than 2 s or longer than 20 s and report total duration."""
    dir = pathlib.Path(dir)
    assert dir.exists() and dir.is_dir(), 'The chosen path does not exist or is not a directory.'

    segments = list(dir.glob('*.wav'))
    reported = False
    total_hours = 0.
    for wav_file in tqdm.tqdm(segments):
        wave_seconds = librosa.get_duration(filename=str(wav_file))
        total_hours += wave_seconds / 3600.
        if wave_seconds < 2.:
            reported = True
            print(f'Too short! \'{wav_file}\' has a length of {round(wave_seconds, 1)} seconds!')
        if wave_seconds > 20.:
            reported = True
            print(f'Too long! \'{wav_file}\' has a length of {round(wave_seconds, 1)} seconds!')

    print(f'Found {len(segments)} segments with total length of {round(total_hours, 2)} hours.')

    if not reported:
        print('All segments have proper length.')


if __name__ == '__main__':
    validate_lengths()
|
midi-recognition/README.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MIDI Recognition
|
| 2 |
+
|
| 3 |
+
## 1. merge_wavs.py
|
| 4 |
+
|
| 5 |
+
Merge short audio clips into long audio segments of similar length (e.g. 4 min) and a fixed sampling rate (e.g. 16000) and save the timestamps into tags.json.
|
| 6 |
+
|
| 7 |
+
## 2. extract_midi.py
|
| 8 |
+
|
| 9 |
+
Extract MIDI sequences from OpenSVIP json files, split them back into short clips according to tags.json, and add them into transcriptions.csv.
|
| 10 |
+
|
midi-recognition/extract_midi.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import json
|
| 3 |
+
import pathlib
|
| 4 |
+
|
| 5 |
+
import click
|
| 6 |
+
import librosa
|
| 7 |
+
from typing import List, Tuple
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@click.command(help='Extract MIDI sequences from OpenSVIP json files and add them into transcriptions.csv')
@click.argument('json_dir', metavar='JSONS')
@click.argument('csv_file', metavar='TRANSCRIPTIONS')
@click.option('--key', type=int, default=0, show_default=True,
              metavar='SEMITONES', help='Key transition')
def extract_midi(json_dir, csv_file, key):
    """Read note sequences from OpenSVIP json projects, split them back into the
    original short clips using tags.json, and write note_seq/note_dur columns
    into transcriptions.csv.
    """
    json_dir = pathlib.Path(json_dir).resolve()
    assert json_dir.exists(), 'The json directory does not exist.'
    tags_file = json_dir / 'tags.json'
    assert tags_file.exists(), 'The tags.json does not exist.'
    csv_file = pathlib.Path(csv_file).resolve()
    # Fix: was `assert csv_file.resolve()`, which is always truthy (a Path object)
    # and therefore never triggered — the intended check is existence.
    assert csv_file.exists(), 'The path to transcriptions.csv does not exist.'
    tol = 0.001  # tolerance in seconds for duration mismatches

    with open(tags_file, 'r', encoding='utf8') as f:
        tags: dict = json.load(f)

    # Read MIDI sequences
    note_seq_map: dict = {}  # key: merged filename, value: note sequence
    for json_file in json_dir.iterdir():
        if json_file.stem not in tags or not json_file.is_file() or json_file.suffix != '.json':
            continue
        with open(json_file, 'r', encoding='utf8') as f:
            json_obj: dict = json.load(f)
        assert len(json_obj['SongTempoList']) == 1, \
            f'[ERROR] {json_file.name}: there must be one and only one single tempo in the project.'

        tempo = json_obj['SongTempoList'][0]['BPM']
        midi_seq: list = json_obj['TrackList'][0]['NoteList']
        note_seq: List[Tuple[str, float]] = []  # (note, duration)
        prev_pos: int = 0  # in ticks
        for midi in midi_seq:  # fix: dropped unused enumerate index
            if prev_pos < midi['StartPos']:
                # Gap before this note becomes an explicit rest.
                # ticks / 8 / BPM converts ticks (480 per quarter) to seconds.
                note_seq.append(
                    ('rest', (midi['StartPos'] - prev_pos) / 8 / tempo)
                )
            note_seq.append(
                (librosa.midi_to_note(midi['KeyNumber'] + key, unicode=False), midi['Length'] / 8 / tempo)
            )
            prev_pos = midi['StartPos'] + midi['Length']
        # Seconds by which the notes run past the total tagged clip durations.
        remain_secs = prev_pos / 8 / tempo - sum(t['duration'] for t in tags[json_file.stem])
        if remain_secs > tol:
            note_seq.append(
                ('rest', remain_secs)
            )
        note_seq_map[json_file.stem] = note_seq

    # Load transcriptions
    transcriptions: dict = {}  # key: split filename, value: attr dict
    with open(csv_file, 'r', encoding='utf8') as f:
        reader = csv.DictReader(f)
        for attrs in reader:
            transcriptions[attrs['name']] = attrs

    # Split note sequence and add into transcriptions
    for merged_name, note_seq in note_seq_map.items():
        note_seq: List[Tuple[str, float]]  # fix: annotation described one tuple, not the list
        idx = 0            # index of the current note
        offset = 0.        # part of the current note already consumed by earlier clips
        cur_note_secs = 0.  # note time consumed so far
        cur_clip_secs = 0.  # clip time consumed so far
        for split_tag in tags[merged_name]:
            split_note_seq = []
            while idx < len(note_seq):
                cur_note_dur = note_seq[idx][1] - offset
                if cur_note_secs + cur_note_dur <= cur_clip_secs + split_tag['duration']:
                    # The (remainder of the) note fits entirely in this clip.
                    split_note_seq.append(
                        (note_seq[idx][0], cur_note_dur)
                    )
                    idx += 1
                    cur_note_secs += cur_note_dur
                    offset = 0.
                else:
                    # The note crosses the clip boundary: keep the in-clip part
                    # and carry the rest over to the next clip via `offset`.
                    offset = cur_clip_secs + split_tag['duration'] - cur_note_secs
                    cur_note_secs += offset
                    cur_clip_secs += split_tag['duration']
                    split_note_seq.append(
                        (note_seq[idx][0], offset)
                    )
                    break
            if idx == len(note_seq) and cur_clip_secs + split_tag['duration'] - cur_note_secs >= tol:
                # Notes exhausted before the clip ends: pad with a trailing rest.
                split_note_seq.append(
                    ('rest', cur_clip_secs + split_tag['duration'] - cur_note_secs)
                )
            if split_tag['filename'] not in transcriptions:
                continue
            dst_dict = transcriptions[split_tag['filename']]
            dst_dict['note_seq'] = ' '.join(n[0] for n in split_note_seq)
            dst_dict['note_dur'] = ' '.join(str(n[1]) for n in split_note_seq)

    with open(csv_file, 'w', encoding='utf8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur', 'ph_num', 'note_seq', 'note_dur'])
        writer.writeheader()
        writer.writerows(transcriptions.values())  # fix: .values() instead of iterating .items() for values


if __name__ == '__main__':
    extract_midi()
|
midi-recognition/merge_wavs.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tqdm
|
| 2 |
+
import json
|
| 3 |
+
import pathlib
|
| 4 |
+
from collections import OrderedDict
|
| 5 |
+
|
| 6 |
+
import click
|
| 7 |
+
import librosa
|
| 8 |
+
import numpy as np
|
| 9 |
+
import soundfile
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@click.command(help='Merge clips into segments of similar length')
@click.argument('input_wavs', metavar='INPUT_WAVS')
@click.argument('output_wavs', metavar='OUTPUT_WAVS')
@click.option('--length', type=int, required=False, default=240, metavar='SECONDS')
@click.option('--sr', type=int, required=False, default=16000)
def merge_wavs(
        input_wavs, output_wavs, length, sr
):
    """Concatenate short clips into merged segments of roughly `length` seconds.

    Each merged file is written to OUTPUT_WAVS with a zero-padded numeric name
    (00000000.wav, 00000001.wav, ...), and the per-clip order and durations are
    recorded in tags.json so the merge can later be undone.
    """
    input_wavs = pathlib.Path(input_wavs).resolve()
    assert input_wavs.exists(), 'The input directory does not exist.'
    output_wavs = pathlib.Path(output_wavs).resolve()
    # `all(False for _ in it)` is True iff the iterator is empty, i.e. the dir has no entries.
    assert not output_wavs.exists() or all(False for _ in output_wavs.iterdir()), \
        'The output directory is not empty.'

    output_wavs.mkdir(parents=True, exist_ok=True)
    # tags: merged-file stem -> list of {'filename', 'duration'} in concatenation order.
    tags = OrderedDict()
    count = 0  # index of the next merged file to write
    cache: list[tuple[str, np.ndarray]] = []  # pending (clip stem, waveform) pairs
    cache_len = 0.  # total duration of `cache` in seconds

    def save_cache():
        # Flush the pending clips as one merged wav and record their timestamps.
        nonlocal tags, count, cache, cache_len
        waveform_merged = np.concatenate(tuple(c[1] for c in cache))
        filename = (output_wavs / str(count).zfill(8)).with_suffix('.wav')
        soundfile.write(
            str(filename),
            waveform_merged, sr, format='WAV'
        )
        tags[str(filename.stem)] = [
            {
                'filename': c[0],
                'duration': c[1].shape[0] / sr
            }
            for c in cache
        ]
        cache.clear()
        cache_len = 0.
        count += 1

    for wav in tqdm.tqdm(input_wavs.iterdir()):
        if not wav.is_file() or wav.suffix != '.wav':
            continue
        y, _ = librosa.load(wav, sr=sr, mono=True)
        cur_len = y.shape[0] / sr
        # Flush before the cache would reach/exceed the target segment length.
        if len(cache) > 0 and cache_len + cur_len >= length:
            save_cache()
        cache.append((wav.stem, y))
        cache_len += cur_len
    if len(cache) > 0:
        # Flush the trailing partial segment.
        save_cache()

    tags_path = output_wavs / 'tags.json'
    with open(tags_path, 'w', encoding='utf8') as f:
        json.dump(tags, f, ensure_ascii=False, indent=2)
    print(f'Timestamps saved to {tags_path}')


if __name__ == '__main__':
    merge_wavs()
|
variance-temp-solution/.gitignore
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.idea
|
| 2 |
+
*.pyc
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.sh
|
| 5 |
+
local_tools/
|
| 6 |
+
/venv/
|
| 7 |
+
|
| 8 |
+
.vscode
|
| 9 |
+
.ipynb_checkpoints/
|
| 10 |
+
|
| 11 |
+
assets/*
|
| 12 |
+
!assets/.gitkeep
|
variance-temp-solution/README.md
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Making variance datasets (temporary solution)
|
| 2 |
+
|
| 3 |
+
This pipeline will guide you to migrate your old DiffSinger datasets to the new and complete format for both acoustic and variance model training.
|
| 4 |
+
|
| 5 |
+
## 1. Clone repo and install dependencies
|
| 6 |
+
|
| 7 |
+
```bash
|
| 8 |
+
git clone https://github.com/openvpi/MakeDiffSinger.git
|
| 9 |
+
cd MakeDiffSinger/variance-temp-solution
|
| 10 |
+
pip install -r requirements.txt # or you can reuse a pre-existing DiffSinger environment
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
## 2. Convert transcriptions
|
| 14 |
+
|
| 15 |
+
Assume you have a DiffSinger dataset which contains a transcriptions.txt file.
|
| 16 |
+
|
| 17 |
+
Run:
|
| 18 |
+
|
| 19 |
+
```bash
|
| 20 |
+
python convert_txt.py path/to/your/transcriptions.txt
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
This will generate transcriptions.csv in the same folder as transcriptions.txt, which has three attributes: `name`, `ph_seq` and `ph_dur`.
|
| 24 |
+
|
| 25 |
+
## 3. Add `ph_num` attribute
|
| 26 |
+
|
| 27 |
+
The attribute `ph_num` is needed for training the variance models especially if you need to train the phoneme duration predictor. This attribute represents the number of phones that each word contains.
|
| 28 |
+
|
| 29 |
+
In singing, vowels, instead of consonants, are used to align with the beginnings of notes. For this reason, each word should start with a vowel/AP/SP, and end with leading consonant(s) of the next word (if there are any). See the example below:
|
| 30 |
+
|
| 31 |
+
```text
|
| 32 |
+
text | AP | shi | zhe | => word transcriptions (pinyin, romaji, etc.)
|
| 33 |
+
ph_seq | AP | sh | ir | zh | e | => phoneme sequence
|
| 34 |
+
ph_num | 2 | 2 | 1 | => word-level phoneme division
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
where `sh` and `zh` are consonants, `AP`, `ir` and `e` can be regarded as vowels. There is one special case where a word can start with a consonant: isolated consonants. In this case, all phones in the word are consonants.
|
| 38 |
+
|
| 39 |
+
For all monosyllabic phoneme systems (at most one vowel in one word), this step can be performed automatically.
|
| 40 |
+
|
| 41 |
+
### 3.1 two-part dictionaries (Chinese, Japanese, etc.)
|
| 42 |
+
|
| 43 |
+
A two-part dictionary has "V" and "C-V" phoneme patterns.
|
| 44 |
+
|
| 45 |
+
Run:
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
python add_ph_num.py path/to/your/transcriptions.csv --dictionary path/to/your/dictionary.txt
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
### 3.2 monosyllabic phoneme systems (Cantonese, Korean, etc.)
|
| 52 |
+
|
| 53 |
+
A universal monosyllabic phoneme system has "C(m)-V-C(n)" (m,n >= 0) phoneme patterns.
|
| 54 |
+
|
| 55 |
+
1. Collect all vowels into vowels.txt, divided by spaces.
|
| 56 |
+
|
| 57 |
+
2. Collect all consonants into consonants.txt, divided by spaces.
|
| 58 |
+
|
| 59 |
+
3. Run:
|
| 60 |
+
|
| 61 |
+
```bash
|
| 62 |
+
python add_ph_num.py path/to/your/transcriptions.csv --vowels vowels.txt --consonants consonants.txt
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### 3.3 polysyllabic phoneme systems (English, Russian, etc.)
|
| 66 |
+
|
| 67 |
+
We recommend this step be manually performed because word divisions cannot be inferred from phoneme sequences in these phoneme systems.
|
| 68 |
+
|
| 69 |
+
> After finishing this step, the transcriptions.csv file can be directly used to train the phoneme duration predictor. If you want to train a pitch predictor, you must finish the remaining steps as follows.
|
| 70 |
+
>
|
| 71 |
+
|
| 72 |
+
## 4. Estimate note values
|
| 73 |
+
|
| 74 |
+
The note tier is another division of words besides the phoneme tier. See the example below:
|
| 75 |
+
|
| 76 |
+
```text
|
| 77 |
+
ph_seq | AP | sh | ir | zh | e | => phoneme sequence
|
| 78 |
+
ph_num | 2 | 2 | 1 | => word-level phoneme division
|
| 79 |
+
note_seq | rest | D#3 | D#3 | C4 | => note sequence
|
| 80 |
+
note_slur | 0 | 0 | 0 | 1 | => slur flag (will not be stored)
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
Note sequences can be automatically estimated and manually refined in two ways.
|
| 84 |
+
|
| 85 |
+
### 4.1 Infer a rough pitch value for each word
|
| 86 |
+
|
| 87 |
+
The following program can infer a rough note value for each word. There are no slurs - slurs are hard to judge, and different people have different labeling styles.
|
| 88 |
+
|
| 89 |
+
Run:
|
| 90 |
+
|
| 91 |
+
```bash
|
| 92 |
+
python estimate_midi.py path/to/your/transcriptions.csv path/to/your/wavs
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
> **IMPORTANT**
|
| 96 |
+
>
|
| 97 |
+
> This step only estimates the rough MIDI value for each word. You have to refine the MIDI sequences, otherwise the pitch predictor will not be accurate.
|
| 98 |
+
|
| 99 |
+
### 4.2 (New!) Use the AI-powered MIDI extractor - SOME
|
| 100 |
+
|
| 101 |
+
SOME (Singing-Oriented MIDI Extractor) is a NN-based MIDI extractor developed under the DiffSinger ecosystem. See guidance [here](https://github.com/openvpi/SOME#inference-via-pretrained-model-diffsinger-dataset) for using it on your DiffSinger dataset.
|
| 102 |
+
|
| 103 |
+
## 5. Refine MIDI sequences
|
| 104 |
+
|
| 105 |
+
### 5.1 take apart transcriptions.csv into DS files
|
| 106 |
+
|
| 107 |
+
Run:
|
| 108 |
+
|
| 109 |
+
```bash
|
| 110 |
+
python convert_ds.py csv2ds path/to/your/transcriptions.csv path/to/your/wavs
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
This will generate *.ds files matching your *.wav files in the same directory.
|
| 114 |
+
|
| 115 |
+
> **IMPORTANT**
|
| 116 |
+
>
|
| 117 |
+
> In this step, we highly recommend using RMVPE, a more accurate NN-based pitch extraction algorithm, to get better pitch results. See guidance [here](#rmvpe-pitch-extraction-algorithm).
|
| 118 |
+
>
|
| 119 |
+
> Also note that after you finish manual MIDI refinement, please use the **same algorithm** and **same model** in your DiffSinger configuration files for variance model training to get the best results.
|
| 120 |
+
|
| 121 |
+
### 5.2 manually edit MIDI sequences
|
| 122 |
+
|
| 123 |
+
Get the latest release of SlurCutter from [here](../README.md#essential-tools-to-process-and-label-your-datasets). This simple tool helps you adjust MIDI pitch in each DS file and cut notes into slurs if necessary. Be sure to back up your DS files before you start, since this tool will automatically save and overwrite an edited DS file.
|
| 124 |
+
|
| 125 |
+
### 5.3 re-combine DS files into transcriptions.csv
|
| 126 |
+
|
| 127 |
+
Run:
|
| 128 |
+
|
| 129 |
+
```bash
|
| 130 |
+
python convert_ds.py ds2csv path/to/your/ds path/to/your/transcriptions.csv
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
This will generate a new transcriptions.csv from the DS files you just edited. Append `-f` if you are sure you want to overwrite the original transcription file (and the script complains about it).
|
| 134 |
+
|
| 135 |
+
Now the transcriptions.csv can be used for all functionalities of DiffSinger training.
|
| 136 |
+
|
| 137 |
+
`convert_ds.py ds2csv` supports DS files which have no corresponding WAV files. All sentences in these files will be assigned a virtual item name, and inserted into the transcriptions. This is a preparation to support using DS tuning projects to train a variance model. In addition, `curves.json` file is written to support `f0` sequence refinement.
|
| 138 |
+
|
| 139 |
+
## (Appendix) other useful tools
|
| 140 |
+
|
| 141 |
+
### RMVPE pitch extraction algorithm
|
| 142 |
+
|
| 143 |
+
convert_ds.py and estimate_midi.py support the state-of-the-art RMVPE pitch extraction algorithm. To use it:
|
| 144 |
+
|
| 145 |
+
- Install PyTorch via [official guidance](https://pytorch.org/get-started/locally/).
|
| 146 |
+
- Get RMVPE pretrained model [here](https://github.com/yxlllc/RMVPE/releases).
|
| 147 |
+
- Put the RMVPE model.pt in `variance-temp-solution/assets/rmvpe/`.
|
| 148 |
+
- Use `--pe rmvpe` when running `python convert_ds.py csv2ds` or `python estimate_midi.py`.
|
| 149 |
+
|
| 150 |
+
### correct_cents.py
|
| 151 |
+
|
| 152 |
+
Apply cents correction to note sequences in a transcriptions.csv to offset the out-of-tune errors. Need pitch extracted from waveforms for reference.
|
| 153 |
+
|
| 154 |
+
Usage:
|
| 155 |
+
|
| 156 |
+
```bash
|
| 157 |
+
python correct_cents.py csv path/to/your/transcriptions.csv path/to/your/wavs
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
or
|
| 161 |
+
|
| 162 |
+
```bash
|
| 163 |
+
python correct_cents.py ds path/to/your/ds/files
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
Note: this operation will overwrite your input file(s).
|
| 167 |
+
|
| 168 |
+
### eliminate_short.py
|
| 169 |
+
|
| 170 |
+
Eliminate short slur notes in DS files. Slurs that are shorter than a given threshold will be merged into its neighboring notes within the same word.
|
| 171 |
+
|
| 172 |
+
Usage:
|
| 173 |
+
|
| 174 |
+
```bash
|
| 175 |
+
python eliminate_short.py path/to/your/ds/files
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
Note: this operation will overwrite your input DS files.
|
variance-temp-solution/add_ph_num.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import pathlib
|
| 3 |
+
|
| 4 |
+
import click
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@click.command(help='Add ph_num attribute into transcriptions.csv')
@click.argument('transcription', metavar='TRANSCRIPTIONS')
@click.option('--dictionary', metavar='DICTIONARY')
@click.option('--vowels', metavar='FILE')
@click.option('--consonants', metavar='FILE')
def add_ph_num(
        transcription: str,
        dictionary: str = None,
        vowels: str = None,
        consonants: str = None
):
    """Add a word-level 'ph_num' column to a transcriptions.csv file in place.

    The phoneme inventory comes either from a two-part dictionary
    (syllable<TAB>phonemes, at most two phonemes per syllable) or from
    explicit vowels/consonants list files. Each word is assumed to begin
    with a vowel (or AP/SP) and absorb the consonants that immediately
    follow it, so ph_num can be derived from ph_seq alone.
    """
    assert dictionary is not None or (vowels is not None and consonants is not None), \
        'Either dictionary file or vowels and consonants file should be specified.'
    if dictionary is not None:
        dictionary = pathlib.Path(dictionary).resolve()
        vowels = {'SP', 'AP'}
        consonants = set()
        with open(dictionary, 'r', encoding='utf8') as f:
            rules = f.readlines()
        for r in rules:
            # Tolerate blank/trailing lines; the original crashed on them
            # because ''.split('\t') cannot unpack into two names.
            if not r.strip():
                continue
            syllable, phonemes = r.split('\t')
            phonemes = phonemes.split()
            # Message fixed from 'two-phase' to 'two-part' to match the README.
            assert len(phonemes) <= 2, 'We only support two-part dictionaries for automatically adding ph_num.'
            if len(phonemes) == 1:
                vowels.add(phonemes[0])
            else:
                consonants.add(phonemes[0])
                vowels.add(phonemes[1])
    else:
        vowels_path = pathlib.Path(vowels).resolve()
        consonants_path = pathlib.Path(consonants).resolve()
        vowels = {'SP', 'AP'}
        consonants = set()
        with open(vowels_path, 'r', encoding='utf8') as f:
            vowels.update(f.read().split())
        with open(consonants_path, 'r', encoding='utf8') as f:
            consonants.update(f.read().split())
        # Reuse the precomputed intersection instead of recomputing it in the assert.
        overlapped = vowels.intersection(consonants)
        assert len(overlapped) == 0, \
            'Vowel set and consonant set overlapped. The following phonemes ' \
            'appear both as vowels and as consonants:\n' \
            f'{sorted(overlapped)}'

    transcription = pathlib.Path(transcription).resolve()
    items: list[dict] = []
    with open(transcription, 'r', encoding='utf8') as f:
        reader = csv.DictReader(f)
        for item in reader:
            items.append(item)
    # Guard against an empty CSV: the original raised IndexError on items[0] below.
    assert len(items) > 0, 'The transcription file contains no entries.'

    for item in items:
        item: dict
        ph_seq = item['ph_seq'].split()
        for ph in ph_seq:
            assert ph in vowels or ph in consonants, \
                f'Invalid phoneme symbol \'{ph}\' in \'{item["name"]}\'.'
        ph_num = []
        i = 0
        # Greedy grouping: each word is one phoneme plus every consonant that
        # immediately follows it (the leading consonants of the next word).
        while i < len(ph_seq):
            j = i + 1
            while j < len(ph_seq) and ph_seq[j] in consonants:
                j += 1
            ph_num.append(str(j - i))
            i = j
        item['ph_num'] = ' '.join(ph_num)

    with open(transcription, 'w', encoding='utf8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=items[0].keys())
        writer.writeheader()
        writer.writerows(items)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
if __name__ == '__main__':
    add_ph_num()  # click entry point
|
variance-temp-solution/assets/.gitkeep
ADDED
|
File without changes
|
variance-temp-solution/assets/rmvpe/model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d49bd662038808878c9d7420e0f583f506fe69086cc384f0da88f0b3a4e1115
|
| 3 |
+
size 368492925
|
variance-temp-solution/convert_ds.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import json
|
| 3 |
+
import pathlib
|
| 4 |
+
from decimal import Decimal
|
| 5 |
+
from math import isclose
|
| 6 |
+
|
| 7 |
+
import click
|
| 8 |
+
import librosa
|
| 9 |
+
import numpy as np
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
|
| 12 |
+
from get_pitch import get_pitch
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def try_resolve_note_slur_by_matching(ph_dur, ph_num, note_dur, tol):
    """Derive note_slur flags by aligning note boundaries to word boundaries.

    Word end positions are computed by grouping ph_dur according to ph_num.
    Every note must end either exactly at a word end (within tol) or strictly
    inside a word; a note ending inside a word is marked as continuing and the
    next note in that word is flagged as a slur. Raises ValueError when a note
    overshoots a word boundary, i.e. the sequences cannot be matched.
    Returns (note durations as a list, slur flags of the same length).
    """
    if len(ph_num) > len(note_dur):
        raise ValueError("ph_num should not be longer than note_dur.")
    # Cumulative end time of each word and of each original note.
    group_edges = np.cumsum([0] + ph_num)
    word_ends = np.cumsum([sum(ph_dur[a:b]) for a, b in zip(group_edges[:-1], group_edges[1:])])
    note_ends = np.cumsum(note_dur)

    merged_ends = []
    slur_flags = []
    w = n = 0
    in_word = False  # True once at least one note was consumed inside the current word
    while w < len(word_ends) and n < len(note_ends):
        if isclose(word_ends[w], note_ends[n], abs_tol=tol):
            # Note closes the current word: snap its end to the word boundary.
            slur_flags.append(int(in_word))
            merged_ends.append(word_ends[w])
            w += 1
            n += 1
            in_word = False
        elif note_ends[n] > word_ends[w]:
            # Note spills past the word end without matching it.
            raise ValueError("Cannot resolve note_slur by matching.")
        else:
            # Note ends strictly inside the word: subsequent notes are slurs.
            slur_flags.append(int(in_word))
            merged_ends.append(note_ends[n])
            n += 1
            in_word = True
    # Convert cumulative end positions back into per-note durations.
    durations = np.diff(merged_ends, prepend=Decimal("0.0")).tolist()
    assert len(durations) == len(slur_flags)
    return durations, slur_flags
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def try_resolve_slur_by_slicing(ph_dur, ph_num, note_seq, note_dur, tol):
    # Fallback slur resolution when exact boundary matching fails: word
    # boundaries (ph_dur grouped by ph_num) take priority, and any note that
    # straddles a word boundary is sliced at that boundary. Returns
    # (new_note_seq, new_note_dur, note_slur) of equal lengths.
    ph_num_cum = np.cumsum([0] + ph_num)
    # Cumulative end time of each word and of each original note.
    word_pos = np.cumsum([sum(ph_dur[l:r]) for l, r in zip(ph_num_cum[:-1], ph_num_cum[1:])])
    note_pos = np.cumsum(note_dur)
    new_note_seq = []
    new_note_dur = []

    note_slur = []
    idx_word, idx_note = 0, 0
    while idx_word < len(word_pos):
        slur = False
        if note_pos[idx_note] > word_pos[idx_word] and not isclose(
                note_pos[idx_note], word_pos[idx_word], abs_tol=tol
        ):
            # Current note extends strictly past the word end: slice it at the
            # word boundary; the remainder is re-examined for the next word
            # (idx_note deliberately not advanced).
            new_note_seq.append(note_seq[idx_note])
            new_note_dur.append(word_pos[idx_word])
            note_slur.append(1 if slur else 0)
        else:
            # Consume every note that ends at or before (within tol of) this
            # word's end; each extra note in the same word becomes a slur.
            while idx_note < len(note_pos) and (
                    note_pos[idx_note] < word_pos[idx_word]
                    or isclose(note_pos[idx_note], word_pos[idx_word], abs_tol=tol)
            ):
                new_note_seq.append(note_seq[idx_note])
                new_note_dur.append(note_pos[idx_note])
                note_slur.append(1 if slur else 0)
                slur = True
                idx_note += 1
            if new_note_dur[-1] < word_pos[idx_word]:
                if isclose(new_note_dur[-1], word_pos[idx_word], abs_tol=tol):
                    # Within tolerance: snap the last boundary to the word end.
                    new_note_dur[-1] = word_pos[idx_word]
                else:
                    # Gap remains: fill it with a slurred slice of the next note.
                    new_note_seq.append(note_seq[idx_note])
                    new_note_dur.append(word_pos[idx_word])
                    note_slur.append(1 if slur else 0)
        idx_word += 1
    # Convert cumulative boundary positions back to per-note durations.
    ret_note_dur = np.diff(new_note_dur, prepend=Decimal("0.0")).tolist()
    assert len(new_note_seq) == len(ret_note_dur) == len(note_slur)
    return new_note_seq, ret_note_dur, note_slur
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
@click.group()
def cli():
    # Root click group; csv2ds and ds2csv are attached via cli.add_command below.
    # (No docstring on purpose: click would surface it as help text.)
    pass
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
@click.command(help="Convert a transcription file to DS files")
|
| 91 |
+
@click.argument(
|
| 92 |
+
"transcription_file",
|
| 93 |
+
type=click.Path(
|
| 94 |
+
dir_okay=False,
|
| 95 |
+
resolve_path=True,
|
| 96 |
+
path_type=pathlib.Path,
|
| 97 |
+
exists=True,
|
| 98 |
+
readable=True,
|
| 99 |
+
),
|
| 100 |
+
metavar="TRANSCRIPTIONS",
|
| 101 |
+
)
|
| 102 |
+
@click.argument(
|
| 103 |
+
"wavs_folder",
|
| 104 |
+
type=click.Path(file_okay=False, resolve_path=True, path_type=pathlib.Path),
|
| 105 |
+
metavar="FOLDER",
|
| 106 |
+
)
|
| 107 |
+
@click.option(
|
| 108 |
+
"--tolerance",
|
| 109 |
+
"-t",
|
| 110 |
+
type=float,
|
| 111 |
+
default=0.005,
|
| 112 |
+
help="Tolerance for ph_dur/note_dur mismatch",
|
| 113 |
+
metavar="FLOAT",
|
| 114 |
+
)
|
| 115 |
+
@click.option(
|
| 116 |
+
"--hop_size", "-h", type=int, default=512, help="Hop size for f0_seq", metavar="INT"
|
| 117 |
+
)
|
| 118 |
+
@click.option(
|
| 119 |
+
"--sample_rate",
|
| 120 |
+
"-s",
|
| 121 |
+
type=int,
|
| 122 |
+
default=44100,
|
| 123 |
+
help="Sample rate of audio",
|
| 124 |
+
metavar="INT",
|
| 125 |
+
)
|
| 126 |
+
@click.option(
|
| 127 |
+
"--pe",
|
| 128 |
+
type=str,
|
| 129 |
+
default="parselmouth",
|
| 130 |
+
help="Pitch extractor (parselmouth, rmvpe)",
|
| 131 |
+
metavar="ALGORITHM",
|
| 132 |
+
)
|
| 133 |
+
def csv2ds(transcription_file, wavs_folder, tolerance, hop_size, sample_rate, pe):
|
| 134 |
+
"""Convert a transcription file to DS file"""
|
| 135 |
+
assert wavs_folder.is_dir(), "wavs folder not found."
|
| 136 |
+
out_ds = {}
|
| 137 |
+
out_exists = []
|
| 138 |
+
with open(transcription_file, "r", encoding="utf-8") as f:
|
| 139 |
+
for trans_line in tqdm(csv.DictReader(f)):
|
| 140 |
+
item_name = trans_line["name"]
|
| 141 |
+
wav_fn = wavs_folder / f"{item_name}.wav"
|
| 142 |
+
ds_fn = wavs_folder / f"{item_name}.ds"
|
| 143 |
+
ph_dur = list(map(Decimal, trans_line["ph_dur"].strip().split()))
|
| 144 |
+
ph_num = list(map(int, trans_line["ph_num"].strip().split()))
|
| 145 |
+
note_seq = trans_line["note_seq"].strip().split()
|
| 146 |
+
note_dur = list(map(Decimal, trans_line["note_dur"].strip().split()))
|
| 147 |
+
note_glide = trans_line["note_glide"].strip().split() if "note_glide" in trans_line else None
|
| 148 |
+
|
| 149 |
+
assert wav_fn.is_file(), f"{item_name}.wav not found."
|
| 150 |
+
assert len(ph_dur) == sum(ph_num), "ph_dur and ph_num mismatch."
|
| 151 |
+
assert len(note_seq) == len(note_dur), "note_seq and note_dur should have the same length."
|
| 152 |
+
if note_glide:
|
| 153 |
+
assert len(note_glide) == len(note_seq), "note_glide and note_seq should have the same length."
|
| 154 |
+
assert isclose(
|
| 155 |
+
sum(ph_dur), sum(note_dur), abs_tol=tolerance
|
| 156 |
+
), f"[{item_name}] ERROR: mismatch total duration: {sum(ph_dur) - sum(note_dur)}"
|
| 157 |
+
|
| 158 |
+
# Resolve note_slur
|
| 159 |
+
if "note_slur" in trans_line and trans_line["note_slur"]:
|
| 160 |
+
note_slur = list(map(int, trans_line["note_slur"].strip().split()))
|
| 161 |
+
else:
|
| 162 |
+
try:
|
| 163 |
+
note_dur, note_slur = try_resolve_note_slur_by_matching(
|
| 164 |
+
ph_dur, ph_num, note_dur, tolerance
|
| 165 |
+
)
|
| 166 |
+
except ValueError:
|
| 167 |
+
# logging.warning(f"note_slur is not resolved by matching for {item_name}")
|
| 168 |
+
note_seq, note_dur, note_slur = try_resolve_slur_by_slicing(
|
| 169 |
+
ph_dur, ph_num, note_seq, note_dur, tolerance
|
| 170 |
+
)
|
| 171 |
+
# Extract f0_seq
|
| 172 |
+
wav, _ = librosa.load(wav_fn, sr=sample_rate, mono=True)
|
| 173 |
+
# length = len(wav) + (win_size - hop_size) // 2 + (win_size - hop_size + 1) // 2
|
| 174 |
+
# length = ceil((length - win_size) / hop_size)
|
| 175 |
+
f0_timestep, f0, _ = get_pitch(pe, wav, hop_size, sample_rate)
|
| 176 |
+
ds_content = [
|
| 177 |
+
{
|
| 178 |
+
"offset": 0.0,
|
| 179 |
+
"text": trans_line["ph_seq"],
|
| 180 |
+
"ph_seq": trans_line["ph_seq"],
|
| 181 |
+
"ph_dur": " ".join(str(round(d, 6)) for d in ph_dur),
|
| 182 |
+
"ph_num": trans_line["ph_num"],
|
| 183 |
+
"note_seq": " ".join(note_seq),
|
| 184 |
+
"note_dur": " ".join(str(round(d, 6)) for d in note_dur),
|
| 185 |
+
"note_slur": " ".join(map(str, note_slur)),
|
| 186 |
+
"f0_seq": " ".join(map("{:.1f}".format, f0)),
|
| 187 |
+
"f0_timestep": str(f0_timestep),
|
| 188 |
+
}
|
| 189 |
+
]
|
| 190 |
+
if note_glide:
|
| 191 |
+
ds_content[0]["note_glide"] = " ".join(note_glide)
|
| 192 |
+
out_ds[ds_fn] = ds_content
|
| 193 |
+
if ds_fn.exists():
|
| 194 |
+
out_exists.append(ds_fn)
|
| 195 |
+
if not out_exists or click.confirm(f"Overwrite {len(out_exists)} existing DS files?", abort=False):
|
| 196 |
+
for ds_fn, ds_content in out_ds.items():
|
| 197 |
+
with open(ds_fn, "w", encoding="utf-8") as f:
|
| 198 |
+
json.dump(ds_content, f, ensure_ascii=False, indent=4)
|
| 199 |
+
else:
|
| 200 |
+
click.echo("Aborted.")
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
@click.command(help="Convert DS files to a transcription and curve files")
|
| 204 |
+
@click.argument(
|
| 205 |
+
"ds_folder",
|
| 206 |
+
type=click.Path(file_okay=False, resolve_path=True, exists=True, path_type=pathlib.Path),
|
| 207 |
+
metavar="FOLDER",
|
| 208 |
+
)
|
| 209 |
+
@click.argument(
|
| 210 |
+
"transcription_file",
|
| 211 |
+
type=click.Path(file_okay=True, dir_okay=False, resolve_path=True, path_type=pathlib.Path),
|
| 212 |
+
metavar="TRANSCRIPTIONS",
|
| 213 |
+
)
|
| 214 |
+
@click.option(
|
| 215 |
+
"--overwrite",
|
| 216 |
+
"-f",
|
| 217 |
+
is_flag=True,
|
| 218 |
+
default=False,
|
| 219 |
+
help="Overwrite existing transcription file",
|
| 220 |
+
)
|
| 221 |
+
def ds2csv(ds_folder, transcription_file, overwrite):
|
| 222 |
+
"""Convert DS files to a transcription file"""
|
| 223 |
+
if not overwrite and transcription_file.exists():
|
| 224 |
+
raise FileExistsError(f"{transcription_file} already exist.")
|
| 225 |
+
|
| 226 |
+
transcriptions = []
|
| 227 |
+
any_with_glide = False
|
| 228 |
+
# records that have corresponding wav files, assuming it's midi annotation
|
| 229 |
+
for fp in tqdm(ds_folder.glob("*.ds"), ncols=80):
|
| 230 |
+
if fp.with_suffix(".wav").exists():
|
| 231 |
+
with open(fp, "r", encoding="utf-8") as f:
|
| 232 |
+
ds = json.load(f)
|
| 233 |
+
transcriptions.append(
|
| 234 |
+
{
|
| 235 |
+
"name": fp.stem,
|
| 236 |
+
"ph_seq": ds[0]["ph_seq"],
|
| 237 |
+
"ph_dur": " ".join(str(round(Decimal(d), 6)) for d in ds[0]["ph_dur"].split()),
|
| 238 |
+
"ph_num": ds[0]["ph_num"],
|
| 239 |
+
"note_seq": ds[0]["note_seq"],
|
| 240 |
+
"note_dur": " ".join(str(round(Decimal(d), 6)) for d in ds[0]["note_dur"].split()),
|
| 241 |
+
# "note_slur": ds[0]["note_slur"],
|
| 242 |
+
}
|
| 243 |
+
)
|
| 244 |
+
if "note_glide" in ds[0]:
|
| 245 |
+
any_with_glide = True
|
| 246 |
+
transcriptions[-1]["note_glide"] = ds[0]["note_glide"]
|
| 247 |
+
# Lone DS files.
|
| 248 |
+
for fp in tqdm(ds_folder.glob("*.ds"), ncols=80):
|
| 249 |
+
if not fp.with_suffix(".wav").exists():
|
| 250 |
+
with open(fp, "r", encoding="utf-8") as f:
|
| 251 |
+
ds = json.load(f)
|
| 252 |
+
for idx, sub_ds in enumerate(ds):
|
| 253 |
+
item_name = f"{fp.stem}#{idx}" if len(ds) > 1 else fp.stem
|
| 254 |
+
transcriptions.append(
|
| 255 |
+
{
|
| 256 |
+
"name": item_name,
|
| 257 |
+
"ph_seq": sub_ds["ph_seq"],
|
| 258 |
+
"ph_dur": " ".join(str(round(Decimal(d), 6)) for d in sub_ds["ph_dur"].split()),
|
| 259 |
+
"ph_num": sub_ds["ph_num"],
|
| 260 |
+
"note_seq": sub_ds["note_seq"],
|
| 261 |
+
"note_dur": " ".join(str(round(Decimal(d), 6)) for d in sub_ds["note_dur"].split()),
|
| 262 |
+
# "note_slur": sub_ds["note_slur"],
|
| 263 |
+
}
|
| 264 |
+
)
|
| 265 |
+
if "note_glide" in sub_ds:
|
| 266 |
+
any_with_glide = True
|
| 267 |
+
transcriptions[-1]["note_glide"] = sub_ds["note_glide"]
|
| 268 |
+
if any_with_glide:
|
| 269 |
+
for row in transcriptions:
|
| 270 |
+
if "note_glide" not in row:
|
| 271 |
+
row["note_glide"] = " ".join(["none"] * len(row["note_seq"].split()))
|
| 272 |
+
with open(transcription_file, "w", newline="", encoding="utf-8") as f:
|
| 273 |
+
writer = csv.DictWriter(
|
| 274 |
+
f,
|
| 275 |
+
fieldnames=[
|
| 276 |
+
"name",
|
| 277 |
+
"ph_seq",
|
| 278 |
+
"ph_dur",
|
| 279 |
+
"ph_num",
|
| 280 |
+
"note_seq",
|
| 281 |
+
"note_dur",
|
| 282 |
+
# "note_slur",
|
| 283 |
+
] + (["note_glide"] if any_with_glide else []),
|
| 284 |
+
)
|
| 285 |
+
writer.writeheader()
|
| 286 |
+
writer.writerows(transcriptions)
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
# Register both subcommands on the root group.
cli.add_command(csv2ds)
cli.add_command(ds2csv)

if __name__ == "__main__":
    cli()  # click entry point
|
variance-temp-solution/convert_txt.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import pathlib
|
| 3 |
+
|
| 4 |
+
import click
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@click.command(help='Migrate transcriptions.txt in old datasets to transcriptions.csv')
@click.argument('input_txt', metavar='INPUT')
def convert_txt(
        input_txt: str
):
    """Convert an old pipe-separated transcriptions.txt into transcriptions.csv.

    Keeps only the name (field 0), ph_seq (field 2) and ph_dur (field 5)
    columns. The CSV is written next to the input file with the same stem.
    """
    input_txt = pathlib.Path(input_txt).resolve()
    assert input_txt.exists(), 'The input file does not exist.'
    with open(input_txt, 'r', encoding='utf8') as f:
        # Strip line terminators and skip blank lines: the original kept the
        # trailing '\n' inside the last field (ph_dur), producing CSV fields
        # with embedded newlines, and crashed on empty trailing lines.
        lines = [line.strip() for line in f if line.strip()]
    utterances = [u.split('|') for u in lines]
    utterances = [
        {
            'name': u[0],
            'ph_seq': u[2],
            'ph_dur': u[5]
        }
        for u in utterances
    ]

    with open(input_txt.with_suffix('.csv'), 'w', encoding='utf8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur'])
        writer.writeheader()
        writer.writerows(utterances)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
if __name__ == '__main__':
    convert_txt()  # click entry point
|
variance-temp-solution/correct_cents.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import math
|
| 3 |
+
import warnings
|
| 4 |
+
from collections import OrderedDict
|
| 5 |
+
|
| 6 |
+
import librosa
|
| 7 |
+
import numpy as np
|
| 8 |
+
import tqdm
|
| 9 |
+
import pathlib
|
| 10 |
+
from csv import DictReader, DictWriter
|
| 11 |
+
|
| 12 |
+
import click
|
| 13 |
+
|
| 14 |
+
from get_pitch import get_pitch_parselmouth
|
| 15 |
+
|
| 16 |
+
warns = []
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_aligned_pitch(wav_path: pathlib.Path, total_secs: float, timestep: float):
    """Extract a MIDI-pitch curve from a WAV file, padded to cover total_secs.

    The waveform is loaded mono at 44100 Hz, pitch is extracted with
    parselmouth (hop 512) and converted from Hz to fractional MIDI numbers.
    """
    waveform, _ = librosa.load(wav_path, sr=44100, mono=True)
    _, f0, _ = get_pitch_parselmouth(waveform, 512, 44100)
    pitch = librosa.hz_to_midi(f0)
    if pitch.shape[0] < total_secs / timestep:
        # Pad the tail by repeating the last observed pitch so the curve spans
        # the whole labeled duration.
        # NOTE(review): assumes f0 is non-empty — pitch[-1] would raise
        # otherwise; confirm the extractor guarantees at least one frame.
        pad = math.ceil(total_secs / timestep) - pitch.shape[0]
        pitch = np.pad(pitch, [0, pad], mode='constant', constant_values=[0, pitch[-1]])
    return pitch
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def correct_cents_item(
        name: str, item: OrderedDict, ref_pitch: np.ndarray,
        timestep: float, error_ratio: float
):
    """Rewrite one item's 'note_seq' with cent-level corrected note names.

    For each non-rest note, the mean of the reference pitch samples lying
    within +/-50 cents of the labeled note is taken as the corrected pitch,
    and the note name is re-rendered with cents (e.g. ``A4+23``).
    Notes whose in-tune sample ratio falls below ``error_ratio`` are recorded
    in the module-level ``warns`` list as possible labeling errors.

    :param name: identifier used in warning records (item name or segment tag)
    :param item: mutated in place; must contain 'note_seq' and 'note_dur'
    :param ref_pitch: frame-level reference pitch in MIDI numbers
    :param timestep: seconds per frame of ``ref_pitch``
    :param error_ratio: minimum fraction of close samples before warning
    """
    note_seq = item['note_seq'].split()
    note_dur = [float(d) for d in item['note_dur'].split()]
    assert len(note_seq) == len(note_dur)

    start = 0.
    note_seq_correct = []
    for i, (note, dur) in enumerate(zip(note_seq, note_dur)):
        end = start + dur
        if note == 'rest':
            # Rests carry no pitch; keep them unchanged.
            start = end
            note_seq_correct.append('rest')
            continue

        midi = librosa.note_to_midi(note, round_midi=False)
        start_idx = math.floor(start / timestep)
        end_idx = math.ceil(end / timestep)
        note_pitch = ref_pitch[start_idx: end_idx]
        # Samples within half a semitone (50 cents) of the labeled note.
        note_pitch_close = note_pitch[(note_pitch >= midi - 0.5) & (note_pitch < midi + 0.5)]
        if len(note_pitch_close) < len(note_pitch) * error_ratio or len(note_pitch) == 0:
            warns.append({
                'position': name,
                'note_index': i,
                'note_value': note
            })
        if len(note_pitch) == 0 or len(note_pitch_close) == 0:
            # Nothing to average against; keep the original label.
            start = end
            note_seq_correct.append(note)
            continue
        midi_correct = np.mean(note_pitch_close)
        note_seq_correct.append(librosa.midi_to_note(midi_correct, cents=True, unicode=False))

        start = end

    item['note_seq'] = ' '.join(note_seq_correct)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def save_warnings(save_dir: pathlib.Path):
    """Write accumulated labeling warnings to ``<save_dir>/warnings.csv``.

    Does nothing when the module-level ``warns`` list is empty. Otherwise the
    records (position, note_index, note_value) are saved and a UserWarning is
    raised pointing at the file.
    """
    if len(warns) > 0:
        save_path = save_dir.resolve() / 'warnings.csv'
        with open(save_path, 'w', encoding='utf8', newline='') as f:
            writer = DictWriter(f, fieldnames=['position', 'note_index', 'note_value'])
            writer.writeheader()
            writer.writerows(warns)
        warnings.warn(
            message=f'possible labeling errors saved in {save_path}',
            category=UserWarning
        )
        # NOTE(review): resetting the warning filter *after* warn() looks
        # intended to restore default behavior for later warnings — confirm.
        warnings.filterwarnings(action='default')
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
@click.group(help='Apply cents correction to note sequences')
def correct_cents():
    """Click command group; subcommands operate on CSV transcriptions or DS files."""
    pass
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
@correct_cents.command(help='Apply cents correction to note sequences in transcriptions.csv')
@click.argument('transcriptions', metavar='TRANSCRIPTIONS')
@click.argument('waveforms', metavar='WAVS')
@click.option('--error_ratio', metavar='RATIO', type=float, default=0.4,
              help='If the percentage of pitch points within a deviation of 50 cents compared to the note label '
                   'is lower than this value, a warning will be raised.')
def csv(
        transcriptions,
        waveforms,
        error_ratio
):
    """Correct note cents in a transcriptions.csv, rewriting the file in place.

    TRANSCRIPTIONS is the CSV path; WAVS is the directory holding one
    ``<name>.wav`` per CSV row. Warnings (if any) are saved next to the CSV.
    """
    transcriptions = pathlib.Path(transcriptions).resolve()
    waveforms = pathlib.Path(waveforms).resolve()
    with open(transcriptions, 'r', encoding='utf8') as f:
        reader = DictReader(f)
        items: list[OrderedDict] = []
        for item in reader:
            items.append(OrderedDict(item))

    # Frame duration matching get_aligned_pitch's hop 512 @ 44.1 kHz.
    timestep = 512 / 44100
    for item in tqdm.tqdm(items):
        item: OrderedDict
        ref_pitch = get_aligned_pitch(
            wav_path=waveforms / (item['name'] + '.wav'),
            total_secs=sum(float(d) for d in item['note_dur'].split()),
            timestep=timestep
        )
        # Mutates item['note_seq'] in place and records warnings.
        correct_cents_item(
            name=item['name'], item=item, ref_pitch=ref_pitch,
            timestep=timestep, error_ratio=error_ratio
        )

    # Rewrite the CSV with the corrected note sequences.
    with open(transcriptions, 'w', encoding='utf8', newline='') as f:
        writer = DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur', 'ph_num', 'note_seq', 'note_dur'])
        writer.writeheader()
        writer.writerows(items)
    save_warnings(transcriptions.parent)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
@correct_cents.command(help='Apply cents correction to note sequences in DS files')
@click.argument('ds_dir', metavar='DS_DIR')
@click.option('--error_ratio', metavar='RATIO', type=float, default=0.4,
              help='If the percentage of pitch points within a deviation of 50 cents compared to the note label '
                   'is lower than this value, a warning will be raised.')
def ds(
        ds_dir,
        error_ratio
):
    """Correct note cents in every ``*.ds`` file under DS_DIR, in place.

    Each DS file must have a sibling ``.wav`` with the same stem. A DS file
    may hold a single segment dict or a list of segments; each segment's
    'offset' and 'note_dur' locate its slice of the reference pitch curve.
    """
    ds_dir = pathlib.Path(ds_dir).resolve()
    assert ds_dir.exists(), 'The directory of DS files does not exist.'

    timestep = 512 / 44100
    for ds_file in tqdm.tqdm(ds_dir.glob('*.ds')):
        if not ds_file.is_file():
            continue

        assert ds_file.with_suffix('.wav').exists(), \
            f'Missing corresponding .wav file of {ds_file.name}.'
        with open(ds_file, 'r', encoding='utf8') as f:
            params = json.load(f)
            # Normalize single-segment files to a one-element list.
            if not isinstance(params, list):
                params = [params]
            params = [OrderedDict(p) for p in params]

        # Cover up to the end of the last segment.
        ref_pitch = get_aligned_pitch(
            wav_path=ds_file.with_suffix('.wav'),
            total_secs=params[-1]['offset'] + sum(float(d) for d in params[-1]['note_dur'].split()),
            timestep=timestep
        )
        for i, param in enumerate(params):
            # Slice out the segment's own window of the pitch curve.
            start_idx = math.floor(param['offset'] / timestep)
            end_idx = math.ceil((param['offset'] + sum(float(d) for d in param['note_dur'].split())) / timestep)
            correct_cents_item(
                name=f'{ds_file.stem}#{i}', item=param, ref_pitch=ref_pitch[start_idx: end_idx],
                timestep=timestep, error_ratio=error_ratio
            )

        with open(ds_file, 'w', encoding='utf8') as f:
            json.dump(params, f, ensure_ascii=False, indent=2)
    save_warnings(ds_dir)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# Command-line entry point for the cents-correction tool.
if __name__ == '__main__':
    correct_cents()
|
variance-temp-solution/eliminate_short.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pathlib
|
| 3 |
+
from collections import OrderedDict
|
| 4 |
+
|
| 5 |
+
import click
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@click.command(help='Eliminate short slur notes in DS files')
@click.argument('ds_dir', metavar='DS_DIR')
@click.argument('threshold', type=float, metavar='THRESHOLD')
def eliminate_short(
        ds_dir,
        threshold: float
):
    """Merge slur notes shorter than THRESHOLD seconds into their neighbors.

    Every ``*.ds`` file in DS_DIR is rewritten in place. Within each word
    (a run of notes where all but the first are slurs), short notes are
    absorbed by adjacent long notes: a short run between two long notes is
    split evenly between them, leading/trailing short runs are merged into
    the nearest long note. Words with no long note at all are left unchanged.
    """
    ds_dir = pathlib.Path(ds_dir).resolve()
    assert ds_dir.exists(), 'The directory of DS files does not exist.'

    for ds in ds_dir.iterdir():
        if not ds.is_file() or ds.suffix != '.ds':
            continue

        with open(ds, 'r', encoding='utf8') as f:
            params = json.load(f)
            if not isinstance(params, list):
                params = [params]
            params = [OrderedDict(p) for p in params]

        for param in params:
            # (note_name, duration_secs, is_slur) triples.
            note_list = [
                (note, float(dur), bool(int(slur)))
                for note, dur, slur
                in zip(param['note_seq'].split(), param['note_dur'].split(), param['note_slur'].split())
            ]
            # Group notes into words: a new word starts at each non-slur note.
            word_note_div = []
            cache = []
            for note in note_list:
                if len(cache) == 0 or note[2]:
                    cache.append(note)
                else:
                    word_note_div.append(cache)
                    cache = [note]
            if len(cache) > 0:
                word_note_div.append(cache)

            word_note_div_new = []
            for i in range(len(word_note_div)):
                word_note_seq = word_note_div[i]
                # Single-note words and all-short words cannot be merged.
                if len(word_note_seq) == 1 or all(n[1] < threshold for n in word_note_seq):
                    word_note_div_new.append(word_note_seq)
                    continue

                word_note_seq_new = []
                j = 0
                prev_merge = 0.
                while word_note_seq[j][1] < threshold:
                    # Enumerate leading short notes
                    prev_merge += word_note_seq[j][1]
                    j += 1
                # Iter note sequence
                while j < len(word_note_seq):
                    # Collect the run of short notes following long note j.
                    k = j + 1
                    while k < len(word_note_seq) and word_note_seq[k][1] < threshold:
                        k += 1
                    post_merge = sum(n[1] for n in word_note_seq[j + 1: k])
                    if k < len(word_note_seq):
                        # Another long note follows: split the short run
                        # half to this note, half to the next (via prev_merge).
                        post_merge /= 2
                    word_note_seq_new.append(
                        (word_note_seq[j][0], prev_merge + word_note_seq[j][1] + post_merge, False)
                    )
                    prev_merge = post_merge
                    j = k

                word_note_div_new.append(word_note_seq_new)

            # Flatten words back into parallel sequences; within each word
            # every note after the first becomes a slur again.
            note_seq_new = []
            note_dur_new = []
            note_slur_new = []
            for word_note_seq in word_note_div_new:
                note_seq_new += [n[0] for n in word_note_seq]
                note_dur_new += [n[1] for n in word_note_seq]
                note_slur_new += [pos > 0 for pos in range(len(word_note_seq))]
            param['note_seq'] = ' '.join(note_seq_new)
            param['note_dur'] = ' '.join(str(round(d, 6)) for d in note_dur_new)
            param['note_slur'] = ' '.join(str(int(s)) for s in note_slur_new)

        with open(ds, 'w', encoding='utf8') as f:
            json.dump(params, f, ensure_ascii=False, indent=2)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# Command-line entry point.
if __name__ == '__main__':
    eliminate_short()
|
variance-temp-solution/estimate_midi.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import math
|
| 3 |
+
import pathlib
|
| 4 |
+
|
| 5 |
+
import click
|
| 6 |
+
import librosa
|
| 7 |
+
import numpy as np
|
| 8 |
+
import tqdm
|
| 9 |
+
from typing import List
|
| 10 |
+
|
| 11 |
+
from get_pitch import get_pitch
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@click.command(help='Estimate note pitch from transcriptions and corresponding waveforms')
@click.argument('transcriptions', metavar='TRANSCRIPTIONS')
@click.argument('waveforms', metavar='WAVS')
@click.option('--pe', metavar='ALGORITHM', default='parselmouth',
              help='Pitch extractor (parselmouth, rmvpe)')
@click.option('--rest_uv_ratio', metavar='RATIO', type=float, default=0.85,
              help='The minimum percentage of unvoiced length for a note to be regarded as rest')
def estimate_midi(
        transcriptions: str,
        waveforms: str,
        pe: str = 'parselmouth',
        rest_uv_ratio: float = 0.85
):
    """Fill 'note_seq'/'note_dur' in a transcriptions.csv from audio pitch.

    Word boundaries are derived from 'ph_dur' grouped by 'ph_num'; each word
    gets one note: 'rest' when mostly unvoiced, otherwise the mean pitch of
    the dominant semitone bin, rendered with cents. The CSV is rewritten
    in place.
    """
    transcriptions = pathlib.Path(transcriptions).resolve()
    waveforms = pathlib.Path(waveforms).resolve()
    with open(transcriptions, 'r', encoding='utf8') as f:
        reader = csv.DictReader(f)
        items: List[dict] = []
        for item in reader:
            items.append(item)

    timestep = 512 / 44100
    for item in tqdm.tqdm(items):
        item: dict
        ph_dur = [float(d) for d in item['ph_dur'].split()]
        ph_num = [int(n) for n in item['ph_num'].split()]
        assert sum(ph_num) == len(ph_dur), f'ph_num does not sum to number of phones in \'{item["name"]}\'.'

        # Word durations: sum each group of ph_num[i] phone durations.
        word_dur = []
        i = 0
        for num in ph_num:
            word_dur.append(sum(ph_dur[i: i + num]))
            i += num

        total_secs = sum(ph_dur)
        waveform, _ = librosa.load(waveforms / (item['name'] + '.wav'), sr=44100, mono=True)
        _, f0, uv = get_pitch(pe, waveform, 512, 44100)
        pitch = librosa.hz_to_midi(f0)
        if pitch.shape[0] < total_secs / timestep:
            # Pad pitch with its last value so every word window is in range.
            # NOTE(review): uv is zero-padded, i.e. padded frames count as
            # voiced — confirm this is intended.
            pad = math.ceil(total_secs / timestep) - pitch.shape[0]
            pitch = np.pad(pitch, [0, pad], mode='constant', constant_values=[0, pitch[-1]])
            uv = np.pad(uv, [0, pad], mode='constant')

        note_seq = []
        note_dur = []
        start = 0.
        for dur in word_dur:
            end = start + dur
            start_idx = math.floor(start / timestep)
            end_idx = math.ceil(end / timestep)
            word_pitch = pitch[start_idx: end_idx]
            word_uv = uv[start_idx: end_idx]
            # Voiced pitch samples inside this word.
            word_valid_pitch = np.extract(~word_uv & (word_pitch >= 0), word_pitch)
            if len(word_valid_pitch) < (1 - rest_uv_ratio) * (end_idx - start_idx):
                note_seq.append('rest')
            else:
                # Mode of the rounded-to-semitone histogram, refined by the
                # mean of samples within +/-50 cents of that semitone.
                counts = np.bincount(np.round(word_valid_pitch).astype(np.int64))
                midi = counts.argmax()
                midi = np.mean(word_valid_pitch[(word_valid_pitch >= midi - 0.5) & (word_valid_pitch < midi + 0.5)])
                note_seq.append(librosa.midi_to_note(midi, cents=True, unicode=False))
            note_dur.append(dur)

            start = end

        item['note_seq'] = ' '.join(note_seq)
        item['note_dur'] = ' '.join([str(round(d, 6)) for d in note_dur])

    with open(transcriptions, 'w', encoding='utf8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur', 'ph_num', 'note_seq', 'note_dur'])
        writer.writeheader()
        writer.writerows(items)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# Command-line entry point.
if __name__ == '__main__':
    estimate_midi()
|
variance-temp-solution/get_pitch.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import parselmouth
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def norm_f0(f0):
    """Map an f0 curve (Hz) into log2 space."""
    return np.log2(f0)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def denorm_f0(f0, uv, pitch_padding=None):
    """Map a log2-space f0 curve back to Hz, zeroing masked frames.

    :param f0: log2-space f0 array
    :param uv: unvoiced mask (truthy entries zeroed), or None to skip masking
    :param pitch_padding: optional boolean index of padded frames to zero
    :return: new Hz-space array (input is not modified)
    """
    hz = 2 ** f0
    if uv is not None:
        hz[uv > 0] = 0
    if pitch_padding is not None:
        hz[pitch_padding] = 0
    return hz
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def interp_f0(f0, uv=None):
    """Fill unvoiced gaps of an f0 curve by linear interpolation in log2 space.

    :param f0: f0 curve in Hz (zeros treated as unvoiced when uv is None)
    :param uv: optional boolean unvoiced mask; derived from ``f0 == 0`` if omitted
    :return: tuple ``(f0_interp, uv)`` — interpolated Hz curve and the mask used
    """
    if uv is None:
        uv = f0 == 0
    log_f0 = norm_f0(f0)
    n_unvoiced = sum(uv)
    if n_unvoiced == len(log_f0):
        # Fully unvoiced: -inf maps back to 0 Hz after denormalization.
        log_f0[uv] = -np.inf
    elif n_unvoiced > 0:
        voiced_idx = np.where(~uv)[0]
        log_f0[uv] = np.interp(np.where(uv)[0], voiced_idx, log_f0[voiced_idx])
    return denorm_f0(log_f0, uv=None), uv
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def resample_align_curve(points: np.ndarray, original_timestep: float, target_timestep: float, align_length: int):
    """Resample a uniformly-sampled curve to a new timestep and fixed length.

    The curve is linearly interpolated onto the target time grid, then
    truncated or tail-extended (repeating the last value) to ``align_length``.

    :param points: 1-D sampled curve
    :param original_timestep: seconds between input samples
    :param target_timestep: seconds between output samples
    :param align_length: exact length of the returned array
    :return: resampled array with the same dtype as ``points``
    """
    t_max = (len(points) - 1) * original_timestep
    src_times = original_timestep * np.arange(len(points))
    tgt_times = np.arange(0, t_max, target_timestep)
    resampled = np.interp(tgt_times, src_times, points).astype(points.dtype)
    gap = align_length - len(resampled)
    if gap < 0:
        # Too long: truncate.
        resampled = resampled[:align_length]
    elif gap > 0:
        # Too short: hold the last value.
        tail = np.full(gap, fill_value=resampled[-1])
        resampled = np.concatenate((resampled, tail), axis=0)
    return resampled
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def get_pitch_parselmouth(wav_data, hop_size, audio_sample_rate, interp_uv=True):
    """Extract f0 with Praat (parselmouth) autocorrelation pitch tracking.

    :param wav_data: mono waveform samples
    :param hop_size: hop length in samples (defines the frame timestep)
    :param audio_sample_rate: sample rate of ``wav_data`` in Hz
    :param interp_uv: interpolate f0 across unvoiced frames when True
    :return: tuple ``(time_step, f0, uv)`` — seconds per frame, Hz curve,
        boolean unvoiced mask
    """
    time_step = hop_size / audio_sample_rate
    # Search range for the tracker (Hz).
    f0_min = 65.
    f0_max = 1100.

    # noinspection PyArgumentList
    f0 = (
        parselmouth.Sound(wav_data, sampling_frequency=audio_sample_rate)
        .to_pitch_ac(
            time_step=time_step, voicing_threshold=0.6, pitch_floor=f0_min, pitch_ceiling=f0_max
        ).selected_array["frequency"]
    )
    # Praat reports 0 Hz for unvoiced frames.
    uv = f0 == 0
    if interp_uv:
        f0, uv = interp_f0(f0, uv)
    return time_step, f0, uv
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# Lazily-constructed shared RMVPE model instance (initialized on first use
# inside get_pitch_rmvpe to avoid importing torch unless needed).
rmvpe = None
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def get_pitch_rmvpe(wav_data, hop_size, audio_sample_rate, interp_uv=True):
    """Extract f0 with the RMVPE neural pitch estimator.

    The model runs at its own 10 ms frame rate; the result is resampled to
    the requested hop grid and aligned to the waveform's frame count.

    :param wav_data: mono waveform samples
    :param hop_size: hop length in samples (defines the output timestep)
    :param audio_sample_rate: sample rate of ``wav_data`` in Hz
    :param interp_uv: keep interpolated values in unvoiced frames when True;
        otherwise unvoiced frames are zeroed after resampling
    :return: tuple ``(time_step, f0_res, uv_res)``
    """
    global rmvpe
    if rmvpe is None:
        # Lazy-load the model and its checkpoint on first call.
        from rmvpe import RMVPE
        rmvpe = RMVPE(pathlib.Path(__file__).parent / 'assets' / 'rmvpe' / 'model.pt')
    f0 = rmvpe.infer_from_audio(wav_data, sample_rate=audio_sample_rate)
    uv = f0 == 0
    # Always interpolate before resampling so unvoiced gaps don't distort it.
    f0, uv = interp_f0(f0, uv)

    time_step = hop_size / audio_sample_rate
    length = (wav_data.shape[0] + hop_size - 1) // hop_size
    # 0.01 s is the RMVPE model's native frame period.
    f0_res = resample_align_curve(f0, 0.01, time_step, length)
    uv_res = resample_align_curve(uv.astype(np.float32), 0.01, time_step, length) > 0.5
    if not interp_uv:
        f0_res[uv_res] = 0
    return time_step, f0_res, uv_res
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def get_pitch(algorithm, wav_data, hop_size, audio_sample_rate, interp_uv=True):
    """Dispatch f0 extraction to the requested backend.

    :param algorithm: 'parselmouth' or 'rmvpe'
    :raises ValueError: for any other algorithm name
    :return: tuple ``(time_step, f0, uv)`` from the chosen extractor
    """
    if algorithm == 'parselmouth':
        extractor = get_pitch_parselmouth
    elif algorithm == 'rmvpe':
        extractor = get_pitch_rmvpe
    else:
        raise ValueError(f" [x] Unknown f0 extractor: {algorithm}")
    return extractor(wav_data, hop_size, audio_sample_rate, interp_uv=interp_uv)
|
variance-temp-solution/requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
click
|
| 2 |
+
librosa<0.10.0
|
| 3 |
+
numpy==1.23.5
|
| 4 |
+
praat-parselmouth==0.4.3
|
| 5 |
+
tqdm
|
variance-temp-solution/rmvpe/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .inference import RMVPE
|
variance-temp-solution/rmvpe/constants.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Constants shared by the RMVPE pitch-estimation modules.

SAMPLE_RATE = 16000  # sample rate the model expects (inference resamples to this)

N_CLASS = 360  # number of pitch classes output by the model head

N_MELS = 128          # mel-spectrogram channels fed to the network
MEL_FMIN = 30         # mel filterbank lower frequency bound (Hz)
MEL_FMAX = 8000       # mel filterbank upper frequency bound (Hz)
WINDOW_LENGTH = 1024  # STFT window length in samples
CONST = 1997.3794084376191  # offset used in cents/frequency conversion — presumably by utils; confirm
|
variance-temp-solution/rmvpe/deepunet.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from .constants import N_MELS
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ConvBlockRes(nn.Module):
    """Residual block of two 3x3 conv+BN+ReLU layers.

    When in/out channel counts differ, a 1x1 conv shortcut projects the
    input before the residual addition; otherwise the identity is used.
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super(ConvBlockRes, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=in_channels,
                      out_channels=out_channels,
                      kernel_size=(3, 3),
                      stride=(1, 1),
                      padding=(1, 1),
                      bias=False),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),

            nn.Conv2d(in_channels=out_channels,
                      out_channels=out_channels,
                      kernel_size=(3, 3),
                      stride=(1, 1),
                      padding=(1, 1),
                      bias=False),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )
        if in_channels != out_channels:
            # 1x1 projection so the residual addition is shape-compatible.
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
            self.is_shortcut = True
        else:
            self.is_shortcut = False

    def forward(self, x):
        if self.is_shortcut:
            return self.conv(x) + self.shortcut(x)
        else:
            return self.conv(x) + x
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class ResEncoderBlock(nn.Module):
    """Stack of ConvBlockRes units, optionally followed by average pooling.

    With ``kernel_size`` set, ``forward`` returns ``(features, pooled)`` —
    the pre-pool features are kept for U-Net skip connections. With
    ``kernel_size=None`` (as used in Intermediate), only the features
    are returned.
    """

    def __init__(self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01):
        super(ResEncoderBlock, self).__init__()
        self.n_blocks = n_blocks
        self.conv = nn.ModuleList()
        self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
        for i in range(n_blocks - 1):
            self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
        self.kernel_size = kernel_size
        if self.kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x):
        for i in range(self.n_blocks):
            x = self.conv[i](x)
        if self.kernel_size is not None:
            return x, self.pool(x)
        else:
            return x
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class ResDecoderBlock(nn.Module):
    """Decoder block: transposed-conv upsampling, skip concat, residual convs.

    The upsampled tensor is concatenated (channel-wise) with the matching
    encoder skip tensor, then refined by ``n_blocks`` ConvBlockRes units.
    """

    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
        super(ResDecoderBlock, self).__init__()
        # Output padding chosen so the upsampled size matches the skip tensor.
        out_padding = (0, 1) if stride == (1, 2) else (1, 1)
        self.n_blocks = n_blocks
        self.conv1 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=in_channels,
                               out_channels=out_channels,
                               kernel_size=(3, 3),
                               stride=stride,
                               padding=(1, 1),
                               output_padding=out_padding,
                               bias=False),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )
        self.conv2 = nn.ModuleList()
        # First unit takes 2*out_channels because of the skip concatenation.
        self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
        for i in range(n_blocks-1):
            self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))

    def forward(self, x, concat_tensor):
        x = self.conv1(x)
        x = torch.cat((x, concat_tensor), dim=1)
        for i in range(self.n_blocks):
            x = self.conv2[i](x)
        return x
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class Encoder(nn.Module):
    """U-Net encoder: input BN, then n_encoders downsampling ResEncoderBlocks.

    Channel count doubles and spatial size halves at each stage; the
    pre-pool feature of every stage is returned for decoder skip
    connections. ``latent_channels`` records (channels, size) per stage
    and is consumed by TimbreFilter.
    """

    def __init__(self, in_channels, in_size, n_encoders, kernel_size, n_blocks, out_channels=16, momentum=0.01):
        super(Encoder, self).__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []
        for i in range(self.n_encoders):
            self.layers.append(ResEncoderBlock(in_channels, out_channels, kernel_size, n_blocks, momentum=momentum))
            self.latent_channels.append([out_channels, in_size])
            in_channels = out_channels
            out_channels *= 2
            in_size //= 2
        # Final spatial size and channel count after the loop (out_channels
        # has already been doubled past the last stage).
        self.out_size = in_size
        self.out_channel = out_channels

    def forward(self, x):
        concat_tensors = []
        x = self.bn(x)
        for i in range(self.n_encoders):
            # Each block returns (pre-pool features, pooled output).
            _, x = self.layers[i](x)
            concat_tensors.append(_)
        return x, concat_tensors
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
class Intermediate(nn.Module):
    """Bottleneck between encoder and decoder: ResEncoderBlocks without pooling."""

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        super(Intermediate, self).__init__()
        self.n_inters = n_inters
        self.layers = nn.ModuleList()
        # kernel_size=None disables pooling inside ResEncoderBlock.
        self.layers.append(ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum))
        for i in range(self.n_inters-1):
            self.layers.append(ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum))

    def forward(self, x):
        for i in range(self.n_inters):
            x = self.layers[i](x)
        return x
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
class Decoder(nn.Module):
    """U-Net decoder: n_decoders upsampling ResDecoderBlocks, halving channels.

    Skip tensors from the encoder are consumed in reverse order (deepest
    first).
    """

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList()
        self.n_decoders = n_decoders
        for i in range(self.n_decoders):
            out_channels = in_channels // 2
            self.layers.append(ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum))
            in_channels = out_channels

    def forward(self, x, concat_tensors):
        for i in range(self.n_decoders):
            # Pair stage i with the encoder skip from the opposite end.
            x = self.layers[i](x, concat_tensors[-1-i])
        return x
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class TimbreFilter(nn.Module):
    """Per-level ConvBlockRes filters over a list of latent tensors.

    One channel-preserving ConvBlockRes is built per entry of
    ``latent_rep_channels`` (as produced by Encoder.latent_channels).
    NOTE(review): DeepUnet0 constructs this module but its forward path
    never calls it — it appears unused at inference; confirm before removal.
    """

    def __init__(self, latent_rep_channels):
        super(TimbreFilter, self).__init__()
        self.layers = nn.ModuleList()
        for latent_rep in latent_rep_channels:
            self.layers.append(ConvBlockRes(latent_rep[0], latent_rep[0]))

    def forward(self, x_tensors):
        out_tensors = []
        for i, layer in enumerate(self.layers):
            out_tensors.append(layer(x_tensors[i]))
        return out_tensors
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
class DeepUnet0(nn.Module):
    """Deep U-Net over mel spectrograms: encoder -> intermediate -> decoder."""

    def __init__(self, kernel_size, n_blocks, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16):
        super(DeepUnet0, self).__init__()
        self.encoder = Encoder(in_channels, N_MELS, en_de_layers, kernel_size, n_blocks, en_out_channels)
        self.intermediate = Intermediate(self.encoder.out_channel // 2, self.encoder.out_channel, inter_layers, n_blocks)
        # Constructed but not used in forward (kept for checkpoint compatibility).
        self.tf = TimbreFilter(self.encoder.latent_channels)
        # NOTE(review): Decoder's third parameter is named `stride`; passing
        # kernel_size here matches upstream RMVPE usage where both are (2, 2).
        self.decoder = Decoder(self.encoder.out_channel, en_de_layers, kernel_size, n_blocks)

    def forward(self, x):
        x, concat_tensors = self.encoder(x)
        x = self.intermediate(x)
        x = self.decoder(x, concat_tensors)
        return x
|
variance-temp-solution/rmvpe/inference.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
from torchaudio.transforms import Resample
|
| 4 |
+
|
| 5 |
+
from .constants import *
|
| 6 |
+
from .model import E2E0
|
| 7 |
+
from .spec import MelSpectrogram
|
| 8 |
+
from .utils import to_local_average_f0, to_viterbi_f0
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class RMVPE:
    """Inference wrapper for the RMVPE pitch-estimation model.

    Loads the E2E0 network from a checkpoint, extracts mel spectrograms at
    16 kHz (resampling the input if needed), and decodes the network's
    salience output into an f0 curve in Hz.
    """

    def __init__(self, model_path, hop_length=160):
        # Cached Resample transforms keyed by source sample rate.
        self.resample_kernel = {}
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = E2E0(4, 1, (2, 2)).eval().to(self.device)
        ckpt = torch.load(model_path, map_location=self.device)
        # strict=False tolerates checkpoint keys absent from this model
        # definition (and vice versa).
        self.model.load_state_dict(ckpt['model'], strict=False)
        self.mel_extractor = MelSpectrogram(
            N_MELS, SAMPLE_RATE, WINDOW_LENGTH, hop_length, None, MEL_FMIN, MEL_FMAX
        ).to(self.device)

    @torch.no_grad()
    def mel2hidden(self, mel):
        """Run the network on a mel spectrogram, padding frames to a multiple of 32."""
        n_frames = mel.shape[-1]
        mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='constant')
        hidden = self.model(mel)
        # Drop the padded frames from the output.
        return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03, use_viterbi=False):
        """Convert network salience output to f0, via Viterbi or local averaging."""
        if use_viterbi:
            f0 = to_viterbi_f0(hidden, thred=thred)
        else:
            f0 = to_local_average_f0(hidden, thred=thred)
        return f0

    def infer_from_audio(self, audio, sample_rate=16000, thred=0.03, use_viterbi=False):
        """Estimate f0 (Hz) from a mono numpy waveform at any sample rate.

        :param audio: 1-D numpy waveform
        :param sample_rate: sample rate of ``audio``; resampled to 16 kHz if different
        :param thred: salience threshold below which frames are unvoiced
        :param use_viterbi: use Viterbi decoding instead of local averaging
        """
        audio = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
        if sample_rate == 16000:
            audio_res = audio
        else:
            # Build (and cache) a resampler for this source rate on demand.
            key_str = str(sample_rate)
            if key_str not in self.resample_kernel:
                self.resample_kernel[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128)
                self.resample_kernel[key_str] = self.resample_kernel[key_str].to(self.device)
            audio_res = self.resample_kernel[key_str](audio)
        mel = self.mel_extractor(audio_res, center=True)
        hidden = self.mel2hidden(mel)
        f0 = self.decode(hidden, thred=thred, use_viterbi=use_viterbi)
        return f0
|
variance-temp-solution/rmvpe/model.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torch import nn
|
| 2 |
+
|
| 3 |
+
from .constants import *
|
| 4 |
+
from .deepunet import DeepUnet0
|
| 5 |
+
from .seq import BiGRU
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class E2E0(nn.Module):
    """End-to-end RMVPE network: DeepUnet0 backbone + conv + (Bi)GRU/linear head.

    Takes a mel spectrogram and outputs per-frame pitch-class probabilities
    (sigmoid over N_CLASS bins). ``n_gru`` selects a BiGRU head (truthy)
    or a plain linear head.
    """

    def __init__(self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1,
                 en_out_channels=16):
        super(E2E0, self).__init__()
        self.unet = DeepUnet0(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels)
        # Project U-Net features to 3 channels before flattening over mels.
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
        if n_gru:
            self.fc = nn.Sequential(
                BiGRU(3 * N_MELS, 256, n_gru),
                nn.Linear(512, N_CLASS),
                nn.Dropout(0.25),
                nn.Sigmoid()
            )
        else:
            self.fc = nn.Sequential(
                nn.Linear(3 * N_MELS, N_CLASS),
                nn.Dropout(0.25),
                nn.Sigmoid()
            )

    def forward(self, mel):
        # (batch, mels, frames) -> (batch, 1, frames, mels) for the 2-D CNN.
        mel = mel.transpose(-1, -2).unsqueeze(1)
        # Flatten channel x mel dims into per-frame feature vectors.
        x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
        x = self.fc(x)
        return x
|
variance-temp-solution/rmvpe/seq.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class BiGRU(nn.Module):
    """Bidirectional GRU that returns only the output sequence.

    Output feature size is ``2 * hidden_features`` (forward + backward
    directions concatenated).
    """

    def __init__(self, input_features, hidden_features, num_layers):
        super(BiGRU, self).__init__()
        self.gru = nn.GRU(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True)

    def forward(self, x):
        # Discard the final hidden state; only the per-step outputs are used.
        output, _ = self.gru(x)
        return output
|
variance-temp-solution/rmvpe/spec.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
from librosa.filters import mel
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class MelSpectrogram(torch.nn.Module):
    """Log-mel spectrogram extractor with optional key shift and time stretch.

    A key shift scales the FFT/window sizes by 2**(keyshift/12) (pitch shift
    via spectral resampling); `speed` scales the hop length (time stretch).
    The HTK-style mel filterbank is built once at construction and registered
    as a buffer so it follows the module across devices.
    """

    def __init__(
            self,
            n_mel_channels,
            sampling_rate,
            win_length,
            hop_length,
            n_fft=None,
            mel_fmin=0,
            mel_fmax=None,
            clamp=1e-5
    ):
        """
        Args:
            n_mel_channels: number of mel bands.
            sampling_rate: audio sample rate in Hz.
            win_length: STFT window length in samples.
            hop_length: STFT hop length in samples.
            n_fft: FFT size; defaults to win_length when None.
            mel_fmin / mel_fmax: filterbank frequency range in Hz.
            clamp: floor applied before the log to avoid log(0).
        """
        super().__init__()
        n_fft = win_length if n_fft is None else n_fft
        # Cache of Hann windows, keyed by (keyshift, device) string — windows
        # differ per keyshift because the window length is rescaled.
        self.hann_window = {}
        mel_basis = mel(
            sr=sampling_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
            htk=True)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)
        # Fix: n_fft was already resolved above; the original recomputed
        # `win_length if n_fft is None else n_fft` redundantly here.
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp

    def forward(self, audio, keyshift=0, speed=1, center=True):
        """Return the log-mel spectrogram of `audio`.

        Args:
            audio: waveform tensor, shape (batch, samples).
            keyshift: pitch shift in semitones (rescales FFT/window size).
            speed: time-stretch factor (rescales hop length).
            center: passed through to torch.stft.
        Returns:
            log-mel spectrogram, shape (batch, n_mel_channels, frames).
        """
        factor = 2 ** (keyshift / 12)
        n_fft_new = int(np.round(self.n_fft * factor))
        win_length_new = int(np.round(self.win_length * factor))
        hop_length_new = int(np.round(self.hop_length * speed))

        # One Hann window per (keyshift, device) pair, created lazily.
        keyshift_key = str(keyshift) + '_' + str(audio.device)
        if keyshift_key not in self.hann_window:
            self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device)

        fft = torch.stft(
            audio,
            n_fft=n_fft_new,
            hop_length=hop_length_new,
            win_length=win_length_new,
            window=self.hann_window[keyshift_key],
            center=center,
            return_complex=True
        )
        magnitude = fft.abs()

        if keyshift != 0:
            # Crop (or zero-pad) the shifted spectrum back to the canonical
            # bin count, and compensate for the changed window length.
            size = self.n_fft // 2 + 1
            resize = magnitude.size(1)
            if resize < size:
                magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new

        mel_output = torch.matmul(self.mel_basis, magnitude)
        log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
        return log_mel_spec
|
variance-temp-solution/rmvpe/utils.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import librosa
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
from .constants import *
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def to_local_average_f0(hidden, center=None, thred=0.03):
    """Decode per-frame F0 in Hz from bin scores by local weighted averaging.

    Args:
        hidden: (B, T, N_CLASS) per-bin scores.
        center: optional (B, T, 1) bin indices to average around; defaults to
            the per-frame argmax of `hidden`.
        thred: frames whose peak score is below this are unvoiced (f0 = 0).
    Returns:
        numpy array of F0 values with the batch dim squeezed out.
    """
    bins = torch.arange(N_CLASS, device=hidden.device)[None, None, :]  # (1, 1, N)
    bin_cents = bins * 20 + CONST  # cents value represented by each bin
    if center is None:
        center = torch.argmax(hidden, dim=2, keepdim=True)  # (B, T, 1)
    # Restrict the weighted average to a 9-bin window around the center.
    lo = torch.clamp(center - 4, min=0)
    hi = torch.clamp(center + 5, max=N_CLASS)
    window = (bins >= lo) & (bins < hi)  # (B, T, N)
    w = hidden * window
    w_total = torch.sum(w, dim=2)  # (B, T)
    # Adding (w_total == 0) guards the division when the window sums to zero.
    cents = torch.sum(w * bin_cents, dim=2) / (w_total + (w_total == 0))
    f0 = 10 * 2 ** (cents / 1200)  # cents -> Hz, 10 Hz reference
    unvoiced = hidden.max(dim=2)[0] < thred  # (B, T)
    f0 = f0 * ~unvoiced
    return f0.squeeze(0).cpu().numpy()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def to_viterbi_f0(hidden, thred=0.03):
    """Decode F0 with Viterbi smoothing over the per-bin scores.

    Builds (and memoizes on the function object) a banded transition matrix
    favoring small bin-to-bin jumps, runs librosa's Viterbi decoder to obtain
    a smooth center path, then refines it via local averaging.
    """
    # Lazily build the transition matrix once; jumps beyond 30 bins get
    # zero probability, closer jumps are linearly more likely.
    transition = getattr(to_viterbi_f0, 'transition', None)
    if transition is None:
        xx, yy = np.meshgrid(range(N_CLASS), range(N_CLASS))
        transition = np.maximum(30 - abs(xx - yy), 0)
        transition = transition / transition.sum(axis=1, keepdims=True)
        to_viterbi_f0.transition = transition

    # librosa expects a (states, frames) matrix normalized per frame.
    prob = hidden.squeeze(0).cpu().numpy().T
    prob = prob / prob.sum(axis=0)

    path = librosa.sequence.viterbi(prob, transition).astype(np.int64)
    center = torch.from_numpy(path).unsqueeze(0).unsqueeze(-1).to(hidden.device)

    return to_local_average_f0(hidden, center=center, thred=thred)
|