diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..568459e003f3960cb46a4ca6a01396e49b98f067 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,9 @@ +*.hdf5 filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text +*.cWG5V7 filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.pdf filter=lfs diff=lfs merge=lfs -text +*.exe filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d27883cebe792766d06fa700b3ea593398edc5ac --- /dev/null +++ b/.gitignore @@ -0,0 +1,40 @@ +__pycache__/ + +runtime/ +.venv/ +venv/ +.venv_linux/ +.vscode/ + +*_pretrain/ +crepe/assets/full.pth + +chkpt/ +data_svc/ +dataset_raw/ +files/ +logs/ + +sovits5.0.pth +svc_out_pit.wav +svc_out.wav +svc_tmp.pit.csv +svc_tmp.ppg.npy +svc_tmp.vec.npy +test.wav + +so-vits-svc-5.0-*.zip + +# Ignore model checkpoints and large audio arrays +*.pt +*.pth +model_1200000.safetensors +*.wav +chkpt/ +chkpt_cfm/ +logs/ + +opensinger/ +dataset_raw_old/ +data_svc_infer/ +stable-audio-tools/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6ebe0ff106911262124772a8f199fe9c9e68e585 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 PlayVoice + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this 
permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100755 index 0000000000000000000000000000000000000000..ef2d847aa33fde6f99b044e47a936d2c9d200854 --- /dev/null +++ b/README.md @@ -0,0 +1,283 @@ +# CFM-SVC / F5-SVC — Singing Voice Conversion + +Two implementations of a flow-matching-based Singing Voice Conversion (SVC) system. + +| | V1 (CFM-SVC) | V2 (F5-SVC) | +|---|---|---| +| Backbone | DiT trained from scratch | F5-TTS pretrained (LoRA) | +| Output space | DAC codec latents (1024-dim) | Log-mel spectrogram (100-dim) | +| Vocoder | DAC decoder (frozen) | Vocos (frozen) | +| Params trained | ~82M | ~5M (adapter + LoRA) | +| Training data | Multi-speaker singing | Multi-speaker singing | +| Speaker adaptation | Speaker d-vector | Stage 2: spk_proj on speech clips | + +--- + +## Project Structure + +``` +matcha_svc/ +├── models/ +│ ├── cfm.py V1: Diffusion Transformer (DiT) +│ ├── cond_encoder.py V1: PPG+HuBERT+F0+Speaker → conditioning +│ ├── codec_wrapper.py V1: DAC codec + projector head +│ ├── svc_cond_adapter.py V2: PPG+HuBERT+F0+Speaker → F5-TTS text_dim +│ ├── lora_utils.py V2: LoRALinear, inject_lora(), freeze_non_lora() +│ └── f5_svc.py V2: F5SVCModel wrapper + build_f5svc() factory +│ +├── losses/ +│ └── cfm_loss.py V1: flow matching + projector commitment loss +│ +├── svc_data/ +│ └── mel_svc_dataset.py V2: log-mel dataset (same directory layout as V1) +│ +├── train_cfm.py V1 training script +├── 
train_f5_stage1.py V2 Stage 1: SVCCondAdapter + LoRA on singing data +├── train_f5_stage2.py V2 Stage 2: spk_proj on target speaker speech +├── infer_f5_svc.py V2 inference: Euler sampling → Vocos → .wav +├── submit_train.sh SLURM job script for V1 +│ +├── data_svc/ Preprocessed features (generated by svc_preprocessing.py) +│ ├── audio//.wav +│ ├── whisper//.ppg.npy +│ ├── hubert//.vec.npy +│ ├── pitch//.pit.npy +│ ├── speaker//.spk.npy +│ └── codec_targets//.pt ← V1 only +│ +├── chkpt_cfm/ V1 checkpoints +└── chkpt_f5svc/ V2 checkpoints +``` + +--- + +## Prerequisites + +```bash +python -m venv .venv +source .venv/bin/activate # or .venv\Scripts\activate on Windows + +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 +pip install -r requirements.txt +pip install descript-audio-codec # V1 +pip install f5-tts vocos safetensors huggingface_hub # V2 +``` + +Pretrained feature extractors (shared by V1 and V2): + +| File | Destination | +|---|---| +| `best_model.pth.tar` (Speaker encoder) | `speaker_pretrain/` | +| `large-v2.pt` (Whisper) | `whisper_pretrain/` | +| `hubert-soft-0d54a1f4.pt` | `hubert_pretrain/` | +| `full.pth` (CREPE) | `crepe/assets/` | + +--- + +## Data Preparation (shared by V1 and V2) + +### 1. Raw audio layout + +``` +dataset_raw/ +├── speaker0/ +│ ├── 000001.wav +│ └── ... +└── speaker1/ + └── ... +``` + +Clips should be clean vocals, < 30 seconds, no accompaniment. +Use UVR for source separation and audio-slicer for cutting. + +### 2. Extract features + +```bash +python svc_preprocessing.py -t 2 +``` + +Produces under `data_svc/`: +- `whisper//.ppg.npy` — Whisper PPG (1280-dim, 50 Hz) +- `hubert//.vec.npy` — HuBERT (256-dim, 50 Hz) +- `pitch//.pit.npy` — F0 in Hz (50 Hz, 0 = unvoiced) +- `speaker//.spk.npy` — Speaker d-vector (256-dim) + +### 3. 
V1 only: extract codec targets + +```bash +python data/codec_targets.py -w ./data_svc/waves-32k -o ./data_svc/codec_targets +``` + +V2 computes mel spectrograms on-the-fly from the raw audio — no offline codec step needed. + +--- + +## V1: CFM-SVC (Training from Scratch) + +### Train + +```bash +python train_cfm.py \ + --data_dir ./data_svc/codec_targets \ + --batch_size 64 \ + --lr 2e-5 \ + --epochs 250 \ + --save_interval 1 + +# or via SLURM: +sbatch submit_train.sh +``` + +Training automatically resumes from the latest checkpoint in `chkpt_cfm/`. + +Key arguments: + +| Argument | Default | Description | +|---|---|---| +| `--lr` | `1e-4` | Learning rate | +| `--batch_size` | `2` | Batch size | +| `--grad_accum` | `1` | Gradient accumulation steps | +| `--grad_clip` | `1.0` | Gradient clip max norm | +| `--save_interval` | `50` | Save every N epochs | +| `--use_checkpointing` | off | Enable gradient checkpointing (saves VRAM) | +| `--freeze_norm` | off | Freeze latent norm stats (for fine-tuning) | + +### Inference (V1) + +```bash +python infer.py --wave /path/to/source_singing.wav +``` + +--- + +## V2: F5-SVC (LoRA on F5-TTS) + +### Architecture + +- F5-TTS's DiT is loaded with pretrained weights and kept mostly frozen. +- `SVCCondAdapter` replaces the text encoder: PPG + HuBERT + F0 + speaker → (B, T, 512). +- LoRA (rank 16) is injected into every DiT attention projection (Q, K, V, Out). +- Vocos decodes mel spectrograms to audio. +- Two-stage training protocol: + - **Stage 1** (singing): SVCCondAdapter + LoRA trained on multi-speaker singing data. + - **Stage 2** (per-speaker): only `spk_proj` trained on the target speaker's speech clips. 
+ +### Download F5-TTS checkpoint + +```python +from huggingface_hub import hf_hub_download +path = hf_hub_download("SWivid/F5-TTS", "F5TTS_Base/model_1200000.safetensors") +print(path) +``` + +### Stage 1 — Singing Adaptation + +Trains: `SVCCondAdapter` (content projection + speaker projection) + LoRA adapters +Freezes: All other DiT weights + +```bash +python train_f5_stage1.py \ + --f5tts_ckpt /path/to/model_1200000.safetensors \ + --audio_dir ./data_svc/audio \ + --epochs 200 \ + --batch_size 16 \ + --lr 1e-4 + +# Checkpoints saved to ./chkpt_f5svc/stage1_epoch_N.pt +``` + +All PPG/HuBERT/F0/speaker features from V1 preprocessing are reused directly. +The only difference is the audio directory name: V1 produces `data_svc/waves-32k/` +while V2 defaults to `data_svc/audio/`. Pass `--audio_dir ./data_svc/waves-32k` to +reuse V1 audio (it is resampled to 24 kHz on-the-fly, no re-extraction needed). +The codec targets directory (`data_svc/codec_targets/`) is V1-only and not needed here. + +### Stage 2 — Per-Speaker Fine-tuning + +Trains: `svc_adapter.spk_proj` only +Freezes: DiT + LoRA (locked in from Stage 1) +Data: Speech clips of the target speaker (no singing required) + +```bash +python train_f5_stage2.py \ + --stage1_ckpt ./chkpt_f5svc/stage1_epoch_200.pt \ + --audio_dir ./data_svc/audio/my_speaker \ + --speaker_id my_speaker \ + --epochs 50 + +# Saved to ./chkpt_f5svc/stage2_my_speaker.pt +``` + +The target speaker's speech clips need the same feature extraction as Stage 1: +run `svc_preprocessing.py` pointing at the speech audio directory. 
+ +### Inference (V2) + +```bash +python infer_f5_svc.py \ + --ckpt ./chkpt_f5svc/stage1_epoch_200.pt \ + --source ./source_singing.wav \ + --target_spk ./data_svc/speaker/my_speaker/ref.spk.npy \ + --ref_audio ./data_svc/audio/my_speaker/ref.wav \ + --output ./converted.wav \ + --steps 32 +``` + +For a Stage 2 speaker-adapted checkpoint: +```bash +python infer_f5_svc.py \ + --ckpt ./chkpt_f5svc/stage2_my_speaker.pt \ + --source ./source_singing.wav \ + --target_spk ./data_svc/speaker/my_speaker/ref.spk.npy \ + --ref_audio ./data_svc/audio/my_speaker/ref.wav \ + --output ./converted.wav +``` + +Inference arguments: + +| Argument | Default | Description | +|---|---|---| +| `--ckpt` | required | Stage 1 or Stage 2 checkpoint | +| `--source` | required | Source singing .wav | +| `--target_spk` | required | Target speaker .spk.npy | +| `--ref_audio` | `None` | Short .wav of target speaker for timbre reference | +| `--ref_sec` | `3.0` | Seconds of ref_audio to use | +| `--steps` | `32` | Euler ODE steps (more = higher quality, slower) | +| `--output` | `./converted.wav` | Output path | + +The source audio must have pre-extracted features (PPG, HuBERT, F0) in the standard +`data_svc/` directory structure. Run `svc_preprocessing.py` on the source if needed. 
+ +--- + +## Checkpoints + +V1 saves full model state per epoch to `chkpt_cfm/`: +``` +chkpt_cfm/ +├── dit_epoch_N.pt +├── cond_encoder_epoch_N.pt +├── projector_epoch_N.pt +├── ema_dit_epoch_N.pt +├── optimizer_epoch_N.pt +├── scheduler_epoch_N.pt +└── latent_norm.pt ← cached normalization stats +``` + +V2 saves adapter + LoRA state per epoch to `chkpt_f5svc/`: +``` +chkpt_f5svc/ +├── stage1_epoch_N.pt ← full model state (adapter + LoRA + frozen DiT) +│ also contains lora_only key for lightweight sharing +└── stage2_.pt ← speaker-adapted state +``` + +--- + +## References + +- Rectified Flow / Flow Matching +- F5-TTS: [SWivid/F5-TTS](https://github.com/SWivid/F5-TTS) +- Vocos vocoder: [hubert-whisper/vocos](https://github.com/hubert-whisper/vocos) +- DAC: [descriptinc/descript-audio-codec](https://github.com/descriptinc/descript-audio-codec) +- so-vits-svc-5.0: preprocessing pipeline diff --git a/README_OLD.md b/README_OLD.md new file mode 100755 index 0000000000000000000000000000000000000000..1eb9076eb8d52e252f36b152571a61507bafd17d --- /dev/null +++ b/README_OLD.md @@ -0,0 +1,382 @@ +
+

Variational Inference with adversarial learning for end-to-end Singing Voice Conversion based on VITS

+ +[![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/maxmax20160403/sovits5.0) +GitHub Repo stars +GitHub forks +GitHub issues +GitHub + +
+ +- This project targets deep learning beginners, basic knowledge of Python and PyTorch are the prerequisites for this project; +- This project aims to help deep learning beginners get rid of boring pure theoretical learning, and master the basic knowledge of deep learning by combining it with practices; +- This project does not support real-time voice converting; (need to replace whisper if real-time voice converting is what you are looking for) +- This project will not develop one-click packages for other purposes; + +![vits-5.0-frame](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/3854b281-8f97-4016-875b-6eb663c92466) + +- 6GB low minimum VRAM requirement for training + +- support for multiple speakers + +- create unique speakers through speaker mixing + +- even voices with light accompaniment can also be converted + +- F0 can be edited using Excel + +https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/6a09805e-ab93-47fe-9a14-9cbc1e0e7c3a + +Powered by [@ShadowVap](https://space.bilibili.com/491283091) + +## Model properties + +| Feature | From | Status | Function | +| :--- | :--- | :--- | :--- | +| whisper | OpenAI | ✅ | strong noise immunity | +| bigvgan | NVIDA | ✅ | alias and snake | The formant is clearer and the sound quality is obviously improved | +| natural speech | Microsoft | ✅ | reduce mispronunciation | +| neural source-filter | NII | ✅ | solve the problem of audio F0 discontinuity | +| speaker encoder | Google | ✅ | Timbre Encoding and Clustering | +| GRL for speaker | Ubisoft |✅ | Preventing Encoder Leakage Timbre | +| SNAC | Samsung | ✅ | One Shot Clone of VITS | +| SCLN | Microsoft | ✅ | Improve Clone | +| PPG perturbation | this project | ✅ | Improved noise immunity and de-timbre | +| HuBERT perturbation | this project | ✅ | Improved noise immunity and de-timbre | +| VAE perturbation | this project | ✅ | Improve sound quality | +| MIX encoder | this project | ✅ | Improve conversion stability | +| USP infer | this project 
| ✅ | Improve conversion stability | + +due to the use of data perturbation, it takes longer to train than other projects. + +**USP : Unvoice and Silence with Pitch when infer** +![vits_svc_usp](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/ba733b48-8a89-4612-83e0-a0745587d150) + +## Quick Installation + +```PowerShell +# clone project +git clone https://github.com/ouor/so-vits-svc-5.0 + +# create virtual environment +python -m venv .venv + +# activate virtual environment +.venv\Scripts\activate + +# install pytorch +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 + +# install dependencies +pip install -r requirements.txt + +# run app.py +python app.py +``` + +## Setup Environment + +1. Install [PyTorch](https://pytorch.org/get-started/locally/). + +2. Install project dependencies + ```shell + pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt + ``` + **Note: whisper is already built-in, do not install it again otherwise it will cause conflict and error** +3. Download the Timbre Encoder: [Speaker-Encoder by @mueller91](https://drive.google.com/drive/folders/15oeBYf6Qn1edONkVLXe82MzdIi3O_9m3), put `best_model.pth.tar` into `speaker_pretrain/`. + +4. Download whisper model [whisper-large-v2](https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt). Make sure to download `large-v2.pt`, put it into `whisper_pretrain/`. + +5. Download [hubert_soft model](https://github.com/bshall/hubert/releases/tag/v0.1), put `hubert-soft-0d54a1f4.pt` into `hubert_pretrain/`. + +6. Download pitch extractor [crepe full](https://github.com/maxrmorrison/torchcrepe/tree/master/torchcrepe/assets), put `full.pth` into `crepe/assets`. + +7. Download pretrain model [sovits5.0.pretrain.pth](https://github.com/PlayVoice/so-vits-svc-5.0/releases/tag/5.0/), and put it into `vits_pretrain/`. 
+ ```shell + python svc_inference.py --config configs/base.yaml --model ./vits_pretrain/sovits5.0.pretrain.pth --spk ./configs/singers/singer0001.npy --wave test.wav + ``` + +## Dataset preparation + +Necessary pre-processing: +1. Separate voice and accompaniment with [UVR](https://github.com/Anjok07/ultimatevocalremovergui) (skip if no accompaniment) +2. Cut audio input to shorter length with [slicer](https://github.com/flutydeer/audio-slicer), whisper takes input less than 30 seconds. +3. Manually check generated audio input, remove inputs shorter than 2 seconds or with obvious noise. +4. Adjust loudness if necessary, recommend Adobe Audition. +5. Put the dataset into the `dataset_raw` directory following the structure below. +``` +dataset_raw +├───speaker0 +│ ├───000001.wav +│ ├───... +│ └───000xxx.wav +└───speaker1 + ├───000001.wav + ├───... + └───000xxx.wav +``` + +## Data preprocessing +```shell +python svc_preprocessing.py -t 2 +``` +`-t`: threading, max number should not exceed CPU core count, usually 2 is enough. +After preprocessing you will get an output with the following structure. +``` +data_svc/ +└── waves-16k +│ └── speaker0 +│ │ ├── 000001.wav +│ │ └── 000xxx.wav +│ └── speaker1 +│ ├── 000001.wav +│ └── 000xxx.wav +└── waves-32k +│ └── speaker0 +│ │ ├── 000001.wav +│ │ └── 000xxx.wav +│ └── speaker1 +│ ├── 000001.wav +│ └── 000xxx.wav +└── pitch +│ └── speaker0 +│ │ ├── 000001.pit.npy +│ │ └── 000xxx.pit.npy +│ └── speaker1 +│ ├── 000001.pit.npy +│ └── 000xxx.pit.npy +└── hubert +│ └── speaker0 +│ │ ├── 000001.vec.npy +│ │ └── 000xxx.vec.npy +│ └── speaker1 +│ ├── 000001.vec.npy +│ └── 000xxx.vec.npy +└── whisper +│ └── speaker0 +│ │ ├── 000001.ppg.npy +│ │ └── 000xxx.ppg.npy +│ └── speaker1 +│ ├── 000001.ppg.npy +│ └── 000xxx.ppg.npy +└── speaker +│ └── speaker0 +│ │ ├── 000001.spk.npy +│ │ └── 000xxx.spk.npy +│ └── speaker1 +│ ├── 000001.spk.npy +│ └── 000xxx.spk.npy +└── singer + ├── speaker0.spk.npy + └── speaker1.spk.npy +``` + +1. 
Re-sampling + - Generate audio with a sampling rate of 16000Hz in `./data_svc/waves-16k` + ``` + python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000 + ``` + + - Generate audio with a sampling rate of 32000Hz in `./data_svc/waves-32k` + ``` + python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000 + ``` +2. Use 16K audio to extract pitch + ``` + python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch + ``` +3. Use 16K audio to extract ppg + ``` + python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper + ``` +4. Use 16K audio to extract hubert + ``` + python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert + ``` +5. Use 16k audio to extract timbre code + ``` + python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker + ``` +6. Extract the average value of the timbre code for inference; it can also replace a single audio timbre in generating the training index, and use it as the unified timbre of the speaker for training + ``` + python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer + ``` +7. Use 32k audio to extract the linear spectrum + ``` + python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs + ``` +8. Use 32k audio to generate training index + ``` + python prepare/preprocess_train.py + ``` +9. Training file debugging + ``` + python prepare/preprocess_zzz.py + ``` + +## Train +1. If fine-tuning based on the pre-trained model, you need to download the pre-trained model: [sovits5.0.pretrain.pth](https://github.com/PlayVoice/so-vits-svc-5.0/releases/tag/5.0). Put pretrained model under project root, change this line + ``` + pretrain: "./vits_pretrain/sovits5.0.pretrain.pth" + ``` + in `configs/base.yaml`, and adjust the learning rate appropriately, e.g. 5e-5. + + `batch_size`: for GPU with 6G VRAM, 6 is the recommended value, 8 will work but step speed will be much slower. +2. 
Start training + ``` + python svc_trainer.py -c configs/base.yaml -n sovits5.0 + ``` +3. Resume training + ``` + python svc_trainer.py -c configs/base.yaml -n sovits5.0 -p chkpt/sovits5.0/***.pth + ``` +4. Log visualization + ``` + tensorboard --logdir logs/ + ``` + +![sovits5 0_base](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/1628e775-5888-4eac-b173-a28dca978faa) + +![sovits_spec](https://github.com/PlayVoice/so-vits-svc-5.0/assets/16432329/c4223cf3-b4a0-4325-bec0-6d46d195a1fc) + +## Inference + +1. Export inference model: text encoder, Flow network, Decoder network + ``` + python svc_export.py --config configs/base.yaml --checkpoint_path chkpt/sovits5.0/***.pt + ``` +2. Inference + - if there is no need to adjust `f0`, just run the following command. + ``` + python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/your_singer.spk.npy --wave test.wav --shift 0 + ``` + - if `f0` will be adjusted manually, follow the steps: + 1. use whisper to extract content encoding, generate `test.ppg.npy`. + ``` + python whisper/inference.py -w test.wav -p test.ppg.npy + ``` + 2. use hubert to extract content vector, without using one-click reasoning, in order to reduce GPU memory usage + ``` + python hubert/inference.py -w test.wav -v test.vec.npy + ``` + 3. extract the F0 parameter to the csv text format, open the csv file in Excel, and manually modify the wrong F0 according to Audition or SonicVisualiser + ``` + python pitch/inference.py -w test.wav -p test.csv + ``` + 4. final inference + ``` + python svc_inference.py --config configs/base.yaml --model sovits5.0.pth --spk ./data_svc/singer/your_singer.spk.npy --wave test.wav --ppg test.ppg.npy --vec test.vec.npy --pit test.csv --shift 0 + ``` +3. 
Notes + + - when `--ppg` is specified, when the same audio is converted multiple times, it can avoid repeated extraction of audio content codes; if it is not specified, it will be automatically extracted; + + - when `--vec` is specified, when the same audio is converted multiple times, it can avoid repeated extraction of audio content codes; if it is not specified, it will be automatically extracted; + + - when `--pit` is specified, the manually tuned F0 parameter can be loaded; if not specified, it will be automatically extracted; + + - generate files in the current directory: svc_out.wav + +4. Arguments ref + + | args |--config | --model | --spk | --wave | --ppg | --vec | --pit | --shift | + | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | + | name | config path | model path | speaker | wave input | wave ppg | wave hubert | wave pitch | pitch shift | + +## Create singer +Named by pure coincidence: average -> ave -> eva; eve (eva) represents conception and reproduction. + +``` +python svc_eva.py +``` + +```python +eva_conf = { + './configs/singers/singer0022.npy': 0, + './configs/singers/singer0030.npy': 0, + './configs/singers/singer0047.npy': 0.5, + './configs/singers/singer0051.npy': 0.5, +} +``` + +the generated singer file will be `eva.spk.npy`. 
+ +## Data set + +| Name | URL | +| :--- | :--- | +|KiSing |http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/| +|PopCS |https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md| +|opencpop |https://wenet.org.cn/opencpop/download/| +|Multi-Singer |https://github.com/Multi-Singer/Multi-Singer.github.io| +|M4Singer |https://github.com/M4Singer/M4Singer/blob/master/apply_form.md| +|CSD |https://zenodo.org/record/4785016#.YxqrTbaOMU4| +|KSS |https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset| +|JVS MuSic |https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_music| +|PJS |https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus| +|JUST Song |https://sites.google.com/site/shinnosuketakamichi/publication/jsut-song| +|MUSDB18 |https://sigsep.github.io/datasets/musdb.html#musdb18-compressed-stems| +|DSD100 |https://sigsep.github.io/datasets/dsd100.html| +|Aishell-3 |http://www.aishelltech.com/aishell_3| +|VCTK |https://datashare.ed.ac.uk/handle/10283/2651| + +## Code sources and references + +https://github.com/facebookresearch/speech-resynthesis [paper](https://arxiv.org/abs/2104.00355) + +https://github.com/jaywalnut310/vits [paper](https://arxiv.org/abs/2106.06103) + +https://github.com/openai/whisper/ [paper](https://arxiv.org/abs/2212.04356) + +https://github.com/NVIDIA/BigVGAN [paper](https://arxiv.org/abs/2206.04658) + +https://github.com/mindslab-ai/univnet [paper](https://arxiv.org/abs/2106.07889) + +https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts/tree/master/project/01-nsf + +https://github.com/brentspell/hifi-gan-bwe + +https://github.com/mozilla/TTS + +https://github.com/bshall/soft-vc + +https://github.com/maxrmorrison/torchcrepe + +https://github.com/OlaWod/FreeVC [paper](https://arxiv.org/abs/2210.15418) + +[SNAC : Speaker-normalized Affine Coupling Layer in Flow-based Architecture for Zero-Shot 
Multi-Speaker Text-to-Speech](https://github.com/hcy71o/SNAC) + +[Adapter-Based Extension of Multi-Speaker Text-to-Speech Model for New Speakers](https://arxiv.org/abs/2211.00585) + +[AdaSpeech: Adaptive Text to Speech for Custom Voice](https://arxiv.org/pdf/2103.00993.pdf) + +[Cross-Speaker Prosody Transfer on Any Text for Expressive Speech Synthesis](https://github.com/ubisoft/ubisoft-laforge-daft-exprt) + +[Learn to Sing by Listening: Building Controllable Virtual Singer by Unsupervised Learning from Voice Recordings](https://arxiv.org/abs/2305.05401) + +[Adversarial Speaker Disentanglement Using Unannotated External Data for Self-supervised Representation Based Voice Conversion](https://arxiv.org/pdf/2305.09167.pdf) + +[Speaker normalization (GRL) for self-supervised speech emotion recognition](https://arxiv.org/abs/2202.01252) + +## Method of Preventing Timbre Leakage Based on Data Perturbation + +https://github.com/auspicious3000/contentvec/blob/main/contentvec/data/audio/audio_utils_1.py + +https://github.com/revsic/torch-nansy/blob/main/utils/augment/praat.py + +https://github.com/revsic/torch-nansy/blob/main/utils/augment/peq.py + +https://github.com/biggytruck/SpeechSplit2/blob/main/utils.py + +https://github.com/OlaWod/FreeVC/blob/main/preprocess_sr.py + +## Contributors + + + + + +## Relevant Projects +- [LoRA-SVC](https://github.com/PlayVoice/lora-svc): decoder only svc +- [NSF-BigVGAN](https://github.com/PlayVoice/NSF-BigVGAN): vocoder for more work diff --git a/README_V1.md b/README_V1.md new file mode 100755 index 0000000000000000000000000000000000000000..7b1d52c87ce8d3481aabba8c34768b09d4048c0c --- /dev/null +++ b/README_V1.md @@ -0,0 +1,102 @@ +Variational Inference with adversarial learning for end-to-end Singing Voice Conversion based on CFM + +This project targets deep learning beginners, basic knowledge of Python and PyTorch are the prerequisites for this project. 
This project implements a highly modular, mathematically rigorous Conditional Flow Matching (CFM) based Singing Voice Conversion (SVC) system using a pretrained codec and learned projection (Option C*). + +By replacing the VITS/VAE monoliths with a Diffusion Transformer (DiT) and an explicit codebook projector, we achieve stronger temporal dependency modeling and faster, more stable training without the overhead of learning an autoencoder from scratch. + +## Architecture Highlights +- **Frozen Pretrained Codec**: Uses a pretrained neural codec (e.g., DAC 44KHz) purely for encoding and decoding, freezing its weights to save VRAM. +- **Offline Data Processing**: `z_target` latents are extracted once before training, preventing massive CPU/GPU bottlenecks in dataloaders. +- **Diffusion Transformer (DiT)**: Velocity field prediction $v_\theta$ uses a DiT instead of 1D U-Nets for state-of-the-art long-sequence audio modeling. +- **Dual-Loss Formulation with Implied Targets**: Solves the mathematical trap of backpropagating through an ODE solver during training. Calculates projection commitments instantly via the target velocity. + +## Quick Installation + +```bash +# clone project +git clone https://github.com/ouor/so-vits-svc-5.0 + +# create virtual environment +python -m venv .venv + +# activate virtual environment +.venv\Scripts\activate + +# install pytorch +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 + +# install dependencies +pip install -r requirements.txt +pip install descript-audio-codec + +# run app.py (Gradio UI) +python ui_cfm.py +``` + +## Setup Environment + +- Download the Timbre Encoder: Speaker-Encoder by @mueller91, put `best_model.pth.tar` into `speaker_pretrain/`. +- Download whisper model whisper-large-v2. Make sure to download `large-v2.pt`, put it into `whisper_pretrain/`. +- Download hubert_soft model, put `hubert-soft-0d54a1f4.pt` into `hubert_pretrain/`. 
+- Download pitch extractor crepe full, put `full.pth` into `crepe/assets`. + +## Dataset preparation + +Necessary pre-processing: +1. Separate voice and accompaniment with UVR (skip if no accompaniment). +2. Cut audio input to shorter length with slicer (< 30s). +3. Put the dataset into the `dataset_raw` directory following the structure below. + +``` +dataset_raw +├───speaker0 +│ ├───000001.wav +│ └───000xxx.wav +└───speaker1 + ├───000001.wav + └───000xxx.wav +``` + +## Data preprocessing (Offline Shift) + +Unlike traditional VAE-based SVC which handles encoding in the dataloader, this pipeline pre-extracts both conditioning and quantized continuous vectors to save GPU resources. + +1. **Standard Extractors**: Extract PPG (Whisper), F0 (Crepe), and Speaker embeddings into their respective `data_svc/` folders: + ```bash + python svc_preprocessing.py -t 2 + ``` + +2. **Codec Targets Extraction**: Run the new offline generation script to pass all waveforms through the frozen codec and cache `z_target` tensors. + ```bash + python data/codec_targets.py -w ./data_svc/waves-32k -o ./data_svc/codec_targets + ``` + +## Train + +You will jointly train the DiT velocity network $v_\theta$ and the lightweight projection network $P(u)$. The heavy codec encoder/decoder remains entirely offline. + +```bash +# Start Training +python train_cfm.py +``` +*The training script utilizes the dual-loss schema (Flow matching MSE + Projector Commitment MSE) utilizing the implicit velocity targets rather than integrating an ODE. Models will automatically save to the `chkpt/` folder.* + +## Inference + +The inference pipeline extracts conditioning, samples the continuous spatial latent using your preferred ODE solver (Euler, Heun, RK4), snaps the sample back to codebook space using the projector, and finally decodes the waveform via the DAC codec. 
**Long audio inputs will automatically chunk into 30s segments to avoid VRAM overflow.** + +```bash +# Run Inference +python infer.py --wave /path/to/your/input.wav +``` + +### Notes on Inference Pipeline Components: +- The **ODE Solver** (`samplers/ode.py`) is modular. You can configure solver steps and methods (`solver='rk4'`) based on your quality-vs-speed needs. +- **Temporal Resampling** is handled automatically in `models/cond_encoder.py`, perfectly matching Whisper and Crepe conditionings to the target codec's continuous latent frame sequence length. + +## Code sources and references + +- Rectified Flow / Flow Matching literature +- Diffusion Transformers (DiT) based on [Peebles & Xie, 2022] +- Neural Audio Codecs (DAC / EnCodec) +- so-vits-svc-5.0 original repository components extracted for preprocessing diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..8539a72ff37ada8c67f5144b5c50a681cb0b998f --- /dev/null +++ b/app.py @@ -0,0 +1,356 @@ +import os +import subprocess +import yaml +import sys +import webbrowser +import gradio as gr +import shutil +import soundfile +import shlex + +class WebUI: + def __init__(self): + self.train_config_path = 'configs/train.yaml' + self.info = Info() + self.names = [] + self.names2 = [] + self.voice_names = [] + base_config_path = 'configs/base.yaml' + if not os.path.exists(self.train_config_path): + shutil.copyfile(base_config_path, self.train_config_path) + print("초기화 성공") + else: + print("준비됨") + self.main_ui() + + def main_ui(self): + with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.green)) as ui: + gr.Markdown('# so-vits-svc5.0 WebUI') + + with gr.Tab("학습"): + with gr.Accordion('학습 안내', open=False): + gr.Markdown(self.info.train) + + gr.Markdown('### 데이터셋 파일 복사') + with gr.Row(): + self.dataset_name = gr.Textbox(value='', placeholder='chopin', label='데이터셋 이름', info='데이터셋 화자의 이름을 입력하세요.', interactive=True) + self.dataset_src = gr.Textbox(value='', 
placeholder='C:/Users/Tacotron2/Downloads/chopin_dataset/', label='데이터셋 폴더', info='데이터셋 wav 파일이 있는 폴더를 지정하세요.', interactive=True) + self.bt_dataset_copy = gr.Button(value='복사', variant="primary") + + gr.Markdown('### 전처리 파라미터 설정') + with gr.Row(): + self.model_name = gr.Textbox(value='sovits5.0', label='model', info='모델명', interactive=True) + self.f0_extractor = gr.Dropdown(choices=['crepe'], value='crepe', label='f0_extractor', info='F0 추출기', interactive=True) + self.thread_count = gr.Slider(minimum=1, maximum=os.cpu_count(), step=1, value=2, label='thread_count', info='전처리 스레드 수', interactive=True) + + gr.Markdown('### 학습 파라미터 설정') + with gr.Row(): + self.learning_rate = gr.Number(value=5e-5, label='learning_rate', info='학습률', interactive=True) + self.batch_size = gr.Slider(minimum=1, maximum=50, step=1, value=6, label='batch_size', info='배치 크기', interactive=True) + self.epochs = gr.Textbox(value='100', label='epoch', info='학습 에포크 수', interactive=True) + with gr.Row(): + self.info_interval = gr.Number(value=50, label='info_interval', info='학습 로깅 간격(step}', interactive=True) + self.eval_interval = gr.Number(value=1, label='eval_interval', info='검증 세트 간격(epoch}', interactive=True) + self.save_interval = gr.Number(value=5, label='save_interval', info='체크포인트 저장 간격(epoch}', interactive=True) + self.keep_ckpts = gr.Number(value=5, label='keep_ckpts', info='최신 체크포인트 파일 유지 갯수(0은 모두 저장)',interactive=True) + with gr.Row(): + self.use_pretrained = gr.Checkbox(label="use_pretrained", info='사전학습모델 사용 여부', value=True, interactive=True, visible=False) + + gr.Markdown('### 학습 시작') + with gr.Row(): + self.bt_open_dataset_folder = gr.Button(value='데이터 세트 폴더 열기') + self.bt_onekey_train = gr.Button('원클릭 학습 시작', variant="primary") + self.bt_tb = gr.Button('Tensorboard 열기', variant="primary") + + gr.Markdown('### 학습 재개') + with gr.Row(): + self.resume_model = gr.Dropdown(choices=sorted(self.names), label='Resume training progress from checkpoints', info='체크포인트에서 학습 진행 재개', 
interactive=True) + with gr.Column(): + self.bt_refersh = gr.Button('새로 고침') + self.bt_resume_train = gr.Button('학습 재개', variant="primary") + + with gr.Tab("추론"): + + with gr.Accordion('추론 안내', open=False): + gr.Markdown(self.info.inference) + + gr.Markdown('### 추론 파라미터 설정') + with gr.Row(): + with gr.Column(): + self.keychange = gr.Slider(-12, 12, value=0, step=1, label='음높이 조절') + self.file_list = gr.Markdown(value="", label="파일 목록") + + with gr.Row(): + self.resume_model2 = gr.Dropdown(choices=sorted(self.names2), label='Select the model you want to export', + info='내보낼 모델 선택', interactive=True) + with gr.Column(): + self.bt_refersh2 = gr.Button(value='모델 및 사운드 새로 고침') + self.bt_out_model = gr.Button(value='모델 내보내기', variant="primary") + with gr.Row(): + self.resume_voice = gr.Dropdown(choices=sorted(self.voice_names), label='Select the sound file', + info='*.spk.npy 파일 선택', interactive=True) + with gr.Row(): + self.input_wav = gr.Audio(type='filepath', label='변환할 오디오 선택', source='upload') + with gr.Row(): + self.bt_infer = gr.Button(value='변환 시작', variant="primary") + with gr.Row(): + self.output_wav = gr.Audio(label='출력 오디오', interactive=False) + + self.bt_dataset_copy.click(fn=self.copydataset, inputs=[self.dataset_name, self.dataset_src]) + self.bt_open_dataset_folder.click(fn=self.openfolder) + self.bt_onekey_train.click(fn=self.onekey_training,inputs=[self.model_name, self.thread_count,self.learning_rate,self.batch_size, self.epochs, self.info_interval, self.eval_interval,self.save_interval, self.keep_ckpts, self.use_pretrained]) + self.bt_out_model.click(fn=self.out_model, inputs=[self.model_name, self.resume_model2]) + self.bt_tb.click(fn=self.tensorboard) + self.bt_refersh.click(fn=self.refresh_model, inputs=[self.model_name], outputs=[self.resume_model]) + self.bt_resume_train.click(fn=self.resume_train, inputs=[self.model_name, self.resume_model, self.epochs]) + self.bt_infer.click(fn=self.inference, inputs=[self.input_wav, self.resume_voice, 
self.keychange], outputs=[self.output_wav]) + self.bt_refersh2.click(fn=self.refresh_model_and_voice, inputs=[self.model_name],outputs=[self.resume_model2, self.resume_voice]) + + ui.launch(inbrowser=True) + + def copydataset(self, dataset_name, dataset_src): + assert dataset_name != '', '데이터셋 이름을 입력하세요' + assert dataset_src != '', '데이터셋 경로를 입력하세요' + assert os.path.isdir(dataset_src), '데이터셋 경로가 잘못되었습니다' + from glob import glob + wav_files = glob(os.path.join(dataset_src, '*.wav')) + assert len(wav_files) > 0, '데이터셋 경로에 wav 파일이 없습니다' + + import shutil + dst_dir = os.path.join('dataset_raw', dataset_name) + if not os.path.exists(dst_dir): os.makedirs(dst_dir, exist_ok=True) + for wav_file in wav_files: + shutil.copy(wav_file, dst_dir) + print('데이터셋 복사 완료') + + def openfolder(self): + if not os.path.exists('dataset_raw'): os.makedirs('dataset_raw', exist_ok=True) + try: + if sys.platform.startswith('win'): + os.startfile('dataset_raw') + elif sys.platform.startswith('linux'): + subprocess.call(['xdg-open', 'dataset_raw']) + elif sys.platform.startswith('darwin'): + subprocess.call(['open', 'dataset_raw']) + else: + print('폴더를 열지 못했습니다!') + except BaseException: + print('폴더를 열지 못했습니다!') + + def preprocessing(self, thread_count): + print('전처리 시작') + train_process = subprocess.Popen(f'{sys.executable} -u svc_preprocessing.py -t {str(thread_count)}', stdout=subprocess.PIPE) + while train_process.poll() is None: + output = train_process.stdout.readline().decode('utf-8') + print(output, end='') + + def create_config(self, model_name, learning_rate, batch_size, epochs, info_interval, eval_interval, save_interval, + keep_ckpts, use_pretrained): + with open("configs/train.yaml", "r") as f: + config = yaml.load(f, Loader=yaml.FullLoader) + config['train']['model'] = model_name + config['train']['learning_rate'] = learning_rate + config['train']['batch_size'] = batch_size + config['train']['epochs'] = int(epochs) + config["log"]["info_interval"] = int(info_interval) + 
config["log"]["eval_interval"] = int(eval_interval) + config["log"]["save_interval"] = int(save_interval) + config["log"]["keep_ckpts"] = int(keep_ckpts) + if use_pretrained: + config["train"]["pretrain"] = "vits_pretrain/sovits5.0.pretrain.pth" + else: + config["train"]["pretrain"] = "" + with open("configs/train.yaml", "w") as f: + yaml.dump(config, f) + return f"로그 파라미터를 다음으로 업데이트했습니다.{config['log']}" + + def training(self, model_name): + print('학습 시작') + print('학습을 수행하는 새로운 콘솔 창이 열립니다.') + print('학습 도중 학습을 중지하려면, 콘솔 창을 닫으세요.') + train_process = subprocess.Popen(f'{sys.executable} -u svc_trainer.py -c {self.train_config_path} -n {str(model_name)}', stdout=subprocess.PIPE, creationflags=subprocess.CREATE_NEW_CONSOLE) + while train_process.poll() is None: + output = train_process.stdout.readline().decode('utf-8') + print(output, end='') + + def onekey_training(self, model_name, thread_count, learning_rate, batch_size, epochs, info_interval, eval_interval, save_interval, keep_ckpts, use_pretrained): + print(model_name, thread_count, learning_rate, batch_size, epochs, info_interval, eval_interval, save_interval, keep_ckpts) + self.create_config(model_name, learning_rate, batch_size, epochs, info_interval, eval_interval, save_interval, keep_ckpts, use_pretrained) + self.preprocessing(thread_count) + self.training(model_name) + + def out_model(self, model_name, resume_model2): + print('모델 내보내기 시작') + try: + subprocess.Popen(f'{sys.executable} -u svc_export.py -c {self.train_config_path} -p "chkpt/{model_name}/{resume_model2}"',stdout=subprocess.PIPE) + print('모델 내보내기 성공') + except Exception as e: + print("에러 발생함:", e) + + + def tensorboard(self): + tensorboard_path = os.path.join(os.path.dirname(sys.executable), 'Scripts', 'tensorboard.exe') + print(tensorboard_path) + tb_process = subprocess.Popen(f'{tensorboard_path} --logdir=logs --port=6006', stdout=subprocess.PIPE, creationflags=subprocess.CREATE_NEW_CONSOLE) + webbrowser.open("http://localhost:6006") + + while 
tb_process.poll() is None: + output = tb_process.stdout.readline().decode('utf-8') + print(output) + + def refresh_model(self, model_name): + self.script_dir = os.path.dirname(os.path.abspath(__file__)) + self.model_root = os.path.join(self.script_dir, f"chkpt/{model_name}") + self.names = [] + try: + for self.name in os.listdir(self.model_root): + if self.name.endswith(".pt"): + self.names.append(self.name) + return {"choices": sorted(self.names), "__type__": "update"} + except FileNotFoundError: + return {"label": "모델 파일 누락", "__type__": "update"} + + def refresh_model2(self, model_name): + self.script_dir = os.path.dirname(os.path.abspath(__file__)) + self.model_root = os.path.join(self.script_dir, f"chkpt/{model_name}") + self.names2 = [] + try: + for self.name in os.listdir(self.model_root): + if self.name.endswith(".pt"): + self.names2.append(self.name) + return {"choices": sorted(self.names2), "__type__": "update"} + except FileNotFoundError as e: + return {"label": "모델 파일 누락", "__type__": "update"} + + def refresh_voice(self): + self.script_dir = os.path.dirname(os.path.abspath(__file__)) + self.model_root = os.path.join(self.script_dir, "data_svc/singer") + self.voice_names = [] + for self.name in os.listdir(self.model_root): + if self.name.endswith(".npy"): + self.voice_names.append(self.name) + return {"choices": sorted(self.voice_names), "__type__": "update"} + + def refresh_model_and_voice(self, model_name): + model_update = self.refresh_model2(model_name) + voice_update = self.refresh_voice() + return model_update, voice_update + + def resume_train(self, model_name, resume_model, epochs): + print('학습 재개') + with open("configs/train.yaml", "r") as f: + config = yaml.load(f, Loader=yaml.FullLoader) + config['epochs'] = epochs + with open("configs/train.yaml", "w") as f: + yaml.dump(config, f) + train_process = subprocess.Popen(f'{sys.executable} -u svc_trainer.py -c {self.train_config_path} -n {model_name} -p "chkpt/{model_name}/{resume_model}"', 
stdout=subprocess.PIPE, creationflags=subprocess.CREATE_NEW_CONSOLE) + while train_process.poll() is None: + output = train_process.stdout.readline().decode('utf-8') + print(output, end='') + + def inference(self, input, resume_voice, keychange): + if os.path.isfile('test.wav'): os.remove('test.wav') + self.train_config_path = 'configs/train.yaml' + print('추론 시작') + shutil.copy(input, ".") + input_name = os.path.basename(input) + os.rename(input_name, "test.wav") + input_name = "test.wav" + if not input_name.endswith(".wav"): + data, samplerate = soundfile.read(input_name) + input_name = input_name.rsplit(".", 1)[0] + ".wav" + soundfile.write(input_name, data, samplerate) + train_config_path = shlex.quote(self.train_config_path) + keychange = shlex.quote(str(keychange)) + cmd = [f'{sys.executable}', "-u", "svc_inference.py", "--config", train_config_path, "--model", "sovits5.0.pth", "--spk", + f"data_svc/singer/{resume_voice}", "--wave", "test.wav", "--shift", keychange, '--clean'] + train_process = subprocess.run(cmd, shell=False, capture_output=True, text=True) + print(train_process.stdout) + print(train_process.stderr) + print("추론 성공") + return "svc_out.wav" + + +class Info: + def __init__(self) -> None: + self.train = ''' +### 2023.7.11\n +@OOPPEENN(https://github.com/OOPPEENN)第一次编写\n +@thestmitsuk(https://github.com/thestmitsuki)二次补完\n +@OOPPEENN(https://github.com/OOPPEENN)is written for the first time\n +@thestmitsuki(https://github.com/thestmitsuki)Secondary completion + + ''' + self.inference = ''' +### 2023.7.11\n +@OOPPEENN(https://github.com/OOPPEENN)第一次编写\n +@thestmitsuk(https://github.com/thestmitsuki)二次补完\n +@OOPPEENN(https://github.com/OOPPEENN)is written for the first time\n +@thestmitsuki(https://github.com/thestmitsuki)Secondary completion + + ''' + +def check_pretrained(): + links = { + 'hubert_pretrain/hubert-soft-0d54a1f4.pt': 'https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt', + 
'speaker_pretrain/best_model.pth.tar': 'https://drive.google.com/uc?id=1UPjQ2LVSIt3o-9QMKMJcdzT8aZRZCI-E', + 'speaker_pretrain/config.json': 'https://raw.githubusercontent.com/PlayVoice/so-vits-svc-5.0/9d415f9d7c7c7a131b89ec6ff633be10739f41ed/speaker_pretrain/config.json', + 'whisper_pretrain/large-v2.pt': 'https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt', + 'crepe/assets/full.pth': 'https://github.com/maxrmorrison/torchcrepe/raw/master/torchcrepe/assets/full.pth', + 'vits_pretrain/sovits5.0.pretrain.pth': 'https://github.com/PlayVoice/so-vits-svc-5.0/releases/download/5.0/sovits5.0.pretrain.pth', + } + + links_to_download = {} + for path, link in links.items(): + if not os.path.isfile(path): + links_to_download[path] = link + + if len(links_to_download) == 0: + print("사전 학습 모델이 모두 존재합니다.") + return + + import gdown + import requests + + def download(url, path): + r = requests.get(url, allow_redirects=True) + open(path, 'wb').write(r.content) + + for path, url in links_to_download.items(): + if not os.path.exists(os.path.dirname(path)): + os.makedirs(os.path.dirname(path)) + print(f"사전 학습 모델 {path} 다운로드 중...") + if "drive.google.com" in url: + gdown.download(url, path, quiet=False) + else: + download(url, path) + print(f"사전 학습 모델 {path} 다운로드 완료") + + print("모든 사전 학습 모델이 다운로드 되었습니다.") + return + +def check_transformers(): + try: + import transformers + del transformers + except: + print("transformers 라이브러리를 설치합니다.") + os.system(f"{sys.executable} -m pip install transformers") + print("transformers 라이브러리 설치 완료") + return + +def check_tensorboard(): + try: + import tensorboard + del tensorboard + except: + print("tensorboard 라이브러리를 설치합니다.") + os.system(f"{sys.executable} -m pip install tensorboard") + print("tensorboard 라이브러리 설치 완료") + return + +if __name__ == "__main__": + check_pretrained() + check_transformers() + check_tensorboard() + webui = WebUI() diff --git 
a/automated_pipeline.sh b/automated_pipeline.sh new file mode 100644 index 0000000000000000000000000000000000000000..791d990947870c67dbbe330778c0f085c1bb078a --- /dev/null +++ b/automated_pipeline.sh @@ -0,0 +1,97 @@ +#!/bin/bash +#SBATCH --job-name=cfm_full_pipeline +#SBATCH --partition=a100 +#SBATCH --gres=gpu:1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64G +#SBATCH --time=120:00:00 +#SBATCH --output=logs/pipeline_%j.out +#SBATCH --error=logs/pipeline_%j.err +#SBATCH --mail-type=ALL +#SBATCH --mail-user=hl3025@imperial.ac.uk + +set -e # Exit on any error + +# Navigate to project directory +cd /vol/bitbucket/hl3025/cfm_svc + +# Activate environment +source .venv_linux/bin/activate + +# Export environment variables +export PIP_CACHE_DIR=/vol/bitbucket/hl3025/pip_cache +export TMPDIR=/vol/bitbucket/hl3025/tmp + +# Prevent BLAS/OpenMP from spawning too many threads +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 +export MKL_NUM_THREADS=1 +export VECLIB_MAXIMUM_THREADS=1 +export NUMEXPR_NUM_THREADS=1 + +# Force Python output to be unbuffered so logs stream instantly +export PYTHONUNBUFFERED=1 + +# Create logs directory if it doesn't exist +mkdir -p logs + +echo "======================================" +echo "Starting CFM SVC Automated Pipeline" +echo "======================================" +echo "Start time: $(date)" + +# ============================================================================ +# STAGE 1: Data Preprocessing +# ============================================================================ +echo "" +echo "STAGE 1: Data Preprocessing with 8 threads..." +echo "Time: $(date)" +python svc_preprocessing.py -t 8 + +# ============================================================================ +# STAGE 2: Codec Targets Generation +# ============================================================================ +echo "" +echo "STAGE 2: Generating Codec Targets..." 
+echo "Time: $(date)" +python data/codec_targets.py -w ./data_svc/waves-32k -o ./data_svc/codec_targets + +# ============================================================================ +# STAGE 3: Teacher Model Distillation (Offline) +# ============================================================================ +echo "" +echo "STAGE 3: Offline Teacher Distillation..." +echo "Time: $(date)" +python preprocess_teacher.py \ + --teacher_ckpt vits_pretrain/sovits5.0.pretrain.pth \ + --teacher_config configs/base.yaml \ + --codec_target_dir ./data_svc/codec_targets \ + --data_root ./data_svc \ + --out_dir ./data_svc/teacher_codec_targets \ + --log_interval 200 + +# ============================================================================ +# STAGE 4: CFM Training +# ============================================================================ +echo "" +echo "STAGE 4: CFM Training with Teacher Distillation..." +echo "Time: $(date)" +python train_cfm.py \ + --data_dir ./data_svc/codec_targets \ + --teacher_target_dir ./data_svc/teacher_codec_targets \ + --lambda_teacher 0 \ + --batch_size 16 \ + --lr 1e-4 \ + --num_workers 4 \ + --epochs 200 \ + --log_interval 50 \ + --save_interval 10 + +# ============================================================================ +# Pipeline Complete +# ============================================================================ +echo "" +echo "======================================" +echo "CFM SVC Automated Pipeline Complete!" 
+echo "======================================" +echo "End time: $(date)" diff --git a/build_faiss_index.py b/build_faiss_index.py new file mode 100644 index 0000000000000000000000000000000000000000..b990886d4804d3c88256735f151289faeb90a619 --- /dev/null +++ b/build_faiss_index.py @@ -0,0 +1,51 @@ +import argparse +import glob +import os +import faiss +import numpy as np +from tqdm import tqdm + +def build_index(speaker_dir, output_path): + print(f"Finding HuBERT features in {speaker_dir}...") + vec_files = glob.glob(os.path.join(speaker_dir, "*.vec.npy")) + + if not vec_files: + print(f"No .vec.npy files found in {speaker_dir}!") + return + + print(f"Found {len(vec_files)} files. Loading vectors...") + + all_vectors = [] + for f in tqdm(vec_files): + vec = np.load(f) # (T, 256) + all_vectors.append(vec) + + all_vectors = np.concatenate(all_vectors, axis=0).astype(np.float32) + print(f"Total frames: {all_vectors.shape[0]}, Feature dimension: {all_vectors.shape[1]}") + + # Initialize FAISS index + # We use IndexFlatL2 for exact nearest neighbor search based on L2 distance. + index = faiss.IndexFlatL2(all_vectors.shape[1]) + + print("Adding vectors to FAISS index...") + index.add(all_vectors) + + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + print(f"Saving index to {output_path}...") + faiss.write_index(index, output_path) + + # Save the original vectors as well so we can retrieve them and average them + vectors_path = output_path.replace(".index", "_vectors.npy") + print(f"Saving source vectors to {vectors_path}...") + np.save(vectors_path, all_vectors) + + print("Done!") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--speaker_dir", type=str, required=True, help="Path to speaker's HuBERT directory (e.g. data_svc/hubert/singer_0005)") + parser.add_argument("--output_path", type=str, required=True, help="Where to save the .index file (e.g. 
data_svc/hubert/singer_0005/feature.index)") + args = parser.parse_args() + + build_index(args.speaker_dir, args.output_path) diff --git a/build_mmap.py b/build_mmap.py new file mode 100644 index 0000000000000000000000000000000000000000..23559b475345c9515ddd416fc7ef76699a29b1e4 --- /dev/null +++ b/build_mmap.py @@ -0,0 +1,85 @@ +import os +import glob +import numpy as np +from tqdm import tqdm +import argparse + +def build_mmap(data_dir, feature_name, output_prefix): + """ + Combines all .npy files for a feature (e.g. hubert, ppg) into a single large + memory-mapped array alongside an index file for fast O(1) lookups. + """ + print(f"Finding {feature_name} features in {data_dir}...") + files = glob.glob(os.path.join(data_dir, "**", "*.npy"), recursive=True) + + if not files: + print(f"No {feature_name} files found!") + return + + # We don't want to load them all into RAM at once, so we do two passes. + # First pass: find total frames and index mapping. + print("Pass 1: calculating total frames and indexing...") + total_frames = 0 + dim = None + dtype = None + + index_map = {} # { filename: (start_idx, length) } + + valid_files = [] + + for f in tqdm(files): + # We can memory map just to get shape/dtype quickly + try: + arr = np.load(f, mmap_mode='r') + if dim is None: + dim = arr.shape[1] if len(arr.shape) > 1 else 1 + dtype = arr.dtype + + length = arr.shape[0] + + # Use relative path as key + rel_path = os.path.relpath(f, start=data_dir) + + index_map[rel_path] = (total_frames, length) + total_frames += length + valid_files.append((f, rel_path, length)) + + except Exception as e: + pass + + print(f"Total valid files: {len(valid_files)}") + print(f"Total frames: {total_frames}, Feature dim: {dim}") + + # Second pass: allocate mmap and write + mmap_path = f"{output_prefix}.npy" + index_path = f"{output_prefix}_index.npy" + + shape = (total_frames, dim) if dim > 1 else (total_frames,) + print(f"Allocating mmap at {mmap_path} with shape {shape}...") + + mmap_arr = 
np.lib.format.open_memmap(mmap_path, mode='w+', dtype=dtype, shape=shape) + + print("Pass 2: writing data to mmap...") + for f, rel_path, length in tqdm(valid_files): + start_idx, _ = index_map[rel_path] + + arr = np.load(f) + if dim == 1: + mmap_arr[start_idx : start_idx + length] = arr + else: + mmap_arr[start_idx : start_idx + length, :] = arr + + mmap_arr.flush() + + print(f"Saving index map to {index_path}...") + np.save(index_path, index_map) + print("Done!") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data_dir", type=str, required=True, help="Base dir (e.g. data_svc/hubert)") + parser.add_argument("--feature", type=str, required=True, help="Feature name for printing") + parser.add_argument("--out_prefix", type=str, required=True, help="Output path prefix (e.g. data_svc/hubert_mmap)") + args = parser.parse_args() + + build_mmap(args.data_dir, args.feature, args.out_prefix) diff --git a/compare_audio.py b/compare_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..7c1f6de79f47f749d5d2d361163ba1ad6cad85f8 --- /dev/null +++ b/compare_audio.py @@ -0,0 +1,20 @@ +import soundfile as sf +import librosa +import numpy as np + +wav_gt, sr = librosa.load('test_train_gt.wav', sr=44100) +wav_pred, _ = librosa.load('test_overfit_pe.wav', sr=44100) + +min_len = min(len(wav_gt), len(wav_pred)) + +# Calculate spectral diff +S_gt = np.abs(librosa.stft(wav_gt[:min_len])) +S_pred = np.abs(librosa.stft(wav_pred[:min_len])) + +diff = np.mean(np.abs(S_gt - S_pred)) +print("Spectral Mean Absolute Error:", diff) + +# Write a mix to see if they are identical but one is delayed +mix = np.zeros_like(wav_gt[:min_len]) +mix = (wav_gt[:min_len] + wav_pred[:min_len]) / 2 +sf.write('test_mix.wav', mix, 44100) diff --git a/configs/base.yaml b/configs/base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b1dfa07d3e49429aeb2084b92a1b6842bcc9f46 --- /dev/null +++ b/configs/base.yaml @@ -0,0 +1,71 
@@ +train: + model: "sovits" + seed: 1234 + epochs: 10 + learning_rate: 5e-5 + betas: [0.8, 0.99] + lr_decay: 0.999875 + eps: 1e-9 + batch_size: 2 + c_stft: 9 + c_mel: 1. + c_kl: 0.2 + port: 8001 + pretrain: "./vits_pretrain/sovits5.0.pretrain.pth" +############################# +data: + training_files: "files/train.txt" + validation_files: "files/valid.txt" + segment_size: 8000 # WARNING: base on hop_length + max_wav_value: 32768.0 + sampling_rate: 32000 + filter_length: 1024 + hop_length: 320 + win_length: 1024 + mel_channels: 100 + mel_fmin: 50.0 + mel_fmax: 16000.0 +############################# +vits: + ppg_dim: 1280 + vec_dim: 256 + spk_dim: 256 + gin_channels: 256 + inter_channels: 192 + hidden_channels: 192 + filter_channels: 640 +############################# +gen: + upsample_input: 192 + upsample_rates: [5,4,4,2,2] + upsample_kernel_sizes: [15,8,8,4,4] + upsample_initial_channel: 320 + resblock_kernel_sizes: [3,7,11] + resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] +############################# +mpd: + periods: [2,3,5,7,11] + kernel_size: 5 + stride: 3 + use_spectral_norm: False + lReLU_slope: 0.2 +############################# +mrd: + resolutions: "[(1024, 120, 600), (2048, 240, 1200), (4096, 480, 2400), (512, 50, 240)]" # (filter_length, hop_length, win_length) + use_spectral_norm: False + lReLU_slope: 0.2 +############################# +log: + info_interval: 100 + eval_interval: 10 + save_interval: 10 + num_audio: 6 + pth_dir: 'chkpt' + log_dir: 'logs' + keep_ckpts: 0 +############################# +dist_config: + dist_backend: "nccl" + dist_url: "tcp://localhost:54321" + world_size: 1 + diff --git a/configs/singers/singer0001.npy b/configs/singers/singer0001.npy new file mode 100644 index 0000000000000000000000000000000000000000..3c95d83dae5147fdc70d2c427dce80e3f124cae5 Binary files /dev/null and b/configs/singers/singer0001.npy differ diff --git a/configs/singers/singer0002.npy b/configs/singers/singer0002.npy new file mode 100644 index 
0000000000000000000000000000000000000000..96d7119554a83694a5a63597b6041b12d19f55ae Binary files /dev/null and b/configs/singers/singer0002.npy differ diff --git a/configs/singers/singer0003.npy b/configs/singers/singer0003.npy new file mode 100644 index 0000000000000000000000000000000000000000..659a6f1d2480e56aa7b66c5211cdfe161d8eccde Binary files /dev/null and b/configs/singers/singer0003.npy differ diff --git a/configs/singers/singer0004.npy b/configs/singers/singer0004.npy new file mode 100644 index 0000000000000000000000000000000000000000..ece16951c072a239ade24ad284d2189d2675b7ed Binary files /dev/null and b/configs/singers/singer0004.npy differ diff --git a/configs/singers/singer0005.npy b/configs/singers/singer0005.npy new file mode 100644 index 0000000000000000000000000000000000000000..9535e17f0493344301232a6b5d6fa42af60dffe0 Binary files /dev/null and b/configs/singers/singer0005.npy differ diff --git a/configs/singers/singer0006.npy b/configs/singers/singer0006.npy new file mode 100644 index 0000000000000000000000000000000000000000..703154e6ed42e1d5361d14dcb31e853f0976ba91 Binary files /dev/null and b/configs/singers/singer0006.npy differ diff --git a/configs/singers/singer0007.npy b/configs/singers/singer0007.npy new file mode 100644 index 0000000000000000000000000000000000000000..92c87c6fd83992ec8d88a338f8d0ad7e82e34345 Binary files /dev/null and b/configs/singers/singer0007.npy differ diff --git a/configs/singers/singer0008.npy b/configs/singers/singer0008.npy new file mode 100644 index 0000000000000000000000000000000000000000..eca296230c2b8ec99803ab6dff37ee3b0fffc9c0 Binary files /dev/null and b/configs/singers/singer0008.npy differ diff --git a/configs/singers/singer0009.npy b/configs/singers/singer0009.npy new file mode 100644 index 0000000000000000000000000000000000000000..9a2e7bfd460d5e96adb9508779d3d21cc90dbe11 Binary files /dev/null and b/configs/singers/singer0009.npy differ diff --git a/configs/singers/singer0010.npy 
b/configs/singers/singer0010.npy new file mode 100644 index 0000000000000000000000000000000000000000..b54fe9da1464b769c772bd64338c114b9114e593 Binary files /dev/null and b/configs/singers/singer0010.npy differ diff --git a/configs/singers/singer0011.npy b/configs/singers/singer0011.npy new file mode 100644 index 0000000000000000000000000000000000000000..6054d872bf1418c3ade49d135e0f196b7aadb9f5 Binary files /dev/null and b/configs/singers/singer0011.npy differ diff --git a/configs/singers/singer0012.npy b/configs/singers/singer0012.npy new file mode 100644 index 0000000000000000000000000000000000000000..416d3aacad37027bb27af262077a6d7bda597634 Binary files /dev/null and b/configs/singers/singer0012.npy differ diff --git a/configs/singers/singer0013.npy b/configs/singers/singer0013.npy new file mode 100644 index 0000000000000000000000000000000000000000..bf96f4ed81bd8ff2e8204e04252adcfb7e8f494b Binary files /dev/null and b/configs/singers/singer0013.npy differ diff --git a/configs/singers/singer0014.npy b/configs/singers/singer0014.npy new file mode 100644 index 0000000000000000000000000000000000000000..9ccef2f86207d4e1b158339f30c7de1a6f2a9b34 Binary files /dev/null and b/configs/singers/singer0014.npy differ diff --git a/configs/singers/singer0015.npy b/configs/singers/singer0015.npy new file mode 100644 index 0000000000000000000000000000000000000000..406fcfc1aa214ba9d6e2e06c3f8b2b86a046c1c5 Binary files /dev/null and b/configs/singers/singer0015.npy differ diff --git a/configs/singers/singer0016.npy b/configs/singers/singer0016.npy new file mode 100644 index 0000000000000000000000000000000000000000..8895a40fb131ccb22c47f50221faa035a5e22671 Binary files /dev/null and b/configs/singers/singer0016.npy differ diff --git a/configs/singers/singer0017.npy b/configs/singers/singer0017.npy new file mode 100644 index 0000000000000000000000000000000000000000..72de0be4e894a5f6d40cdd2cf22fb76c593d9189 Binary files /dev/null and b/configs/singers/singer0017.npy differ diff --git 
a/configs/singers/singer0018.npy b/configs/singers/singer0018.npy new file mode 100644 index 0000000000000000000000000000000000000000..fcdacf6c516023a6970d2e981f4f9cd11f22f814 Binary files /dev/null and b/configs/singers/singer0018.npy differ diff --git a/configs/singers/singer0019.npy b/configs/singers/singer0019.npy new file mode 100644 index 0000000000000000000000000000000000000000..86dcfbd6bcd1b449a61910fcfa73074996b8b468 Binary files /dev/null and b/configs/singers/singer0019.npy differ diff --git a/configs/singers/singer0020.npy b/configs/singers/singer0020.npy new file mode 100644 index 0000000000000000000000000000000000000000..629d6337ca293cf75c1e3089daa0fe53b6a93d34 Binary files /dev/null and b/configs/singers/singer0020.npy differ diff --git a/configs/singers/singer0021.npy b/configs/singers/singer0021.npy new file mode 100644 index 0000000000000000000000000000000000000000..be25ae64a5efd07234bdd09f50e435b1f1bea8fb Binary files /dev/null and b/configs/singers/singer0021.npy differ diff --git a/configs/singers/singer0022.npy b/configs/singers/singer0022.npy new file mode 100644 index 0000000000000000000000000000000000000000..2a8312709b1d555dfb4b43119ea15165db084354 Binary files /dev/null and b/configs/singers/singer0022.npy differ diff --git a/configs/singers/singer0023.npy b/configs/singers/singer0023.npy new file mode 100644 index 0000000000000000000000000000000000000000..a5c8ea0e25ce3fd68bba26871a07413992c14db3 Binary files /dev/null and b/configs/singers/singer0023.npy differ diff --git a/configs/singers/singer0024.npy b/configs/singers/singer0024.npy new file mode 100644 index 0000000000000000000000000000000000000000..c0e7a03531d69527f54abe9f77817bb328596590 Binary files /dev/null and b/configs/singers/singer0024.npy differ diff --git a/configs/singers/singer0025.npy b/configs/singers/singer0025.npy new file mode 100644 index 0000000000000000000000000000000000000000..d467c77c5b784d3ff06a03008bac9ae86223ae9f Binary files /dev/null and 
b/configs/singers/singer0025.npy differ diff --git a/configs/singers/singer0026.npy b/configs/singers/singer0026.npy new file mode 100644 index 0000000000000000000000000000000000000000..a462700e65f481fb429e7f59058e0fb282f3474c Binary files /dev/null and b/configs/singers/singer0026.npy differ diff --git a/configs/singers/singer0027.npy b/configs/singers/singer0027.npy new file mode 100644 index 0000000000000000000000000000000000000000..46aa2bccd9b60c359ec0f7d47f7dd2bd838252ea Binary files /dev/null and b/configs/singers/singer0027.npy differ diff --git a/configs/singers/singer0028.npy b/configs/singers/singer0028.npy new file mode 100644 index 0000000000000000000000000000000000000000..ddf0ca0ba09f1a62c10ecd4149ee8d407742de4a Binary files /dev/null and b/configs/singers/singer0028.npy differ diff --git a/configs/singers/singer0029.npy b/configs/singers/singer0029.npy new file mode 100644 index 0000000000000000000000000000000000000000..6c6a3a1c9b93775c6136121fe41a1dd01ea3556b Binary files /dev/null and b/configs/singers/singer0029.npy differ diff --git a/configs/singers/singer0030.npy b/configs/singers/singer0030.npy new file mode 100644 index 0000000000000000000000000000000000000000..1d87b27981fdd42b938d8b21e7d8fd6f69b73a8d Binary files /dev/null and b/configs/singers/singer0030.npy differ diff --git a/configs/singers/singer0031.npy b/configs/singers/singer0031.npy new file mode 100644 index 0000000000000000000000000000000000000000..ac6626018932f9665a440efda102e7a9bc10fa38 Binary files /dev/null and b/configs/singers/singer0031.npy differ diff --git a/configs/singers/singer0032.npy b/configs/singers/singer0032.npy new file mode 100644 index 0000000000000000000000000000000000000000..548d9f43fcf32b58a33dc7c31716b6ae03715f2a Binary files /dev/null and b/configs/singers/singer0032.npy differ diff --git a/configs/singers/singer0033.npy b/configs/singers/singer0033.npy new file mode 100644 index 
0000000000000000000000000000000000000000..f0d424b818645f287ab9d251d5a4a6357fadc3b4 Binary files /dev/null and b/configs/singers/singer0033.npy differ diff --git a/configs/singers/singer0034.npy b/configs/singers/singer0034.npy new file mode 100644 index 0000000000000000000000000000000000000000..2ee10691c4bb1a603b0214add72d6665df60b21b Binary files /dev/null and b/configs/singers/singer0034.npy differ diff --git a/configs/singers/singer0035.npy b/configs/singers/singer0035.npy new file mode 100644 index 0000000000000000000000000000000000000000..ed16dfdaf733c852e87cb8d10c5d5b850576620c Binary files /dev/null and b/configs/singers/singer0035.npy differ diff --git a/configs/singers/singer0036.npy b/configs/singers/singer0036.npy new file mode 100644 index 0000000000000000000000000000000000000000..f7a659f072a94bcf1a4a5354dd03c0757fcc4d7a Binary files /dev/null and b/configs/singers/singer0036.npy differ diff --git a/configs/singers/singer0037.npy b/configs/singers/singer0037.npy new file mode 100644 index 0000000000000000000000000000000000000000..36a597c57171873eea5ab95529ca4d9c797cfe3d Binary files /dev/null and b/configs/singers/singer0037.npy differ diff --git a/configs/singers/singer0038.npy b/configs/singers/singer0038.npy new file mode 100644 index 0000000000000000000000000000000000000000..2a0fe2ff60a402e86f988dcb7ba3ba8614b751e4 Binary files /dev/null and b/configs/singers/singer0038.npy differ diff --git a/configs/singers/singer0039.npy b/configs/singers/singer0039.npy new file mode 100644 index 0000000000000000000000000000000000000000..7b2a49d89d1d1a457f455518606b996e5e3ddf6a Binary files /dev/null and b/configs/singers/singer0039.npy differ diff --git a/configs/singers/singer0040.npy b/configs/singers/singer0040.npy new file mode 100644 index 0000000000000000000000000000000000000000..d3d75480d262557c2f0f6e4556c101638ca72b60 Binary files /dev/null and b/configs/singers/singer0040.npy differ diff --git a/configs/singers/singer0041.npy 
b/configs/singers/singer0041.npy new file mode 100644 index 0000000000000000000000000000000000000000..70fba34f2af4e64af3b7c85c5c61b6e2375430c8 Binary files /dev/null and b/configs/singers/singer0041.npy differ diff --git a/configs/singers/singer0042.npy b/configs/singers/singer0042.npy new file mode 100644 index 0000000000000000000000000000000000000000..2328d3957bc01dc7a4674d50fbfb1f2897cb3023 Binary files /dev/null and b/configs/singers/singer0042.npy differ diff --git a/configs/singers/singer0043.npy b/configs/singers/singer0043.npy new file mode 100644 index 0000000000000000000000000000000000000000..2fc910f5b352a309771c93ef624b21f1ae31faec Binary files /dev/null and b/configs/singers/singer0043.npy differ diff --git a/configs/singers/singer0044.npy b/configs/singers/singer0044.npy new file mode 100644 index 0000000000000000000000000000000000000000..a04bd7b3b332fa60295412d74f315980a5038605 Binary files /dev/null and b/configs/singers/singer0044.npy differ diff --git a/configs/singers/singer0045.npy b/configs/singers/singer0045.npy new file mode 100644 index 0000000000000000000000000000000000000000..9b9c68d476f817a91c77a19fff4ac39fe1546f52 Binary files /dev/null and b/configs/singers/singer0045.npy differ diff --git a/configs/singers/singer0046.npy b/configs/singers/singer0046.npy new file mode 100644 index 0000000000000000000000000000000000000000..6cc2fa3cc0d7e1e72e786b9aeca70453f0e8ac91 Binary files /dev/null and b/configs/singers/singer0046.npy differ diff --git a/configs/singers/singer0047.npy b/configs/singers/singer0047.npy new file mode 100644 index 0000000000000000000000000000000000000000..a8ba93de650a6662a3a196b44ce0119b401d7b73 Binary files /dev/null and b/configs/singers/singer0047.npy differ diff --git a/configs/singers/singer0048.npy b/configs/singers/singer0048.npy new file mode 100644 index 0000000000000000000000000000000000000000..61d3ce6de70f1a4ee6d3213e36f56bb3e2de895b Binary files /dev/null and b/configs/singers/singer0048.npy differ diff --git 
a/configs/singers/singer0049.npy b/configs/singers/singer0049.npy new file mode 100644 index 0000000000000000000000000000000000000000..a2309de54fc91207c89e97f1cd10ae16e27510ee Binary files /dev/null and b/configs/singers/singer0049.npy differ diff --git a/configs/singers/singer0050.npy b/configs/singers/singer0050.npy new file mode 100644 index 0000000000000000000000000000000000000000..5cb04f02132ef0f108dc4bcf14fb594c9da7b5d3 Binary files /dev/null and b/configs/singers/singer0050.npy differ diff --git a/configs/singers/singer0051.npy b/configs/singers/singer0051.npy new file mode 100644 index 0000000000000000000000000000000000000000..f0e38b5b768f7585361043bf74bcaf893ce796bd Binary files /dev/null and b/configs/singers/singer0051.npy differ diff --git a/configs/singers/singer0052.npy b/configs/singers/singer0052.npy new file mode 100644 index 0000000000000000000000000000000000000000..550813af6a14657c78fc0614b0bf8026a9050627 Binary files /dev/null and b/configs/singers/singer0052.npy differ diff --git a/configs/singers/singer0053.npy b/configs/singers/singer0053.npy new file mode 100644 index 0000000000000000000000000000000000000000..99aaf8f1c8c53005aa34458ceb3229d1ab804f59 Binary files /dev/null and b/configs/singers/singer0053.npy differ diff --git a/configs/singers/singer0054.npy b/configs/singers/singer0054.npy new file mode 100644 index 0000000000000000000000000000000000000000..2e720bf49c2ae8491f6f3452daf5fb6b2e67c59d Binary files /dev/null and b/configs/singers/singer0054.npy differ diff --git a/configs/singers/singer0055.npy b/configs/singers/singer0055.npy new file mode 100644 index 0000000000000000000000000000000000000000..108e226585935648d0e1c789f0e69007ddb97e88 Binary files /dev/null and b/configs/singers/singer0055.npy differ diff --git a/configs/singers/singer0056.npy b/configs/singers/singer0056.npy new file mode 100644 index 0000000000000000000000000000000000000000..ae8e652cd8047eab11df4158493ea49e4d7e3d4d Binary files /dev/null and 
b/configs/singers/singer0056.npy differ diff --git a/configs/train.yaml b/configs/train.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30e291e3acb417973559323027d13fe6c5e0387d --- /dev/null +++ b/configs/train.yaml @@ -0,0 +1,94 @@ +data: + filter_length: 1024 + hop_length: 320 + max_wav_value: 32768.0 + mel_channels: 100 + mel_fmax: 16000.0 + mel_fmin: 50.0 + sampling_rate: 32000 + segment_size: 8000 + training_files: files/train.txt + validation_files: files/valid.txt + win_length: 1024 +dist_config: + dist_backend: nccl + dist_url: tcp://localhost:54321 + world_size: 1 +epochs: '100' +gen: + resblock_dilation_sizes: + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + resblock_kernel_sizes: + - 3 + - 7 + - 11 + upsample_initial_channel: 320 + upsample_input: 192 + upsample_kernel_sizes: + - 15 + - 8 + - 8 + - 4 + - 4 + upsample_rates: + - 5 + - 4 + - 4 + - 2 + - 2 +log: + eval_interval: 1 + info_interval: 50 + keep_ckpts: 5 + log_dir: logs + num_audio: 6 + pth_dir: chkpt + save_interval: 5 +mpd: + kernel_size: 5 + lReLU_slope: 0.2 + periods: + - 2 + - 3 + - 5 + - 7 + - 11 + stride: 3 + use_spectral_norm: false +mrd: + lReLU_slope: 0.2 + resolutions: '[(1024, 120, 600), (2048, 240, 1200), (4096, 480, 2400), (512, 50, + 240)]' + use_spectral_norm: false +train: + batch_size: 6 + betas: + - 0.8 + - 0.99 + c_kl: 0.2 + c_mel: 1.0 + c_stft: 9 + epochs: 100 + eps: 1e-9 + learning_rate: 5.0e-05 + lr_decay: 0.999875 + model: sovits5.0 + port: 8001 + pretrain: vits_pretrain/sovits5.0.pretrain.pth + seed: 1234 +vits: + filter_channels: 640 + gin_channels: 256 + hidden_channels: 192 + inter_channels: 192 + ppg_dim: 1280 + spk_dim: 256 + vec_dim: 256 diff --git a/count_params.py b/count_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e8559a6a7187ab6628eb501a3f56a71374ed60ba --- /dev/null +++ b/count_params.py @@ -0,0 +1,21 @@ +import torch + +def analyze_all(): + print("--- F5-SVC ---") + stg1 = 
"/vol/bitbucket/hl3025/cfm_svc/chkpt_f5svc/stage1_epoch_50.pt" + stg2 = "/vol/bitbucket/hl3025/cfm_svc/chkpt_f5svc/stage2_obama.pt" + + ckpt1 = torch.load(stg1, map_location='cpu') + p_stg1 = sum(v.numel() for k, v in ckpt1['trainable'].items() if hasattr(v, 'numel')) + print(f"Stage 1 Trainable (LoRA): {p_stg1}") + + ckpt2 = torch.load(stg2, map_location='cpu') + p_stg2 = 0 + if 'stage2' in ckpt2: + p_stg2 = sum(v.numel() for k, v in ckpt2['stage2'].items() if hasattr(v, 'numel')) + + print(f"Stage 2 Trainable (LoRA): {p_stg2}") + print(f"Total F5-SVC LoRA parameters: {p_stg1 + p_stg2}") + +analyze_all() + diff --git a/crepe/LICENSE.txt b/crepe/LICENSE.txt new file mode 100755 index 0000000000000000000000000000000000000000..efc01ae87f6cc931d539ee9672a4e00aa583814c --- /dev/null +++ b/crepe/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Max Morrison + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/crepe/README.md b/crepe/README.md new file mode 100755 index 0000000000000000000000000000000000000000..296537c8aee47545f5600a6e7d84731d535e84d8 --- /dev/null +++ b/crepe/README.md @@ -0,0 +1,223 @@ +

torchcrepe

+
+ +[![PyPI](https://img.shields.io/pypi/v/torchcrepe.svg)](https://pypi.python.org/pypi/torchcrepe) +[![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) +[![Downloads](https://pepy.tech/badge/torchcrepe)](https://pepy.tech/project/torchcrepe) + +
+ +Pytorch implementation of the CREPE [1] pitch tracker. The original Tensorflow +implementation can be found [here](https://github.com/marl/crepe/). The +provided model weights were obtained by converting the "tiny" and "full" models +using [MMdnn](https://github.com/microsoft/MMdnn), an open-source model +management framework. + + +## Installation +Perform the system-dependent PyTorch install using the instructions found +[here](https://pytorch.org/). + +`pip install torchcrepe` + + +## Usage + +### Computing pitch and periodicity from audio + + +```python +import torchcrepe + + +# Load audio +audio, sr = torchcrepe.load.audio( ... ) + +# Here we'll use a 5 millisecond hop length +hop_length = int(sr / 200.) + +# Provide a sensible frequency range for your domain (upper limit is 2006 Hz) +# This would be a reasonable range for speech +fmin = 50 +fmax = 550 + +# Select a model capacity--one of "tiny" or "full" +model = 'tiny' + +# Choose a device to use for inference +device = 'cuda:0' + +# Pick a batch size that doesn't cause memory errors on your gpu +batch_size = 2048 + +# Compute pitch using first gpu +pitch = torchcrepe.predict(audio, + sr, + hop_length, + fmin, + fmax, + model, + batch_size=batch_size, + device=device) +``` + +A periodicity metric similar to the Crepe confidence score can also be +extracted by passing `return_periodicity=True` to `torchcrepe.predict`. + + +### Decoding + +By default, `torchcrepe` uses Viterbi decoding on the softmax of the network +output. This is different than the original implementation, which uses a +weighted average near the argmax of binary cross-entropy probabilities. +The argmax operation can cause double/half frequency errors. These can be +removed by penalizing large pitch jumps via Viterbi decoding. The `decode` +submodule provides some options for decoding. 
+ +```python +# Decode using viterbi decoding (default) +torchcrepe.predict(..., decoder=torchcrepe.decode.viterbi) + +# Decode using weighted argmax (as in the original implementation) +torchcrepe.predict(..., decoder=torchcrepe.decode.weighted_argmax) + +# Decode using argmax +torchcrepe.predict(..., decoder=torchcrepe.decode.argmax) +``` + + +### Filtering and thresholding + +When periodicity is low, the pitch is less reliable. For some problems, it +makes sense to mask these less reliable pitch values. However, the periodicity +can be noisy and the pitch has quantization artifacts. `torchcrepe` provides +submodules `filter` and `threshold` for this purpose. The filter and threshold +parameters should be tuned to your data. For clean speech, a 10-20 millisecond +window with a threshold of 0.21 has worked. + +```python +# We'll use a 15 millisecond window assuming a hop length of 5 milliseconds +win_length = 3 + +# Median filter noisy confidence value +periodicity = torchcrepe.filter.median(periodicity, win_length) + +# Remove inharmonic regions +pitch = torchcrepe.threshold.At(.21)(pitch, periodicity) + +# Optionally smooth pitch to remove quantization artifacts +pitch = torchcrepe.filter.mean(pitch, win_length) +``` + +For more fine-grained control over pitch thresholding, see +`torchcrepe.threshold.Hysteresis`. This is especially useful for removing +spurious voiced regions caused by noise in the periodicity values, but +has more parameters and may require more manual tuning to your data. + +CREPE was not trained on silent audio. Therefore, it sometimes assigns high +confidence to pitch bins in silent regions. You can use +`torchcrepe.threshold.Silence` to manually set the periodicity in silent +regions to zero. 
+ +```python +periodicity = torchcrepe.threshold.Silence(-60.)(periodicity, + audio, + sr, + hop_length) +``` + + +### Computing the CREPE model output activations + +```python +batch = next(torchcrepe.preprocess(audio, sr, hop_length)) +probabilities = torchcrepe.infer(batch) +``` + + +### Computing the CREPE embedding space + +As in Differentiable Digital Signal Processing [2], this uses the output of the +fifth max-pooling layer as a pretrained pitch embedding + +```python +embeddings = torchcrepe.embed(audio, sr, hop_length) +``` + +### Computing from files + +`torchcrepe` defines the following functions convenient for predicting +directly from audio files on disk. Each of these functions also takes +a `device` argument that can be used for device placement (e.g., +`device='cuda:0'`). + +```python +torchcrepe.predict_from_file(audio_file, ...) +torchcrepe.predict_from_file_to_file( + audio_file, output_pitch_file, output_periodicity_file, ...) +torchcrepe.predict_from_files_to_files( + audio_files, output_pitch_files, output_periodicity_files, ...) + +torchcrepe.embed_from_file(audio_file, ...) +torchcrepe.embed_from_file_to_file(audio_file, output_file, ...) +torchcrepe.embed_from_files_to_files(audio_files, output_files, ...) +``` + +### Command-line interface + +```bash +usage: python -m torchcrepe + [-h] + --audio_files AUDIO_FILES [AUDIO_FILES ...] + --output_files OUTPUT_FILES [OUTPUT_FILES ...] + [--hop_length HOP_LENGTH] + [--output_periodicity_files OUTPUT_PERIODICITY_FILES [OUTPUT_PERIODICITY_FILES ...]] + [--embed] + [--fmin FMIN] + [--fmax FMAX] + [--model MODEL] + [--decoder DECODER] + [--gpu GPU] + [--no_pad] + +optional arguments: + -h, --help show this help message and exit + --audio_files AUDIO_FILES [AUDIO_FILES ...] + The audio file to process + --output_files OUTPUT_FILES [OUTPUT_FILES ...] 
+ The file to save pitch or embedding + --hop_length HOP_LENGTH + The hop length of the analysis window + --output_periodicity_files OUTPUT_PERIODICITY_FILES [OUTPUT_PERIODICITY_FILES ...] + The file to save periodicity + --embed Performs embedding instead of pitch prediction + --fmin FMIN The minimum frequency allowed + --fmax FMAX The maximum frequency allowed + --model MODEL The model capacity. One of "tiny" or "full" + --decoder DECODER The decoder to use. One of "argmax", "viterbi", or + "weighted_argmax" + --gpu GPU The gpu to perform inference on + --no_pad Whether to pad the audio +``` + + +## Tests + +The module tests can be run as follows. + +```bash +pip install pytest +pytest +``` + + +## References +[1] J. W. Kim, J. Salamon, P. Li, and J. P. Bello, “Crepe: A +Convolutional Representation for Pitch Estimation,” in 2018 IEEE +International Conference on Acoustics, Speech and Signal +Processing (ICASSP). + +[2] J. H. Engel, L. Hantrakul, C. Gu, and A. Roberts, +“DDSP: Differentiable Digital Signal Processing,” in +2020 International Conference on Learning +Representations (ICLR). diff --git a/crepe/__init__.py b/crepe/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..f78e20d4a4a07cb7dfc37df643d96a34a4486ccd --- /dev/null +++ b/crepe/__init__.py @@ -0,0 +1,8 @@ +from . import decode +from .core import * +from .model import Crepe +from . import convert +from . import filter +from . import load +from . import loudness +from . 
import threshold diff --git a/crepe/__main__.py b/crepe/__main__.py new file mode 100755 index 0000000000000000000000000000000000000000..4d1a3120adea147778bc5829ae9b8037bed8efd0 --- /dev/null +++ b/crepe/__main__.py @@ -0,0 +1,148 @@ +import argparse +import os +import warnings + +import crepe + + +############################################################################### +# Entry point +############################################################################### + + +def parse_args(): + """Parse command-line arguments""" + parser = argparse.ArgumentParser() + + # Required arguments + parser.add_argument( + '--audio_files', + nargs='+', + required=True, + help='The audio file to process') + parser.add_argument( + '--output_files', + nargs='+', + required=True, + help='The file to save pitch or embedding') + parser.add_argument( + '--hop_length', + type=int, + help='The hop length of the analysis window') + + # Optionally save harmonicity [DEPRECATED] + parser.add_argument( + '--output_harmonicity_files', + nargs='+', + help='The file to save harmonicity') + # Optionally save periodicity + parser.add_argument( + '--output_periodicity_files', + nargs='+', + help='The files to save periodicity') + + # Optionally create embedding instead of pitch contour + parser.add_argument( + '--embed', + action='store_true', + help='Performs embedding instead of pitch prediction') + + # Optional arguments + parser.add_argument( + '--fmin', + default=50., + type=float, + help='The minimum frequency allowed') + parser.add_argument( + '--fmax', + default=crepe.MAX_FMAX, + type=float, + help='The maximum frequency allowed') + parser.add_argument( + '--model', + default='full', + help='The model capacity. One of "tiny" or "full"') + parser.add_argument( + '--decoder', + default='viterbi', + help='The decoder to use. 
One of "argmax", "viterbi", or ' + + '"weighted_argmax"') + parser.add_argument( + '--batch_size', + type=int, + help='The number of frames per batch') + parser.add_argument( + '--gpu', + type=int, + help='The gpu to perform inference on') + parser.add_argument( + '--no_pad', + action='store_true', + help='Whether to pad the audio') + + return parser.parse_args() + + +def make_parent_directory(file): + """Create parent directory for file if it does not already exist""" + parent = os.path.dirname(os.path.abspath(file)) + os.makedirs(parent, exist_ok=True) + + +def main(): + # Parse command-line arguments + args = parse_args() + + # Deprecate output_harmonicity_files + if args.output_harmonicity_files is not None: + message = ( + 'The crepe output_harmonicity_files argument is deprecated and ' + 'will be removed in a future release. Please use ' + 'output_periodicity_files. Rationale: if network confidence measured ' + 'harmonic content, the value would be low for non-harmonic, periodic ' + 'sounds (e.g., sine waves). 
But this is not observed.') + warnings.warn(message, DeprecationWarning) + args.output_periodicity_files = args.output_harmonicity_files + + # Ensure output directory exist + [make_parent_directory(file) for file in args.output_files] + if args.output_periodicity_files is not None: + [make_parent_directory(file) for file in args.output_periodicity_files] + + # Get inference device + device = 'cpu' if args.gpu is None else f'cuda:{args.gpu}' + + # Get decoder + if args.decoder == 'argmax': + decoder = crepe.decode.argmax + elif args.decoder == 'weighted_argmax': + decoder = crepe.decode.weighted_argmax + elif args.decoder == 'viterbi': + decoder = crepe.decode.viterbi + + # Infer pitch or embedding and save to disk + if args.embed: + crepe.embed_from_files_to_files(args.audio_files, + args.output_files, + args.hop_length, + args.model, + args.batch_size, + device, + not args.no_pad) + else: + crepe.predict_from_files_to_files(args.audio_files, + args.output_files, + None, + args.output_periodicity_files, + args.hop_length, + args.fmin, + args.fmax, + args.model, + decoder, + args.batch_size, + device, + not args.no_pad) + + +# Run module entry point +main() diff --git a/crepe/assets/.full.pth.cWG5V7 b/crepe/assets/.full.pth.cWG5V7 new file mode 100755 index 0000000000000000000000000000000000000000..063be278c6a76e18ec5093e5d0550a01be3299ad --- /dev/null +++ b/crepe/assets/.full.pth.cWG5V7 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c5372aabc3b837fabf3572e8522e1b8bd458b67ba078468ff3b87ee629b7a98 +size 9993654 diff --git a/crepe/convert.py b/crepe/convert.py new file mode 100755 index 0000000000000000000000000000000000000000..27ace1e111bb1c824894af50125c60a73af9bc20 --- /dev/null +++ b/crepe/convert.py @@ -0,0 +1,57 @@ +import scipy +import torch + +import crepe + + +############################################################################### +# Pitch unit conversions 
+############################################################################### + + +def bins_to_cents(bins): + """Converts pitch bins to cents""" + cents = crepe.CENTS_PER_BIN * bins + 1997.3794084376191 + + # Trade quantization error for noise + return dither(cents) + + +def bins_to_frequency(bins): + """Converts pitch bins to frequency in Hz""" + return cents_to_frequency(bins_to_cents(bins)) + + +def cents_to_bins(cents, quantize_fn=torch.floor): + """Converts cents to pitch bins""" + bins = (cents - 1997.3794084376191) / crepe.CENTS_PER_BIN + return quantize_fn(bins).int() + + +def cents_to_frequency(cents): + """Converts cents to frequency in Hz""" + return 10 * 2 ** (cents / 1200) + + +def frequency_to_bins(frequency, quantize_fn=torch.floor): + """Convert frequency in Hz to pitch bins""" + return cents_to_bins(frequency_to_cents(frequency), quantize_fn) + + +def frequency_to_cents(frequency): + """Convert frequency in Hz to cents""" + return 1200 * torch.log2(frequency / 10.) + + +############################################################################### +# Utilities +############################################################################### + + +def dither(cents): + """Dither the predicted pitch in cents to remove quantization error""" + noise = scipy.stats.triang.rvs(c=0.5, + loc=-crepe.CENTS_PER_BIN, + scale=2 * crepe.CENTS_PER_BIN, + size=cents.size()) + return cents + cents.new_tensor(noise) diff --git a/crepe/core.py b/crepe/core.py new file mode 100755 index 0000000000000000000000000000000000000000..fa7f0dd8e794ac3475a69cf7c80dc880a5f1598d --- /dev/null +++ b/crepe/core.py @@ -0,0 +1,738 @@ +import warnings + +import numpy as np +import resampy +import torch +import tqdm + +import crepe + + +__all__ = ['CENTS_PER_BIN', + 'MAX_FMAX', + 'PITCH_BINS', + 'SAMPLE_RATE', + 'WINDOW_SIZE', + 'UNVOICED', + 'embed', + 'embed_from_file', + 'embed_from_file_to_file', + 'embed_from_files_to_files', + 'infer', + 'predict', + 'predict_from_file', + 
'predict_from_file_to_file', + 'predict_from_files_to_files', + 'preprocess', + 'postprocess', + 'resample'] + + +############################################################################### +# Constants +############################################################################### + + +CENTS_PER_BIN = 20 # cents +MAX_FMAX = 2006. # hz +PITCH_BINS = 360 +SAMPLE_RATE = 16000 # hz +WINDOW_SIZE = 1024 # samples +UNVOICED = np.nan + + +############################################################################### +# Crepe pitch prediction +############################################################################### + + +def predict(audio, + sample_rate, + hop_length=None, + fmin=50., + fmax=MAX_FMAX, + model='full', + decoder=crepe.decode.viterbi, + return_harmonicity=False, + return_periodicity=False, + batch_size=None, + device='cpu', + pad=True): + """Performs pitch estimation + + Arguments + audio (torch.tensor [shape=(1, time)]) + The audio signal + sample_rate (int) + The sampling rate in Hz + hop_length (int) + The hop_length in samples + fmin (float) + The minimum allowable frequency in Hz + fmax (float) + The maximum allowable frequency in Hz + model (string) + The model capacity. One of 'full' or 'tiny'. + decoder (function) + The decoder to use. See decode.py for decoders. + return_harmonicity (bool) [DEPRECATED] + Whether to also return the network confidence + return_periodicity (bool) + Whether to also return the network confidence + batch_size (int) + The number of frames per batch + device (string) + The device used to run inference + pad (bool) + Whether to zero-pad the audio + + Returns + pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))]) + (Optional) periodicity (torch.tensor + [shape=(1, 1 + int(time // hop_length))]) + """ + # Deprecate return_harmonicity + if return_harmonicity: + message = ( + 'The crepe return_harmonicity argument is deprecated and ' + 'will be removed in a future release. Please use ' + 'return_periodicity. 
Rationale: if network confidence measured ' + 'harmonics, the value would be low for non-harmonic, periodic ' + 'sounds (e.g., sine waves). But this is not observed.') + warnings.warn(message, DeprecationWarning) + return_periodicity = return_harmonicity + + results = [] + + # Postprocessing breaks gradients, so just don't compute them + with torch.no_grad(): + + # Preprocess audio + generator = preprocess(audio, + sample_rate, + hop_length, + batch_size, + device, + pad) + for frames in generator: + + # Infer independent probabilities for each pitch bin + probabilities = infer(frames, model) + + # shape=(batch, 360, time / hop_length) + probabilities = probabilities.reshape( + audio.size(0), -1, PITCH_BINS).transpose(1, 2) + + # Convert probabilities to F0 and periodicity + result = postprocess(probabilities, + fmin, + fmax, + decoder, + return_harmonicity, + return_periodicity) + + # Place on same device as audio to allow very long inputs + if isinstance(result, tuple): + result = (result[0].to(audio.device), + result[1].to(audio.device)) + else: + result = result.to(audio.device) + + results.append(result) + + # Split pitch and periodicity + if return_periodicity: + pitch, periodicity = zip(*results) + return torch.cat(pitch, 1), torch.cat(periodicity, 1) + + # Concatenate + return torch.cat(results, 1) + + +def predict_from_file(audio_file, + hop_length=None, + fmin=50., + fmax=MAX_FMAX, + model='full', + decoder=crepe.decode.viterbi, + return_harmonicity=False, + return_periodicity=False, + batch_size=None, + device='cpu', + pad=True): + """Performs pitch estimation from file on disk + + Arguments + audio_file (string) + The file to perform pitch tracking on + hop_length (int) + The hop_length in samples + fmin (float) + The minimum allowable frequency in Hz + fmax (float) + The maximum allowable frequency in Hz + model (string) + The model capacity. One of 'full' or 'tiny'. + decoder (function) + The decoder to use. See decode.py for decoders. 
+ return_harmonicity (bool) [DEPRECATED] + Whether to also return the network confidence + return_periodicity (bool) + Whether to also return the network confidence + batch_size (int) + The number of frames per batch + device (string) + The device used to run inference + pad (bool) + Whether to zero-pad the audio + + Returns + pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))]) + (Optional) periodicity (torch.tensor + [shape=(1, 1 + int(time // hop_length))]) + """ + # Load audio + audio, sample_rate = crepe.load.audio(audio_file) + + # Predict + return predict(audio, + sample_rate, + hop_length, + fmin, + fmax, + model, + decoder, + return_harmonicity, + return_periodicity, + batch_size, + device, + pad) + + +def predict_from_file_to_file(audio_file, + output_pitch_file, + output_harmonicity_file=None, + output_periodicity_file=None, + hop_length=None, + fmin=50., + fmax=MAX_FMAX, + model='full', + decoder=crepe.decode.viterbi, + batch_size=None, + device='cpu', + pad=True): + """Performs pitch estimation from file on disk + + Arguments + audio_file (string) + The file to perform pitch tracking on + output_pitch_file (string) + The file to save predicted pitch + output_harmonicity_file (string or None) [DEPRECATED] + The file to save predicted harmonicity + output_periodicity_file (string or None) + The file to save predicted periodicity + hop_length (int) + The hop_length in samples + fmin (float) + The minimum allowable frequency in Hz + fmax (float) + The maximum allowable frequency in Hz + model (string) + The model capacity. One of 'full' or 'tiny'. + decoder (function) + The decoder to use. See decode.py for decoders. 
+ batch_size (int) + The number of frames per batch + device (string) + The device used to run inference + pad (bool) + Whether to zero-pad the audio + """ + # Deprecate output_harmonicity_file + if output_harmonicity_file is not None: + message = ( + 'The crepe output_harmonicity_file argument is deprecated and ' + 'will be removed in a future release. Please use ' + 'output_periodicity_file. Rationale: if network confidence measured ' + 'harmonic content, the value would be low for non-harmonic, periodic ' + 'sounds (e.g., sine waves). But this is not observed.') + warnings.warn(message, DeprecationWarning) + output_periodicity_file = output_harmonicity_file + + # Predict from file + prediction = predict_from_file(audio_file, + hop_length, + fmin, + fmax, + model, + decoder, + False, + output_periodicity_file is not None, + batch_size, + device, + pad) + + # Save to disk + if output_periodicity_file is not None: + torch.save(prediction[0].detach(), output_pitch_file) + torch.save(prediction[1].detach(), output_periodicity_file) + else: + torch.save(prediction.detach(), output_pitch_file) + + +def predict_from_files_to_files(audio_files, + output_pitch_files, + output_harmonicity_files=None, + output_periodicity_files=None, + hop_length=None, + fmin=50., + fmax=MAX_FMAX, + model='full', + decoder=crepe.decode.viterbi, + batch_size=None, + device='cpu', + pad=True): + """Performs pitch estimation from files on disk without reloading model + + Arguments + audio_files (list[string]) + The files to perform pitch tracking on + output_pitch_files (list[string]) + The files to save predicted pitch + output_harmonicity_files (list[string] or None) [DEPRECATED] + The files to save predicted harmonicity + output_periodicity_files (list[string] or None) + The files to save predicted periodicity + hop_length (int) + The hop_length in samples + fmin (float) + The minimum allowable frequency in Hz + fmax (float) + The maximum allowable frequency in Hz + model (string) + The 
model capacity. One of 'full' or 'tiny'. + decoder (function) + The decoder to use. See decode.py for decoders. + batch_size (int) + The number of frames per batch + device (string) + The device used to run inference + pad (bool) + Whether to zero-pad the audio + """ + # Deprecate output_harmonicity_files + if output_harmonicity_files is not None: + message = ( + 'The crepe output_harmonicity_files argument is deprecated and ' + 'will be removed in a future release. Please use ' + 'output_periodicity_files. Rationale: if network confidence measured ' + 'harmonic content, the value would be low for non-harmonic, periodic ' + 'sounds (e.g., sine waves). But this is not observed.') + warnings.warn(message, DeprecationWarning) + output_periodicity_files = output_harmonicity_files + + if output_periodicity_files is None: + output_periodicity_files = len(audio_files) * [None] + + # Setup iterator + iterator = zip(audio_files, output_pitch_files, output_periodicity_files) + iterator = tqdm.tqdm(iterator, desc='crepe', dynamic_ncols=True) + for audio_file, output_pitch_file, output_periodicity_file in iterator: + + # Predict a file + predict_from_file_to_file(audio_file, + output_pitch_file, + None, + output_periodicity_file, + hop_length, + fmin, + fmax, + model, + decoder, + batch_size, + device, + pad) + +############################################################################### +# Crepe pitch embedding +############################################################################### + + +def embed(audio, + sample_rate, + hop_length=None, + model='full', + batch_size=None, + device='cpu', + pad=True): + """Embeds audio to the output of CREPE's fifth maxpool layer + + Arguments + audio (torch.tensor [shape=(1, time)]) + The audio signals + sample_rate (int) + The sampling rate in Hz + hop_length (int) + The hop_length in samples + model (string) + The model capacity. One of 'full' or 'tiny'. 
def embed_from_file(audio_file,
                    hop_length=None,
                    model='full',
                    batch_size=None,
                    device='cpu',
                    pad=True):
    """Embeds audio from disk to the output of CREPE's fifth maxpool layer

    Arguments
        audio_file (string)
            The wav file containing the audio to embed
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        embedding (torch.tensor [shape=(1,
                                        1 + int(time // hop_length), 32, -1)])
    """
    # Read the waveform and its native sample rate from disk
    waveform, sample_rate = crepe.load.audio(audio_file)

    # Delegate to the in-memory embedding routine
    return embed(waveform,
                 sample_rate,
                 hop_length=hop_length,
                 model=model,
                 batch_size=batch_size,
                 device=device,
                 pad=pad)
def embed_from_files_to_files(audio_files,
                              output_files,
                              hop_length=None,
                              model='full',
                              batch_size=None,
                              device='cpu',
                              pad=True):
    """Embeds audio from disk and saves to disk without reloading model

    Arguments
        audio_files (list[string])
            The wav files containing the audio to embed
        output_files (list[string])
            The files to save the embeddings
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio
    """
    # Iterate paired inputs/outputs with a progress bar; the model stays
    # cached between calls, so only the first file pays the load cost
    pairs = tqdm.tqdm(zip(audio_files, output_files),
                      desc='crepe',
                      dynamic_ncols=True)
    for source, destination in pairs:
        embed_from_file_to_file(source,
                                destination,
                                hop_length,
                                model,
                                batch_size,
                                device,
                                pad)
def postprocess(probabilities,
                fmin=0.,
                fmax=MAX_FMAX,
                decoder=crepe.decode.viterbi,
                return_harmonicity=False,
                return_periodicity=False):
    """Convert model output to F0 and periodicity

    Arguments
        probabilities (torch.tensor [shape=(1, 360, time / hop_length)])
            The probabilities for each pitch bin inferred by the network
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        periodicity (torch.tensor [shape=(1, 1 + int(time // hop_length))])
    """
    # Sampling is non-differentiable, so remove from graph
    # NOTE(review): detach() returns a view sharing storage with the input,
    # so the -inf masking below also writes into the caller's tensor —
    # confirm callers do not reuse `probabilities` after this call.
    probabilities = probabilities.detach()

    # Convert frequency range to pitch bin range
    minidx = crepe.convert.frequency_to_bins(torch.tensor(fmin))
    maxidx = crepe.convert.frequency_to_bins(torch.tensor(fmax),
                                             torch.ceil)

    # Remove frequencies outside of allowable range
    probabilities[:, :minidx] = -float('inf')
    probabilities[:, maxidx:] = -float('inf')

    # Perform argmax or viterbi sampling
    bins, pitch = decoder(probabilities)

    # Deprecate return_harmonicity
    if return_harmonicity:
        message = (
            'The crepe return_harmonicity argument is deprecated and '
            'will be removed in a future release. Please use '
            'return_periodicity. Rationale: if network confidence measured '
            'harmonics, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        return_periodicity = return_harmonicity

    if not return_periodicity:
        return pitch

    # Compute periodicity from probabilities and decoded pitch bins
    return pitch, periodicity(probabilities, bins)
def periodicity(probabilities, bins):
    """Computes the periodicity from the network output and pitch bins"""
    batch, _, frames = probabilities.shape

    # Flatten to one row per (batch, frame) pair
    # shape=(batch * time / hop_length, 360)
    flat_probs = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)

    # Column index of the decoded bin for every row
    # shape=(batch * time / hop_length, 1)
    flat_bins = bins.reshape(-1, 1).to(torch.int64)

    # The probability at the decoded bin is the periodicity estimate
    picked = flat_probs.gather(1, flat_bins)

    # shape=(batch, time / hop_length)
    return picked.reshape(batch, frames)


def resample(audio, sample_rate):
    """Resample audio"""
    # Remember where the result should live
    target_device = audio.device

    # resampy operates on numpy arrays; it is used here so the numbers
    # match the original Crepe implementation
    mono = audio.detach().cpu().numpy().squeeze(0)
    resampled = resampy.resample(mono, sample_rate, SAMPLE_RATE)

    # Restore the leading channel dimension and the original device
    return torch.tensor(resampled, device=target_device).unsqueeze(0)
def argmax(logits):
    """Sample observations by taking the argmax

    Arguments
        logits (torch.tensor [shape=(batch, 360, time / hop_length)])
            Per-bin network output

    Returns
        bins, pitch — decoded bin indices and frequency in Hz
    """
    bins = logits.argmax(dim=1)

    # Convert to frequency in Hz
    return bins, crepe.convert.bins_to_frequency(bins)


def weighted_argmax(logits):
    """Sample observations using weighted sum near the argmax

    Arguments
        logits (torch.tensor [shape=(batch, 360, time / hop_length)])
            Per-bin network output. NOTE: masked in-place, as before.

    Returns
        bins, pitch — argmax bin indices and locally-weighted frequency in Hz
    """
    # Find center of analysis window
    bins = logits.argmax(dim=1)

    # Find bounds of analysis window: [bins - 4, bins + 5)
    start = torch.max(torch.tensor(0, device=logits.device), bins - 4)
    end = torch.min(torch.tensor(logits.size(1), device=logits.device), bins + 5)

    # Mask out everything outside of the window.
    # Vectorized replacement for the original per-(batch, time) Python loop:
    # a pitch bin survives only when start <= index < end for that frame.
    pitch_idx = torch.arange(logits.size(1), device=logits.device)
    pitch_idx = pitch_idx.view(1, -1, 1)
    outside = (pitch_idx < start.unsqueeze(1)) | (pitch_idx >= end.unsqueeze(1))
    logits[outside] = -float('inf')

    # Construct weights (cached on the function across calls)
    if not hasattr(weighted_argmax, 'weights'):
        weights = crepe.convert.bins_to_cents(torch.arange(360))
        weighted_argmax.weights = weights[None, :, None]

    # Ensure devices are the same (no-op if they are)
    weighted_argmax.weights = weighted_argmax.weights.to(logits.device)

    # Convert to probabilities
    with torch.no_grad():
        probs = torch.sigmoid(logits)

    # Apply weights: probability-weighted average of cents near the argmax
    cents = (weighted_argmax.weights * probs).sum(dim=1) / probs.sum(dim=1)

    # Convert to frequency in Hz
    return bins, crepe.convert.cents_to_frequency(cents)
def mean(signals, win_length=9):
    """Average filtering for signals containing nan values

    Arguments
        signals (torch.tensor (shape=(batch, time)))
            The signals to filter
        win_length
            The size of the analysis window

    Returns
        filtered (torch.tensor (shape=(batch, time)))
    """
    assert signals.dim() == 2, "Input tensor must have 2 dimensions (batch_size, width)"
    signals = signals.unsqueeze(1)

    # Treat NaNs as zeros so they contribute nothing to the window sums
    valid = ~torch.isnan(signals)
    zero_filled = torch.where(valid, signals, torch.zeros_like(signals))

    # A box kernel turns conv1d into windowed summation
    box = torch.ones(signals.size(1), 1, win_length, device=signals.device)

    # Windowed sums of the values and of the valid-sample counts
    window_sums = F.conv1d(zero_filled,
                           box,
                           stride=1,
                           padding=win_length // 2)
    window_counts = F.conv1d(valid.float(),
                             box,
                             stride=1,
                             padding=win_length // 2)
    window_counts = window_counts.clamp(min=1)  # avoid division by zero

    averaged = window_sums / window_counts

    # Windows averaging exactly zero are reported as NaN
    averaged[averaged == 0] = float("nan")

    return averaged.squeeze(1)


def median(signals, win_length):
    """Median filtering for signals containing nan values

    Arguments
        signals (torch.tensor (shape=(batch, time)))
            The signals to filter
        win_length
            The size of the analysis window

    Returns
        filtered (torch.tensor (shape=(batch, time)))
    """
    assert signals.dim() == 2, "Input tensor must have 2 dimensions (batch_size, width)"
    signals = signals.unsqueeze(1)

    valid = ~torch.isnan(signals)
    zero_filled = torch.where(valid, signals, torch.zeros_like(signals))
    half = win_length // 2

    # Reflect-pad the data; pad the validity mask with zeros (invalid)
    padded = F.pad(zero_filled, (half, half), mode="reflect")
    padded_mask = F.pad(valid.float(), (half, half), mode="constant", value=0)

    # Sliding windows of length win_length at stride 1
    windows = padded.unfold(2, win_length, 1)
    window_mask = padded_mask.unfold(2, win_length, 1)
    windows = windows.contiguous().view(windows.size()[:3] + (-1,))
    window_mask = window_mask.contiguous().view(window_mask.size()[:3] + (-1,))

    # Send invalid entries to +inf so they sort past every real value
    inf_masked = torch.where(window_mask.bool(),
                             windows.double(),
                             float("inf")).to(windows)

    # Sort each window; valid entries end up first
    ordered, _ = torch.sort(inf_masked, dim=-1)

    # Index of the median among the valid entries of each window
    valid_count = window_mask.sum(dim=-1)
    middle = ((valid_count - 1) // 2).clamp(min=0)

    medians = ordered.gather(-1, middle.unsqueeze(-1).long()).squeeze(-1)

    # Windows with no valid entries stayed +inf; report them as NaN
    medians[torch.isinf(medians)] = float("nan")

    return medians.squeeze(1)
def nanmean(signals):
    """Computes the per-signal mean, ignoring nans

    Arguments
        signals (torch.tensor [shape=(batch, time)])
            The signals to average

    Returns
        means (torch.tensor [shape=(batch,)])
    """
    nan_mask = torch.isnan(signals)

    # Zero out NaNs so they add nothing to the numerator
    zeroed = torch.where(nan_mask, torch.zeros_like(signals), signals)

    # Average over the valid entries only
    return zeroed.sum(dim=1) / (~nan_mask).float().sum(dim=1)


def nanmedian(signals):
    """Computes the per-signal median, ignoring nans

    Arguments
        signals (torch.tensor [shape=(batch, time)])
            The signals to reduce

    Returns
        medians (torch.tensor [shape=(batch,)])
    """
    per_row = []
    for row in signals:
        # Drop the NaN entries before taking the median of each slice
        per_row.append(nanmedian1d(row[~torch.isnan(row)]))

    # Stack the per-row results into one tensor
    return torch.tensor(per_row, dtype=signals.dtype, device=signals.device)
def audio(filename):
    """Load audio from disk

    Arguments
        filename (string)
            Path to the wav file to read

    Returns
        audio (torch.tensor [shape=(1, time)]), sample_rate (int)
    """
    sample_rate, samples = wavfile.read(filename)

    # Normalize 16-bit PCM to floats in [-1, 1]
    if samples.dtype == np.int16:
        samples = samples.astype(np.float32) / np.iinfo(np.int16).max

    # PyTorch is not compatible with non-writeable arrays, so we make a copy
    return torch.tensor(np.copy(samples))[None], sample_rate


def model(device, capacity='full'):
    """Preloads model from disk

    Arguments
        device (string or torch.device)
            Where to place the loaded model
        capacity (string)
            The model capacity. One of 'full' or 'tiny'.
    """
    # Cache the model and its capacity on the inference entry point
    crepe.infer.capacity = capacity
    crepe.infer.model = crepe.Crepe(capacity)

    # Restore pretrained weights stored alongside the package
    checkpoint = os.path.join(os.path.dirname(__file__),
                              'assets',
                              f'{capacity}.pth')
    crepe.infer.model.load_state_dict(
        torch.load(checkpoint, map_location=device))

    # Move to the requested device and switch to inference mode
    crepe.infer.model = crepe.infer.model.to(torch.device(device))
    crepe.infer.model.eval()
def a_weighted(audio, sample_rate, hop_length=None, pad=True):
    """Retrieve the per-frame A-weighted loudness

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signal
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int or None)
            The hop length in samples; defaults to 10 ms
        pad (bool)
            Whether to center frames (zero padding at the edges)

    Returns
        loudness (torch.tensor [shape=(1, frames)])
    """
    # The result is returned on the input's device
    out_device = audio.device

    # Default hop length of 10 ms
    if hop_length is None:
        hop_length = sample_rate // 100

    # librosa works on numpy; drop the channel dimension
    signal = audio.detach().cpu().numpy().squeeze(0)

    # Match Crepe's native sample rate, rescaling the hop accordingly
    if sample_rate != crepe.SAMPLE_RATE:
        signal = resampy.resample(signal, sample_rate, crepe.SAMPLE_RATE)
        hop_length = int(hop_length * crepe.SAMPLE_RATE / sample_rate)

    # The perceptual weights depend only on constants; compute them once
    if not hasattr(a_weighted, 'weights'):
        a_weighted.weights = perceptual_weights()

    # Magnitude spectrogram on a dB scale
    spectrum = librosa.stft(signal,
                            n_fft=crepe.WINDOW_SIZE,
                            hop_length=hop_length,
                            win_length=crepe.WINDOW_SIZE,
                            center=pad,
                            pad_mode='constant')
    db = librosa.amplitude_to_db(np.abs(spectrum))

    # Apply the A-weighting curve and clamp inaudible bins to the floor
    weighted = np.maximum(db + a_weighted.weights, MIN_DB)

    # Average across frequency, restore the batch dim and device
    return torch.from_numpy(weighted.mean(axis=0)).float().to(out_device)[None]
class Crepe(torch.nn.Module):
    """Crepe model definition

    Six-layer CNN that maps 1024-sample audio frames to per-pitch-bin
    probabilities. The attribute names (conv1, conv1_BN, ..., classifier)
    must match the keys of the pretrained checkpoint's state_dict, so do
    not rename them.
    """

    def __init__(self, model='full'):
        # model: capacity preset, 'full' or 'tiny'
        super().__init__()

        # Model-specific layer parameters
        if model == 'full':
            in_channels = [1, 1024, 128, 128, 128, 256]
            out_channels = [1024, 128, 128, 128, 256, 512]
            self.in_features = 2048
        elif model == 'tiny':
            in_channels = [1, 128, 16, 16, 16, 32]
            out_channels = [128, 16, 16, 16, 32, 64]
            self.in_features = 256
        else:
            raise ValueError(f'Model {model} is not supported')

        # Shared layer parameters: the first conv uses a 512-sample kernel
        # with stride 4, the remaining five use 64-sample kernels at stride 1
        kernel_sizes = [(512, 1)] + 5 * [(64, 1)]
        strides = [(4, 1)] + 5 * [(1, 1)]

        # Overload with eps and momentum conversion given by MMdnn
        batch_norm_fn = functools.partial(torch.nn.BatchNorm2d,
                                          eps=0.0010000000474974513,
                                          momentum=0.0)

        # Layer definitions
        self.conv1 = torch.nn.Conv2d(
            in_channels=in_channels[0],
            out_channels=out_channels[0],
            kernel_size=kernel_sizes[0],
            stride=strides[0])
        self.conv1_BN = batch_norm_fn(
            num_features=out_channels[0])

        self.conv2 = torch.nn.Conv2d(
            in_channels=in_channels[1],
            out_channels=out_channels[1],
            kernel_size=kernel_sizes[1],
            stride=strides[1])
        self.conv2_BN = batch_norm_fn(
            num_features=out_channels[1])

        self.conv3 = torch.nn.Conv2d(
            in_channels=in_channels[2],
            out_channels=out_channels[2],
            kernel_size=kernel_sizes[2],
            stride=strides[2])
        self.conv3_BN = batch_norm_fn(
            num_features=out_channels[2])

        self.conv4 = torch.nn.Conv2d(
            in_channels=in_channels[3],
            out_channels=out_channels[3],
            kernel_size=kernel_sizes[3],
            stride=strides[3])
        self.conv4_BN = batch_norm_fn(
            num_features=out_channels[3])

        self.conv5 = torch.nn.Conv2d(
            in_channels=in_channels[4],
            out_channels=out_channels[4],
            kernel_size=kernel_sizes[4],
            stride=strides[4])
        self.conv5_BN = batch_norm_fn(
            num_features=out_channels[4])

        self.conv6 = torch.nn.Conv2d(
            in_channels=in_channels[5],
            out_channels=out_channels[5],
            kernel_size=kernel_sizes[5],
            stride=strides[5])
        self.conv6_BN = batch_norm_fn(
            num_features=out_channels[5])

        self.classifier = torch.nn.Linear(
            in_features=self.in_features,
            out_features=crepe.PITCH_BINS)

    def forward(self, x, embed=False):
        """Run the network; with embed=True, stop at the fifth-layer output.

        Arguments
            x (torch.tensor [shape=(frames, 1024)])
                Batched 1024-sample input frames
            embed (bool)
                Return the intermediate embedding instead of probabilities
        """
        # Forward pass through first five layers
        x = self.embed(x)

        if embed:
            return x

        # Forward pass through layer six
        x = self.layer(x, self.conv6, self.conv6_BN)

        # shape=(batch, self.in_features)
        x = x.permute(0, 2, 1, 3).reshape(-1, self.in_features)

        # Compute logits
        # (sigmoid makes these per-bin probabilities rather than raw logits)
        return torch.sigmoid(self.classifier(x))

    ###########################################################################
    # Forward pass utilities
    ###########################################################################

    def embed(self, x):
        """Map input audio to pitch embedding"""
        # shape=(batch, 1, 1024, 1)
        x = x[:, None, :, None]

        # Forward pass through first five layers; the first layer uses
        # symmetric 254-sample padding to offset its long 512-sample kernel
        x = self.layer(x, self.conv1, self.conv1_BN, (0, 0, 254, 254))
        x = self.layer(x, self.conv2, self.conv2_BN)
        x = self.layer(x, self.conv3, self.conv3_BN)
        x = self.layer(x, self.conv4, self.conv4_BN)
        x = self.layer(x, self.conv5, self.conv5_BN)

        return x

    def layer(self, x, conv, batch_norm, padding=(0, 0, 31, 32)):
        """Forward pass through one layer: pad -> conv -> relu -> BN -> pool.

        NOTE: the activation precedes batch norm here; this ordering matches
        the converted pretrained weights, so keep it as-is.
        """
        x = F.pad(x, padding)
        x = conv(x)
        x = F.relu(x)
        x = batch_norm(x)
        return F.max_pool2d(x, (2, 1), (2, 1))
voiced transition + if periodicity[i] < threshold[i] and \ + periodicity[i + 1] > threshold[i + 1]: + + # Grow region until next unvoiced or end of array + start, end, keep = i + 1, i + 1, False + while end < len(periodicity) and \ + periodicity[end] > threshold[end]: + if periodicity[end] > self.upper_bound: + keep = True + end += 1 + + # Force unvoiced if we didn't pass the confidence required by + # the hysteresis + if not keep: + threshold[start:end] = 1 + + i = end + + else: + i += 1 + + # Remove pitch with low periodicity + pitch[periodicity < threshold] = crepe.UNVOICED + + # Unwhiten + pitch = pitch * std + mean + + # Convert to Hz + pitch = torch.tensor(2 ** pitch, device=device)[None, :] + + # Optionally return threshold + if self.return_threshold: + return pitch, torch.tensor(threshold, device=device) + + return pitch + + +############################################################################### +# Periodicity thresholding methods +############################################################################### + + +class Silence: + """Set periodicity to zero in silent regions""" + + def __init__(self, value=-60): + self.value = value + + def __call__(self, + periodicity, + audio, + sample_rate=crepe.SAMPLE_RATE, + hop_length=None, + pad=True): + # Don't modify in-place + periodicity = torch.clone(periodicity) + + # Compute loudness + loudness = crepe.loudness.a_weighted( + audio, sample_rate, hop_length, pad) + + # Threshold silence + periodicity[loudness < self.value] = 0. 
class CodecExtractor:
    """Frozen DAC codec wrapper used to precompute continuous latent targets."""

    def __init__(self, device='cuda'):
        # device: where the codec runs ('cuda', 'mps', or 'cpu')
        self.device = device
        print(f"Loading DAC model on {device}...")
        self.codec = dac.utils.load_model(tag="latest", model_type="44khz").to(self.device).eval()
        # Freeze every parameter: targets must not receive gradients
        for param in self.codec.parameters():
            param.requires_grad = False
        print("Initialized Frozen Codec")

    @torch.no_grad()
    def extract_targets(self, wav_tensor, sample_rate):
        """
        Runs the waveform through the frozen codec encoder to get the quantized continuous vectors `z_target`.
        """
        # Local import keeps torchaudio optional until extraction is used
        from torchaudio.functional import resample
        # DAC 44khz model requires exactly 44100 Hz
        if sample_rate != 44100:
            wav_tensor = resample(wav_tensor, sample_rate, 44100)

        wav_tensor = self.codec.preprocess(wav_tensor, 44100)

        # 'encode' returns z (continuous), codes (discrete), latents, _, _
        z, _, _, _, _ = self.codec.encode(wav_tensor)
        return z


def process_corpus(wav_dir, out_dir, device='cuda'):
    """Extract codec latents for every wav under wav_dir into out_dir.

    The output layout mirrors the input: <out_dir>/<speaker>/<file>_ztarget.pt,
    where the speaker name is taken from each wav's parent directory.
    """
    extractor = CodecExtractor(device=device)
    os.makedirs(out_dir, exist_ok=True)

    wav_paths = glob.glob(os.path.join(wav_dir, "**/*.wav"), recursive=True)
    print(f"Found {len(wav_paths)} wav files.")

    for wav_path in wav_paths:
        try:
            wav_data, sr = sf.read(wav_path)
            # Ensure shape is (1, 1, T)
            if len(wav_data.shape) > 1:
                wav_data = wav_data[:, 0]  # take first channel
            wav_tensor = torch.from_numpy(wav_data).unsqueeze(0).unsqueeze(0).float().to(device)

            z_target = extractor.extract_targets(wav_tensor, sample_rate=sr)

            # Mirror the <speaker>/<file> structure of the input corpus
            file_id = Path(wav_path).stem
            speaker_dir = Path(wav_path).parent.name
            out_spk_dir = os.path.join(out_dir, speaker_dir)
            os.makedirs(out_spk_dir, exist_ok=True)

            out_path = os.path.join(out_spk_dir, f"{file_id}_ztarget.pt")
            torch.save(z_target.cpu(), out_path)
            print(f"Saved extracted target for {speaker_dir}/{file_id}: shape {z_target.shape}")
        except Exception as e:
            # Best-effort batch job: report the failure and keep going
            print(f"Skipping {wav_path} due to error: {e}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-w", "--wav_dir", default="./data_svc/waves-32k")
    parser.add_argument("-o", "--out_dir", default="./data_svc/codec_targets")
    args = parser.parse_args()

    # Prefer CUDA, then Apple MPS, then CPU
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    process_corpus(args.wav_dir, args.out_dir, device)
    print("Offline processing complete.")
"singer_0029", + "WomanRaw_42": "singer_0030", + "WomanRaw_41": "singer_0031", + "WomanRaw_43": "singer_0032", + "WomanRaw_47": "singer_0033", + "WomanRaw_35": "singer_0034", + "WomanRaw_38": "singer_0035", + "WomanRaw_19": "singer_0036", + "WomanRaw_22": "singer_0037", + "WomanRaw_26": "singer_0038", + "WomanRaw_36": "singer_0039", + "WomanRaw_9": "singer_0040", + "WomanRaw_18": "singer_0041", + "WomanRaw_44": "singer_0042", + "WomanRaw_34": "singer_0043", + "WomanRaw_2": "singer_0044", + "WomanRaw_14": "singer_0045", + "WomanRaw_40": "singer_0046", + "WomanRaw_27": "singer_0047", + "WomanRaw_45": "singer_0048", + "WomanRaw_32": "singer_0049", + "WomanRaw_21": "singer_0050", + "WomanRaw_25": "singer_0051", + "WomanRaw_23": "singer_0052", + "WomanRaw_3": "singer_0053", + "WomanRaw_11": "singer_0054", + "WomanRaw_31": "singer_0055", + "WomanRaw_39": "singer_0056", + "WomanRaw_1": "singer_0057", + "WomanRaw_16": "singer_0058", + "WomanRaw_5": "singer_0059", + "WomanRaw_30": "singer_0060", + "WomanRaw_28": "singer_0061", + "WomanRaw_13": "singer_0062", + "WomanRaw_17": "singer_0063", + "WomanRaw_0": "singer_0064", + "WomanRaw_33": "singer_0065", + "WomanRaw_8": "singer_0066", + "WomanRaw_12": "singer_0067", + "WomanRaw_15": "singer_0068", + "WomanRaw_29": "singer_0069", + "WomanRaw_7": "singer_0070", + "WomanRaw_10": "singer_0071", + "WomanRaw_6": "singer_0072", + "WomanRaw_20": "singer_0073", + "WomanRaw_4": "singer_0074", + "WomanRaw_46": "singer_0075", + "WomanRaw_37": "singer_0076" +} \ No newline at end of file diff --git a/diagram/architecture.aux b/diagram/architecture.aux new file mode 100644 index 0000000000000000000000000000000000000000..b64012178f9cbc148dca209ceaf227aee4833711 --- /dev/null +++ b/diagram/architecture.aux @@ -0,0 +1,2 @@ +\relax +\gdef \@abspage@last{1} diff --git a/diagram/architecture.fdb_latexmk b/diagram/architecture.fdb_latexmk new file mode 100644 index 0000000000000000000000000000000000000000..940e7f0cf2cb5d750eecb7ad8a02d44301d63164 
--- /dev/null +++ b/diagram/architecture.fdb_latexmk @@ -0,0 +1,127 @@ +# Fdb version 4 +["pdflatex"] 1772299688.64924 "/Users/hongyuli/genai/svc/matcha_svc/diagram/architecture.tex" "architecture.pdf" "architecture" 1772299689.11855 0 + "/Users/hongyuli/genai/svc/matcha_svc/diagram/architecture.tex" 1772299630.08098 4255 aa1c21a2510686b5a44d4afb58d9de9a "" + "/usr/local/texlive/2025/texmf-dist/fonts/map/fontname/texfonts.map" 1577235249 3524 cb3e574dea2d1052e39280babc910dc8 "" + "/usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmbx10.tfm" 1136768653 1328 c834bbb027764024c09d3d2bf908b5f0 "" + "/usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmbx7.tfm" 1136768653 1336 3125ccb448c1a09074e3aa4a9832f130 "" + "/usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmmi6.tfm" 1136768653 1512 f21f83efb36853c0b70002322c1ab3ad "" + "/usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmmi9.tfm" 1136768653 1524 d89e2d087a9828407a196f428428ef4a "" + "/usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmr6.tfm" 1136768653 1300 b62933e007d01cfd073f79b963c01526 "" + "/usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmr9.tfm" 1136768653 1292 6b21b9c2c7bebb38aa2273f7ca0fb3af "" + "/usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmss10.tfm" 1136768653 1316 b636689f1933f24d1294acdf6041daaa "" + "/usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmss9.tfm" 1136768653 1320 49357c421c0d469f88b867dd0c3d10e8 "" + "/usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmsy6.tfm" 1136768653 1116 933a60c408fc0a863a92debe84b2d294 "" + "/usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmsy9.tfm" 1136768653 1116 25a7bf822c58caf309a702ef79f4afbb "" + "/usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmbx10.pfb" 1248133631 34811 78b52f49e893bcba91bd7581cdc144c0 "" + "/usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmbx7.pfb" 1248133631 32007 e8fa0078355f39467039935974716569 "" + 
"/usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi6.pfb" 1248133631 37166 8ab3487cbe3ab49ebce74c29ea2418db "" + "/usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi9.pfb" 1248133631 36094 798f80770b3b148ceedd006d487db67c "" + "/usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmr6.pfb" 1248133631 32734 69e00a6b65cedb993666e42eedb3d48f "" + "/usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmr9.pfb" 1248133631 33993 9b89b85fd2d9df0482bd47194d1d3bf3 "" + "/usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmss9.pfb" 1248133631 24373 a91d375736817a75026663adcb2190c1 "" + "/usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy9.pfb" 1248133631 32442 c975af247b6702f7ca0c299af3616b80 "" + "/usr/local/texlive/2025/texmf-dist/tex/context/base/mkii/supp-pdf.mkii" 1461363279 71627 94eb9990bed73c364d7f53f960cc8c5b "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/iftex/ifluatex.sty" 1572645307 492 1994775aa15b0d1289725a0b1bbc2d4c "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/iftex/iftex.sty" 1734129479 7984 7dbb9280f03c0a315425f1b4f35d43ee "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex" 1673816307 1016 1c2b89187d12a2768764b83b4945667c "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.code.tex" 1601326656 43820 1fef971b75380574ab35a0d37fd92608 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.code.tex" 1601326656 19324 f4e4c6403dd0f1605fd20ed22fa79dea "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicstate.code.tex" 1601326656 6038 ccb406740cc3f03bbfb58ad504fe8c27 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.code.tex" 1673816307 6911 f6d4cf5a3fef5cc879d668b810e82868 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.code.tex" 1601326656 4883 
42daaf41e27c3735286e23e48d2d7af9 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.code.tex" 1601326656 2544 8c06d2a7f0f469616ac9e13db6d2f842 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconstruct.code.tex" 1601326656 44195 5e390c414de027626ca5e2df888fa68d "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathprocessing.code.tex" 1601326656 17311 2ef6b2e29e2fc6a2fc8d6d652176e257 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage.code.tex" 1601326656 21302 788a79944eb22192a4929e46963a3067 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.code.tex" 1673816307 9691 3d42d89522f4650c2f3dc616ca2b925e "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.code.tex" 1601326656 33335 dd1fa4814d4e51f18be97d88bf0da60c "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.code.tex" 1601326656 2965 4c2b1f4e0826925746439038172e5d6f "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.tex" 1601326656 5196 2cc249e0ee7e03da5f5f6589257b1e5b "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.code.tex" 1673816307 20821 7579108c1e9363e61a0b1584778804aa "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.code.tex" 1601326656 35249 abd4adf948f960299a4b3d27c5dddf46 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransformations.code.tex" 1673816307 22012 81b34a0aa8fa1a6158cc6220b00e4f10 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransparency.code.tex" 1601326656 8893 e851de2175338fdf7c17f3e091d94618 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarybackgrounds.code.tex" 1601326656 4572 4a19637ef65ce88ad2f2d5064b69541d "" + 
"/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarycalc.code.tex" 1601326656 15929 463535aa2c4268fead6674a75c0e8266 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibraryfit.code.tex" 1673816307 3626 2d87dc681257fa32d07a8b3934b10f88 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarypositioning.code.tex" 1601326656 3937 3f208572dd82c71103831da976d74f1a "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibraryshapes.geometric.code.tex" 1601326656 339 be0fe46d92a80e3385dd6a83511a46f2 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarytopaths.code.tex" 1608933718 11518 738408f795261b70ce8dd47459171309 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex" 1673816307 186782 af500404a9edec4d362912fe762ded92 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/pgflibraryarrows.meta.code.tex" 1601326656 58801 1e750fb0692eb99aaac45698bbec96b1 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothandlers.code.tex" 1601326656 32995 ac577023e12c0e4bd8aa420b2e852d1a "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/shapes/pgflibraryshapes.geometric.code.tex" 1673816307 161011 76ab54df0aa1a9d3b27a94864771d38d "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfint.code.tex" 1557692582 3063 8c415c68a0f3394e45cfeca0b65f6ee6 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex" 1673816307 949 cea70942e7b7eddabfb3186befada2e6 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex" 1673816307 13270 2e54f2ce7622437bf37e013d399743e3 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex" 1673816307 104717 9b2393fbf004a0ce7fa688dbce423848 "" + 
"/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.code.tex" 1601326656 10165 cec5fa73d49da442e56efc2d605ef154 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic.code.tex" 1601326656 28178 41c17713108e0795aac6fef3d275fbca "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.tex" 1673816307 9649 85779d3d8d573bfd2cd4137ba8202e60 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.comparison.code.tex" 1601326656 3865 ac538ab80c5cf82b345016e474786549 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integerarithmetics.code.tex" 1557692582 3177 27d85c44fbfe09ff3b2cf2879e3ea434 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.code.tex" 1621110968 11024 0179538121bc2dba172013a3ef89519f "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.random.code.tex" 1673816307 7890 0a86dbf4edfd88d022e0d889ec78cc03 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round.code.tex" 1601326656 3379 781797a101f647bab82741a99944a229 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigonometric.code.tex" 1601326656 92405 f515f31275db273f97b9d8f52e1b0736 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex" 1673816307 37466 97b0a1ba732e306a1a2034f5a73e239f "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex" 1601326656 8471 c2883569d03f69e8e1cabfef4999cfd7 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.code.tex" 1673816307 21211 1e73ec76bd73964d84197cc3d2685b01 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.tex" 1601326656 16121 346f9013d34804439f7436ff6786cef7 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.code.tex" 1673816307 44792 
271e2e1934f34c759f4dedb1e14a5015 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/pgf.revision.tex" 1673816307 114 e6d443369d0673933b38834bf99e422d "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg" 1601326656 926 2963ea0dcf6cc6c0a770b69ec46a477b "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-pdf.def" 1673816307 5542 32f75a31ea6c3a7e1148cd6d5e93dbb7 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-pdftex.def" 1673816307 12612 7774ba67bfd72e593c4436c2de6201e3 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex" 1673816307 61351 bc5f86e0355834391e736e97a61abced "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code.tex" 1601326656 1896 b8e0ca0ac371d74c0ca05583f6313c91 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code.tex" 1601326656 7778 53c8b5623d80238f6a20aa1df1868e63 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex" 1673816307 24033 d8893a1ec4d1bfa101b172754743d340 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex" 1673816307 39784 414c54e866ebab4b801e2ad81d9b21d8 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfkeyslibraryfiltered.code.tex" 1673816307 37433 940bc6d409f1ffd298adfdcaf125dd86 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex" 1673816307 4385 510565c2f07998c8a0e14f0ec07ff23c "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.tex" 1673816307 29239 22e8c7516012992a49873eff0d868fed "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def" 1673816307 6950 8524a062d82b7afdc4a88a57cb377784 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/xkeyval/keyval.tex" 1655411236 2725 1a42bd9e7e57e25fc7763c445f4b785b "" + 
"/usr/local/texlive/2025/texmf-dist/tex/generic/xkeyval/xkeyval.tex" 1655411236 19231 27205ee17aaa2902aea3e0c07a3cfc65 "" + "/usr/local/texlive/2025/texmf-dist/tex/generic/xkeyval/xkvutils.tex" 1655411236 7677 9cb1a74d945bc9331f2181c0a59ff34a "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/base/article.cls" 1738182759 20144 63d8bacaf52e5abf4db3bc322373e1d4 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/base/size10.clo" 1738182759 8448 5cf247d4bd0c7d5d711bbbdf111fae2e "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty" 1579991033 13886 d1306dcf79a944f6988e688c1785f9ce "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/graphics-cfg/color.cfg" 1459978653 1213 620bba36b25224fa9b7e1ccb4ecb76fd "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/graphics-cfg/graphics.cfg" 1465944070 1224 978390e9c2234eab29404bc21b268d1e "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/graphics-def/pdftex.def" 1713382759 19440 9da9dcbb27470349a580fca7372d454b "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/graphics/graphics.sty" 1730496337 18363 dee506cb8d56825d8a4d020f5d5f8704 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/graphics/graphicx.sty" 1717359999 8010 6f2ad8c2b2ffbd607af6475441c7b5e4 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/graphics/keyval.sty" 1717359999 2671 70891d50dac933918b827d326687c6e8 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/graphics/mathcolor.ltx" 1667332637 2885 9c645d672ae17285bba324998918efd8 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/graphics/trig.sty" 1717359999 4023 2c9f39712cf7b43d3eb93a8bbd5c8f67 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def" 1716410060 29785 9f93ab201fe5dd053afcc6c1bcf7d266 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg" 1279039959 678 4792914a8f45be57bb98413425e4c7af "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty" 1601326656 1090 bae35ef70b3168089ef166db3e66f5b2 "" + 
"/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty" 1673816307 373 00b204b1d7d095b892ad31a7494b0373 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-0-65.sty" 1601326656 21013 f4ff83d25bb56552493b030f27c075ae "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-1-18.sty" 1601326656 989 c49c8ae06d96f8b15869da7428047b1e "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty" 1601326656 339 c2e180022e3afdb99c7d0ea5ce469b7d "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/math/pgfmath.sty" 1601326656 306 c56a323ca5bf9242f54474ced10fca71 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty" 1601326656 443 8c872229db56122037e86bcda49e14f3 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/utilities/pgffor.sty" 1601326656 348 ee405e64380c11319f0e249fed57e6c5 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty" 1601326656 274 5ae372b7df79135d240456a1c6f2cf9a "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty" 1601326656 325 f9f16d12354225b7dd52a3321f085955 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/preview/preview.sty" 1719778176 13875 09967d4ab60287cfaa3ce0e42a1524aa "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/standalone/standalone.cfg" 1740345147 1015 4f7ef49662222d6288d944cd07fcac9b "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/standalone/standalone.cls" 1740345147 31554 025731bd61fa01ab395682f1e6e2e146 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/tools/shellesc.sty" 1717359999 4121 6039ae6d0916154d7ba5f20a77b9ab2c "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/xcolor/xcolor.sty" 1727642399 55384 b454dec21c2d9f45ec0b793f0995b992 "" + "/usr/local/texlive/2025/texmf-dist/tex/latex/xkeyval/xkeyval.sty" 1655411236 4937 4ce600ce9bd4ec84d0250eb6892fcf4f "" + "/usr/local/texlive/2025/texmf-dist/web2c/texmf.cnf" 1739380943 42148 
61becc7c670cd061bb319c643c27fdd4 "" + "/usr/local/texlive/2025/texmf-var/fonts/map/pdftex/updmap/pdftex.map" 1750650403 5467155 19efa205003f9ecad95fbbaa6ff24da1 "" + "/usr/local/texlive/2025/texmf-var/web2c/pdftex/pdflatex.fmt" 1741450574 3345740 46b66fdb0378f7bf5921b5eabf1762b8 "" + "/usr/local/texlive/2025/texmf.cnf" 1741450484 577 418a7058ec8e006d8704f60ecd22c938 "" + "architecture.aux" 1772299689.08618 32 3985256e7290058c681f74d7a3565a19 "pdflatex" + "architecture.tex" 1772299630.08098 4255 aa1c21a2510686b5a44d4afb58d9de9a "" + (generated) + "architecture.aux" + "architecture.log" + "architecture.pdf" + (rewritten before read) diff --git a/diagram/architecture.fls b/diagram/architecture.fls new file mode 100644 index 0000000000000000000000000000000000000000..b2e8f27224b9e263ea8f1b65fe7818aa91c72c83 --- /dev/null +++ b/diagram/architecture.fls @@ -0,0 +1,198 @@ +PWD /Users/hongyuli/genai/svc/matcha_svc/diagram +INPUT /usr/local/texlive/2025/texmf.cnf +INPUT /usr/local/texlive/2025/texmf-dist/web2c/texmf.cnf +INPUT /usr/local/texlive/2025/texmf-var/web2c/pdftex/pdflatex.fmt +INPUT /Users/hongyuli/genai/svc/matcha_svc/diagram/architecture.tex +OUTPUT architecture.log +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/standalone/standalone.cls +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/standalone/standalone.cls +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/tools/shellesc.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/tools/shellesc.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/tools/shellesc.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/iftex/ifluatex.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/iftex/ifluatex.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/iftex/ifluatex.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/iftex/iftex.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/iftex/iftex.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/xkeyval/xkeyval.sty +INPUT 
/usr/local/texlive/2025/texmf-dist/tex/latex/xkeyval/xkeyval.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/xkeyval/xkeyval.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/xkeyval/xkvutils.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/xkeyval/keyval.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/standalone/standalone.cfg +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/standalone/standalone.cfg +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/standalone/standalone.cfg +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/preview/preview.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/base/article.cls +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/base/article.cls +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/base/size10.clo +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/base/size10.clo +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/base/size10.clo +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/pgf.revision.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/pgf.revision.tex +INPUT 
/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics/graphicx.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics/graphicx.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics/keyval.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics/graphics.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics/graphics.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics/trig.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics/trig.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics-cfg/graphics.cfg +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics-cfg/graphics.cfg +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics-cfg/graphics.cfg +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics-def/pdftex.def +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics-def/pdftex.def +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics-def/pdftex.def +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfkeyslibraryfiltered.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-pdftex.def +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-pdftex.def +INPUT 
/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-pdf.def +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/xcolor/xcolor.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/xcolor/xcolor.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics-cfg/color.cfg +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics-cfg/color.cfg +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics-cfg/color.cfg +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics/mathcolor.ltx +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics/mathcolor.ltx +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/graphics/mathcolor.ltx +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic.code.tex +INPUT 
/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigonometric.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.random.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.comparison.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integerarithmetics.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfint.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconstruct.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicstate.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransformations.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathprocessing.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.code.tex +INPUT 
/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransparency.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-0-65.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-0-65.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-1-18.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-1-18.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/utilities/pgffor.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/utilities/pgffor.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/math/pgfmath.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/pgf/math/pgfmath.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex +INPUT 
/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothandlers.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothandlers.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarytopaths.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarytopaths.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibraryshapes.geometric.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibraryshapes.geometric.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/shapes/pgflibraryshapes.geometric.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/shapes/pgflibraryshapes.geometric.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/pgflibraryarrows.meta.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/pgflibraryarrows.meta.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/pgflibraryarrows.meta.code.tex +INPUT 
/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarypositioning.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarypositioning.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarycalc.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarycalc.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibraryfit.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibraryfit.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarybackgrounds.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarybackgrounds.code.tex +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def +INPUT ./architecture.aux +INPUT ./architecture.aux +INPUT architecture.aux +OUTPUT architecture.aux +INPUT /usr/local/texlive/2025/texmf-dist/tex/context/base/mkii/supp-pdf.mkii +INPUT /usr/local/texlive/2025/texmf-dist/tex/context/base/mkii/supp-pdf.mkii +INPUT /usr/local/texlive/2025/texmf-dist/tex/context/base/mkii/supp-pdf.mkii +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg +INPUT /usr/local/texlive/2025/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg +INPUT /usr/local/texlive/2025/texmf-dist/fonts/map/fontname/texfonts.map +INPUT /usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmss10.tfm +INPUT 
/usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmss9.tfm +INPUT /usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmbx7.tfm +INPUT /usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmr9.tfm +INPUT /usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmr6.tfm +INPUT /usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmmi9.tfm +INPUT /usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmmi6.tfm +INPUT /usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmsy9.tfm +INPUT /usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmsy6.tfm +INPUT /usr/local/texlive/2025/texmf-dist/fonts/tfm/public/cm/cmbx10.tfm +OUTPUT architecture.pdf +INPUT /usr/local/texlive/2025/texmf-var/fonts/map/pdftex/updmap/pdftex.map +INPUT architecture.aux +INPUT /usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmbx10.pfb +INPUT /usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmbx7.pfb +INPUT /usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi6.pfb +INPUT /usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi9.pfb +INPUT /usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmr6.pfb +INPUT /usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmr9.pfb +INPUT /usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmss9.pfb +INPUT /usr/local/texlive/2025/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy9.pfb diff --git a/diagram/architecture.log b/diagram/architecture.log new file mode 100644 index 0000000000000000000000000000000000000000..3713a1c77de8722a9a34e3717e76f12eaeabe51c --- /dev/null +++ b/diagram/architecture.log @@ -0,0 +1,352 @@ +This is pdfTeX, Version 3.141592653-2.6-1.40.27 (TeX Live 2025) (preloaded format=pdflatex 2025.3.8) 28 FEB 2026 17:28 +entering extended mode + restricted \write18 enabled. + file:line:error style messages enabled. + %&-line parsing enabled. 
+**/Users/hongyuli/genai/svc/matcha_svc/diagram/architecture.tex +(/Users/hongyuli/genai/svc/matcha_svc/diagram/architecture.tex +LaTeX2e <2024-11-01> patch level 2 +L3 programming layer <2025-01-18> +(/usr/local/texlive/2025/texmf-dist/tex/latex/standalone/standalone.cls +Document Class: standalone 2025/02/22 v1.5a Class to compile TeX sub-files standalone +(/usr/local/texlive/2025/texmf-dist/tex/latex/tools/shellesc.sty +Package: shellesc 2023/07/08 v1.0d unified shell escape interface for LaTeX +Package shellesc Info: Restricted shell escape enabled on input line 77. +) (/usr/local/texlive/2025/texmf-dist/tex/generic/iftex/ifluatex.sty +Package: ifluatex 2019/10/25 v1.5 ifluatex legacy package. Use iftex instead. + (/usr/local/texlive/2025/texmf-dist/tex/generic/iftex/iftex.sty +Package: iftex 2024/12/12 v1.0g TeX engine tests +)) (/usr/local/texlive/2025/texmf-dist/tex/latex/xkeyval/xkeyval.sty +Package: xkeyval 2022/06/16 v2.9 package option processing (HA) + (/usr/local/texlive/2025/texmf-dist/tex/generic/xkeyval/xkeyval.tex (/usr/local/texlive/2025/texmf-dist/tex/generic/xkeyval/xkvutils.tex +\XKV@toks=\toks17 +\XKV@tempa@toks=\toks18 + (/usr/local/texlive/2025/texmf-dist/tex/generic/xkeyval/keyval.tex)) +\XKV@depth=\count196 +File: xkeyval.tex 2014/12/03 v2.7a key=value parser (HA) +)) +\sa@internal=\count197 +\c@sapage=\count198 + (/usr/local/texlive/2025/texmf-dist/tex/latex/standalone/standalone.cfg +File: standalone.cfg 2025/02/22 v1.5a Default configuration file for 'standalone' class +) (/usr/local/texlive/2025/texmf-dist/tex/latex/base/article.cls +Document Class: article 2024/06/29 v1.4n Standard LaTeX document class +(/usr/local/texlive/2025/texmf-dist/tex/latex/base/size10.clo +File: size10.clo 2024/06/29 v1.4n Standard LaTeX file (size option) +) +\c@part=\count199 +\c@section=\count266 +\c@subsection=\count267 +\c@subsubsection=\count268 +\c@paragraph=\count269 +\c@subparagraph=\count270 +\c@figure=\count271 +\c@table=\count272 
+\abovecaptionskip=\skip49 +\belowcaptionskip=\skip50 +\bibindent=\dimen141 +) (/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty (/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty (/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.tex +\pgfutil@everybye=\toks19 +\pgfutil@tempdima=\dimen142 +\pgfutil@tempdimb=\dimen143 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def +\pgfutil@abb=\box52 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/pgf.revision.tex) +Package: pgfrcs 2023-01-15 v3.1.10 (3.1.10) +)) +Package: pgf 2023-01-15 v3.1.10 (3.1.10) + (/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty (/usr/local/texlive/2025/texmf-dist/tex/latex/graphics/graphicx.sty +Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR) + (/usr/local/texlive/2025/texmf-dist/tex/latex/graphics/graphics.sty +Package: graphics 2024/08/06 v1.4g Standard LaTeX Graphics (DPC,SPQR) + (/usr/local/texlive/2025/texmf-dist/tex/latex/graphics/trig.sty +Package: trig 2023/12/02 v1.11 sin cos tan (DPC) +) (/usr/local/texlive/2025/texmf-dist/tex/latex/graphics-cfg/graphics.cfg +File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration +) +Package graphics Info: Driver file: pdftex.def on input line 106. 
+ (/usr/local/texlive/2025/texmf-dist/tex/latex/graphics-def/pdftex.def +File: pdftex.def 2024/04/13 v1.2c Graphics/color driver for pdftex +)) +\Gin@req@height=\dimen144 +\Gin@req@width=\dimen145 +) (/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex +Package: pgfsys 2023-01-15 v3.1.10 (3.1.10) + (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex +\pgfkeys@pathtoks=\toks20 +\pgfkeys@temptoks=\toks21 + (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfkeyslibraryfiltered.code.tex +\pgfkeys@tmptoks=\toks22 +)) +\pgf@x=\dimen146 +\pgf@y=\dimen147 +\pgf@xa=\dimen148 +\pgf@ya=\dimen149 +\pgf@xb=\dimen150 +\pgf@yb=\dimen151 +\pgf@xc=\dimen152 +\pgf@yc=\dimen153 +\pgf@xd=\dimen154 +\pgf@yd=\dimen155 +\w@pgf@writea=\write3 +\r@pgf@reada=\read2 +\c@pgf@counta=\count273 +\c@pgf@countb=\count274 +\c@pgf@countc=\count275 +\c@pgf@countd=\count276 +\t@pgf@toka=\toks23 +\t@pgf@tokb=\toks24 +\t@pgf@tokc=\toks25 +\pgf@sys@id@count=\count277 + (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg +File: pgf.cfg 2023-01-15 v3.1.10 (3.1.10) +) +Driver file for pgf: pgfsys-pdftex.def + (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-pdftex.def +File: pgfsys-pdftex.def 2023-01-15 v3.1.10 (3.1.10) + (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-pdf.def +File: pgfsys-common-pdf.def 2023-01-15 v3.1.10 (3.1.10) +))) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code.tex +File: pgfsyssoftpath.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfsyssoftpath@smallbuffer@items=\count278 +\pgfsyssoftpath@bigbuffer@items=\count279 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code.tex +File: pgfsysprotocol.code.tex 2023-01-15 v3.1.10 (3.1.10) +)) (/usr/local/texlive/2025/texmf-dist/tex/latex/xcolor/xcolor.sty +Package: xcolor 
2024/09/29 v3.02 LaTeX color extensions (UK) + (/usr/local/texlive/2025/texmf-dist/tex/latex/graphics-cfg/color.cfg +File: color.cfg 2016/01/02 v1.6 sample color configuration +) +Package xcolor Info: Driver file: pdftex.def on input line 274. + (/usr/local/texlive/2025/texmf-dist/tex/latex/graphics/mathcolor.ltx) +Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1349. +Package xcolor Info: Model `hsb' substituted by `rgb' on input line 1353. +Package xcolor Info: Model `RGB' extended on input line 1365. +Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1367. +Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1368. +Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1369. +Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1370. +Package xcolor Info: Model `Gray' substituted by `gray' on input line 1371. +Package xcolor Info: Model `wave' substituted by `hsb' on input line 1372. +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex +Package: pgfcore 2023-01-15 v3.1.10 (3.1.10) + (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex +\pgfmath@dimen=\dimen156 +\pgfmath@count=\count280 +\pgfmath@box=\box53 +\pgfmath@toks=\toks26 +\pgfmath@stack@operand=\toks27 +\pgfmath@stack@operation=\toks28 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.tex) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic.code.tex) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigonometric.code.tex) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.random.code.tex) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.comparison.code.tex) 
(/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.code.tex) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round.code.tex) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.code.tex) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integerarithmetics.code.tex) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex +\c@pgfmathroundto@lastzeros=\count281 +)) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfint.code.tex) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.code.tex +File: pgfcorepoints.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@picminx=\dimen157 +\pgf@picmaxx=\dimen158 +\pgf@picminy=\dimen159 +\pgf@picmaxy=\dimen160 +\pgf@pathminx=\dimen161 +\pgf@pathmaxx=\dimen162 +\pgf@pathminy=\dimen163 +\pgf@pathmaxy=\dimen164 +\pgf@xx=\dimen165 +\pgf@xy=\dimen166 +\pgf@yx=\dimen167 +\pgf@yy=\dimen168 +\pgf@zx=\dimen169 +\pgf@zy=\dimen170 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconstruct.code.tex +File: pgfcorepathconstruct.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@path@lastx=\dimen171 +\pgf@path@lasty=\dimen172 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage.code.tex +File: pgfcorepathusage.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@shorten@end@additional=\dimen173 +\pgf@shorten@start@additional=\dimen174 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.code.tex +File: pgfcorescopes.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfpic=\box54 +\pgf@hbox=\box55 +\pgf@layerbox@main=\box56 +\pgf@picture@serial@count=\count282 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicstate.code.tex +File: pgfcoregraphicstate.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgflinewidth=\dimen175 +) 
(/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransformations.code.tex +File: pgfcoretransformations.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@pt@x=\dimen176 +\pgf@pt@y=\dimen177 +\pgf@pt@temp=\dimen178 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.code.tex +File: pgfcorequick.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.code.tex +File: pgfcoreobjects.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathprocessing.code.tex +File: pgfcorepathprocessing.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.code.tex +File: pgfcorearrows.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfarrowsep=\dimen179 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.code.tex +File: pgfcoreshade.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@max=\dimen180 +\pgf@sys@shading@range@num=\count283 +\pgf@shadingcount=\count284 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.code.tex +File: pgfcoreimage.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.code.tex +File: pgfcoreexternal.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfexternal@startupbox=\box57 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.code.tex +File: pgfcorelayers.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransparency.code.tex +File: pgfcoretransparency.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.code.tex +File: pgfcorepatterns.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.tex +File: pgfcorerdf.code.tex 2023-01-15 
v3.1.10 (3.1.10) +))) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.code.tex +File: pgfmoduleshapes.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfnodeparttextbox=\box58 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.tex +File: pgfmoduleplot.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-0-65.sty +Package: pgfcomp-version-0-65 2023-01-15 v3.1.10 (3.1.10) +\pgf@nodesepstart=\dimen181 +\pgf@nodesepend=\dimen182 +) (/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-1-18.sty +Package: pgfcomp-version-1-18 2023-01-15 v3.1.10 (3.1.10) +)) (/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/utilities/pgffor.sty (/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex)) (/usr/local/texlive/2025/texmf-dist/tex/latex/pgf/math/pgfmath.sty (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex)) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex +Package: pgffor 2023-01-15 v3.1.10 (3.1.10) +\pgffor@iter=\dimen183 +\pgffor@skip=\dimen184 +\pgffor@stack=\toks29 +\pgffor@toks=\toks30 +)) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex +Package: tikz 2023-01-15 v3.1.10 (3.1.10) + (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothandlers.code.tex +File: pgflibraryplothandlers.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@plot@mark@count=\count285 +\pgfplotmarksize=\dimen185 +) +\tikz@lastx=\dimen186 +\tikz@lasty=\dimen187 +\tikz@lastxsaved=\dimen188 +\tikz@lastysaved=\dimen189 +\tikz@lastmovetox=\dimen190 +\tikz@lastmovetoy=\dimen191 +\tikzleveldistance=\dimen192 +\tikzsiblingdistance=\dimen193 +\tikz@figbox=\box59 +\tikz@figbox@bg=\box60 +\tikz@tempbox=\box61 +\tikz@tempbox@bg=\box62 +\tikztreelevel=\count286 
+\tikznumberofchildren=\count287 +\tikznumberofcurrentchild=\count288 +\tikz@fig@count=\count289 + (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.code.tex +File: pgfmodulematrix.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfmatrixcurrentrow=\count290 +\pgfmatrixcurrentcolumn=\count291 +\pgf@matrix@numberofcolumns=\count292 +) +\tikz@expandcount=\count293 + (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarytopaths.code.tex +File: tikzlibrarytopaths.code.tex 2023-01-15 v3.1.10 (3.1.10) +))) +\sa@box=\box63 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibraryshapes.geometric.code.tex +File: tikzlibraryshapes.geometric.code.tex 2023-01-15 v3.1.10 (3.1.10) + (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/shapes/pgflibraryshapes.geometric.code.tex +File: pgflibraryshapes.geometric.code.tex 2023-01-15 v3.1.10 (3.1.10) +)) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/libraries/pgflibraryarrows.meta.code.tex +File: pgflibraryarrows.meta.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfarrowinset=\dimen194 +\pgfarrowlength=\dimen195 +\pgfarrowwidth=\dimen196 +\pgfarrowlinewidth=\dimen197 +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarypositioning.code.tex +File: tikzlibrarypositioning.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarycalc.code.tex +File: tikzlibrarycalc.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibraryfit.code.tex +File: tikzlibraryfit.code.tex 2023-01-15 v3.1.10 (3.1.10) +) (/usr/local/texlive/2025/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibrarybackgrounds.code.tex +File: tikzlibrarybackgrounds.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@layerbox@background=\box64 +\pgf@layerboxsaved@background=\box65 +) 
(/usr/local/texlive/2025/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def +File: l3backend-pdftex.def 2024-05-08 L3 backend support: PDF output (pdfTeX) +\l__color_backend_stack_int=\count294 +\l__pdf_internal_box=\box66 +) (./architecture.aux) +\openout1 = `architecture.aux'. + +LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 5. +LaTeX Font Info: ... okay on input line 5. +LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 5. +LaTeX Font Info: ... okay on input line 5. +LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 5. +LaTeX Font Info: ... okay on input line 5. +LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 5. +LaTeX Font Info: ... okay on input line 5. +LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 5. +LaTeX Font Info: ... okay on input line 5. +LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 5. +LaTeX Font Info: ... okay on input line 5. +LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 5. +LaTeX Font Info: ... okay on input line 5. + (/usr/local/texlive/2025/texmf-dist/tex/context/base/mkii/supp-pdf.mkii +[Loading MPS to PDF converter (version 2006.09.02).] +\scratchcounter=\count295 +\scratchdimen=\dimen198 +\scratchbox=\box67 +\nofMPsegments=\count296 +\nofMParguments=\count297 +\everyMPshowfont=\toks31 +\MPscratchCnt=\count298 +\MPscratchDim=\dimen199 +\MPnumerator=\count299 +\makeMPintoPDFobject=\count300 +\everyMPtoPDFconversion=\toks32 +) (/usr/local/texlive/2025/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty +Package: epstopdf-base 2020-01-24 v2.11 Base part for package epstopdf +Package epstopdf-base Info: Redefining graphics rule for `.eps' on input line 485. + (/usr/local/texlive/2025/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg +File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Live +)) +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <9> on input line 59. 
+LaTeX Font Info: External font `cmex10' loaded for size +(Font) <6> on input line 59. +LaTeX Font Info: External font `cmex10' loaded for size +(Font) <5> on input line 59. + + +[1 + + +Non-PDF special ignored! + papersize=437.02661pt,472.99706pt +{/usr/local/texlive/2025/texmf-var/fonts/map/pdftex/updmap/pdftex.map}] (./architecture.aux) + *********** +LaTeX2e <2024-11-01> patch level 2 +L3 programming layer <2025-01-18> + *********** + ) +Here is how much of TeX's memory you used: + 13623 strings out of 473190 + 290342 string characters out of 5715801 + 693796 words of memory out of 5000000 + 36652 multiletter control sequences out of 15000+600000 + 561889 words of font info for 46 fonts, out of 8000000 for 9000 + 1141 hyphenation exceptions out of 8191 + 117i,9n,121p,459b,948s stack positions out of 10000i,1000n,20000p,200000b,200000s + +Output written on architecture.pdf (1 page, 89006 bytes). +PDF statistics: + 51 PDF objects out of 1000 (max. 8388607) + 31 compressed objects within 1 object stream + 0 named destinations out of 1000 (max. 500000) + 13 words of extra memory for PDF output out of 10000 (max. 
10000000) + diff --git a/diagram/architecture.synctex.gz b/diagram/architecture.synctex.gz new file mode 100644 index 0000000000000000000000000000000000000000..190dbb6db3a076cb8ef5d0c5ce5f7f3169f100f3 Binary files /dev/null and b/diagram/architecture.synctex.gz differ diff --git a/diagram/architecture.tex b/diagram/architecture.tex new file mode 100644 index 0000000000000000000000000000000000000000..729a1b4015d7d37927a4509814344f421265b916 --- /dev/null +++ b/diagram/architecture.tex @@ -0,0 +1,90 @@ +\documentclass[tikz,border=10pt]{standalone} +\usepackage{tikz} +\usetikzlibrary{shapes.geometric, arrows.meta, positioning, calc, fit, backgrounds} + +\begin{document} + +\begin{tikzpicture}[ + font=\sffamily\small, + >=Stealth, + node distance=1.2cm and 1.8cm, + % Styles + data/.style={rectangle, draw=blue!60, fill=blue!5, very thick, minimum width=2.5cm, minimum height=0.8cm, align=center, rounded corners}, + process/.style={rectangle, draw=orange!80, fill=orange!5, very thick, minimum width=2.8cm, minimum height=0.8cm, align=center}, + model/.style={rectangle, draw=green!60!black, fill=green!5, very thick, minimum width=2.8cm, minimum height=0.8cm, align=center}, + frozen/.style={rectangle, draw=gray!80, fill=gray!10, very thick, minimum width=2.8cm, minimum height=0.8cm, align=center, dashed}, + box/.style={rectangle, draw=gray, dotted, inner sep=12pt, thick, rounded corners}, + device/.style={rectangle, fill=red!10, text=red!80!black, font=\scriptsize\bfseries, rounded corners, inner sep=2pt} +] + +% --- PIPELINE --- + +% 1. Raw Data Input +\node[data] (audio_in) {Raw Audio\\(.wav)}; +\node[device, anchor=south east] at (audio_in.north east) {macOS}; + +% 2. 
Preprocessing & Offline Feature Extraction +\node[process, below=of audio_in, yshift=0.3cm] (resample) {Resample\\16KHz \& 32KHz}; + +\node[frozen, below left=1cm and -1cm of resample] (whisper) {Whisper PPG\\(Phonemes)}; +\node[device, anchor=south east] at (whisper.north east) {mps/cpu}; + +\node[frozen, below right=1cm and -1.5cm of resample] (crepe) {CREPE\\(F0 Pitch)}; +\node[device, anchor=south east] at (crepe.north east) {mps}; + +\node[frozen, right=of crepe, xshift=-0.5cm] (dac) {DAC Codec\\(Target Z)}; +\node[device, anchor=south east] at (dac.north east) {mps}; + +\draw[->, thick] (audio_in) -- (resample); +\draw[->, thick] (resample) -| (whisper); +\draw[->, thick] (resample) -| (crepe); +\draw[->, thick] (audio_in) -| (dac); + +% 3. Offline Data Storage +\node[data, below=of whisper, yshift=0.3cm] (npy_ppg) {.ppg.npy}; +\node[data, below=of crepe, yshift=0.3cm] (npy_f0) {.pit.npy}; +\node[data, below=of dac, yshift=0.3cm] (pt_z) {z\_target.pt}; +\draw[->, thick] (whisper) -- (npy_ppg); +\draw[->, thick] (crepe) -- (npy_f0); +\draw[->, thick] (dac) -- (pt_z); + +% 4. Dataloader & Conditioning +\node[process, below=1.5cm of npy_f0] (cond_enc) {Cond Encoder\\(Resampling \& Fusion)}; +\node[device, anchor=south east] at (cond_enc.north east) {mps}; + +\draw[->, thick] (npy_ppg) |- (cond_enc); +\draw[->, thick] (npy_f0) -- (cond_enc); + +% 5. Flow Matching / DiT Training +\node[model, right=of cond_enc, xshift=1cm] (dit) {Diffusion Transformer\\(DiT) $v_\theta$}; +\node[device, anchor=south east] at (dit.north east) {mps}; +\node[process, above=of dit, yshift=-0.5cm] (ode) {ODE Solver / Flow\\$z_t \sim \mathcal{N}(0, I)$}; + +\draw[->, thick] (pt_z) edge[bend left=20] node[right, align=center] {Target\\$z_1$} (ode); +\draw[<->, thick, bend right=20, red] (ode) to node[left, align=center] {Predict $v$\\Step $z$} (dit); +\draw[->, thick, dashed] (cond_enc) -- node[above] {Condition $c$} (dit); + +% 6. 
Projection and Decode +\node[model, below=of dit, yshift=0.3cm] (projector) {Projector CNN\\$P(u)$}; +\node[device, anchor=south east] at (projector.north east) {mps}; + +\draw[->, thick] (ode) edge[bend right=45] node[left] {$\hat{u}$} (projector); + +\node[frozen, below=of projector, yshift=0.3cm] (decoder) {DAC Decoder\\(Frozen)}; + +\draw[->, thick] (projector) -- node[right] {$\hat{z}$} (decoder); +\draw[->, dashed, thick, blue] (pt_z) |- node[near start, right] {Loss} (projector); + +\node[data, below=of decoder, yshift=0.3cm] (audio_out) {Synthesized Audio\\(.wav)}; +\draw[->, thick] (decoder) -- (audio_out); + +% --- GROUPING BOXES --- +\begin{scope}[on background layer] + \node[box, fit=(audio_in) (resample) (whisper) (crepe) (dac) (pt_z), fill=gray!5, label={[font=\bfseries, text=black]90:1. Offline Preprocessing Tier}] {}; + \node[box, fit=(cond_enc) (dit) (ode), fill=blue!5, label={[font=\bfseries, text=black]90:2. Continuous Flow Matching (MPS GPU)}] {}; + \node[box, fit=(projector) (decoder) (audio_out), fill=green!5, label={[font=\bfseries, text=black]90:3. 
Codebook Re-Projection \& Output}] {}; +\end{scope} + +\end{tikzpicture} + +\end{document} diff --git a/distillation_job.log b/distillation_job.log new file mode 100644 index 0000000000000000000000000000000000000000..cd18613d0182f06ae30fd9aa49db31b0790c4dd6 --- /dev/null +++ b/distillation_job.log @@ -0,0 +1,39 @@ +./dataset_raw +./data_svc/waves-16k +16000 +>>>>>>>>>>singer_0038<<<<<<<<<< + Processing files: 0%| | 0/642 [00:00>>>>>>>>>singer_0032<<<<<<<<<< + Processing files: 0%| | 0/155 [00:00>>>>>>>>>singer_0045<<<<<<<<<< + Processing files: 0%| | 0/1244 [00:00>>>>>>>>>singer_0064<<<<<<<<<< + Processing files: 0%| | 0/326 [00:00>>>>>>>>>singer_0013<<<<<<<<<< + Processing files: 0%| | 0/467 [00:00>>>>>>>>>singer_0019<<<<<<<<<< + Processing files: 0%| | 0/438 [00:00>>>>>>>>>singer_0021<<<<<<<<<< + Processing files: 0%| | 0/498 [00:00>>>>>>>>>singer_0056<<<<<<<<<< + Processing files: 0%| | 0/604 [00:00>>>>>>>>>singer_0051<<<<<<<<<< + Processing files: 0%| | 0/632 [00:00>>>>>>>>>singer_0026<<<<<<<<<< + Processing files: 0%| | 0/410 [00:00>>>>>>>>>singer_0014<<<<<<<<<< + Processing files: 0%| | 0/378 [00:00>>>>>>>>>singer_0063<<<<<<<<<< + Processing files: 0%| | 0/572 [00:00>>>>>>>>>singer_0069<<<<<<<<<< + Processing files: 0%| | 0/761 [00:00>>>>>>>>>singer_0048<<<<<<<<<< + Processing files: 0%| | 0/675 [00:00>>>>>>>>>singer_0042<<<<<<<<<< + Processing files: 0%| | 0/269 [00:00>>>>>>>>>singer_0035<<<<<<<<<< + Processing files: 0%| | 0/624 [00:00>>>>>>>>>singer_0007<<<<<<<<<< + Processing files: 0%| | 0/534 [00:00>>>>>>>>>singer_0070<<<<<<<<<< + Processing files: 0%| | 0/497 [00:00.spk.npy (one per entry in OPENSINGER_SINGERS) + +Setup in the Space repo: + git lfs track "*.pt" "*.safetensors" "*.pth.tar" + git add .gitattributes + git commit && git push +""" + +import math +import os +import re +import subprocess +import sys +import tempfile +import time + +import numpy as np +import soundfile as sf +import torch +import torchaudio.functional as TAF +import gradio as 
gr + +PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__)) +SAMPLE_RATE = 24000 + +# ── OpenSinger singers ──────────────────────────────────────────────────────── +# Add singer IDs that you have uploaded to HF_MODEL_REPO under opensinger/. +# For each ID "SingerXX" the manifest expects: +# opensinger/SingerXX.spk.npy — 256-d speaker embedding +# opensinger/SingerXX_ref.wav — short reference audio clip (~3-5 s, 24 kHz) +OPENSINGER_SINGERS: list = [ + # "Singer01", "Singer02", ← uncomment / extend once embeddings are ready +] + + +# ────────────────────────────────────────────────────────────────────────────── +# Startup: sanity-check that required files are present +# (commit them to the Space repo via Git LFS — no runtime download needed) +# ────────────────────────────────────────────────────────────────────────────── + +def _check_files(): + required = [ + os.path.join(PROJECT_ROOT, "chkpt_f5svc", "model_1200000.safetensors"), + os.path.join(PROJECT_ROOT, "chkpt_f5svc", "stage1_epoch_best.pt"), + os.path.join(PROJECT_ROOT, "whisper_pretrain", "large-v2.pt"), + os.path.join(PROJECT_ROOT, "hubert_pretrain", "hubert-soft-0d54a1f4.pt"), + os.path.join(PROJECT_ROOT, "speaker_pretrain", "best_model.pth.tar"), + os.path.join(PROJECT_ROOT, "speaker_pretrain", "config.json"), + os.path.join(PROJECT_ROOT, "examples", "obama.spk.npy"), + os.path.join(PROJECT_ROOT, "examples", "obama_ref.wav"), + ] + for sid in OPENSINGER_SINGERS: + required.append(os.path.join(PROJECT_ROOT, "examples", "opensinger", f"{sid}.spk.npy")) + missing = [p for p in required if not os.path.isfile(p)] + if missing: + for p in missing: + print(f" WARNING: missing {p}") + else: + print("All model files present.") + + +_check_files() + + +# ────────────────────────────────────────────────────────────────────────────── +# Device + model cache +# ────────────────────────────────────────────────────────────────────────────── + +def _device(): + if torch.cuda.is_available(): + return 
torch.device("cuda") + if torch.backends.mps.is_available(): + return torch.device("mps") + return torch.device("cpu") + + +_MODEL_CACHE: dict = {} + + +def _load_model(device): + import glob + best = os.path.join(PROJECT_ROOT, "chkpt_f5svc", "stage1_epoch_best.pt") + if not os.path.isfile(best): + pts = sorted(glob.glob(os.path.join(PROJECT_ROOT, "chkpt_f5svc", "stage1_epoch_*.pt"))) + if not pts: + raise FileNotFoundError("No Stage 1 checkpoint found.") + best = pts[-1] + + key = (best, str(device)) + if key in _MODEL_CACHE: + return _MODEL_CACHE[key] + + from models.f5_svc import build_f5svc + f5_ckpt = os.path.join(PROJECT_ROOT, "chkpt_f5svc", "model_1200000.safetensors") + model = build_f5svc(f5tts_ckpt_path=f5_ckpt, lora_rank=16).to(device) + s1 = torch.load(best, map_location=device, weights_only=False) + model.load_state_dict(s1.get("trainable", s1.get("model", s1)), strict=False) + model.eval() + _MODEL_CACHE[key] = model + print(f"Model cached: {os.path.basename(best)}") + return model + + +# ────────────────────────────────────────────────────────────────────────────── +# Source feature extraction +# ────────────────────────────────────────────────────────────────────────────── + +def _save_24k(src: str, dst: str): + arr, sr = sf.read(src, always_2d=True) + arr = arr.mean(axis=1).astype(np.float32) + if sr != SAMPLE_RATE: + arr = TAF.resample(torch.tensor(arr).unsqueeze(0), sr, SAMPLE_RATE).squeeze(0).numpy() + sf.write(dst, arr, SAMPLE_RATE, subtype="PCM_16") + + +def _extract(wav_path: str, file_id: str, log, infer_root: str): + waves_dir = os.path.join(infer_root, "waves-32k", "source") + os.makedirs(waves_dir, exist_ok=True) + _save_24k(wav_path, os.path.join(waves_dir, f"{file_id}.wav")) + + for name, cmd in [ + ("PPG", [sys.executable, "prepare/preprocess_ppg.py", + "-w", os.path.join(infer_root, "waves-32k"), + "-p", os.path.join(infer_root, "whisper")]), + ("HuBERT", [sys.executable, "prepare/preprocess_hubert.py", + "-w", 
os.path.join(infer_root, "waves-32k"), + "-v", os.path.join(infer_root, "hubert"), "-t", "1"]), + ("F0", [sys.executable, "prepare/preprocess_crepe.py", + "-w", os.path.join(infer_root, "waves-32k"), + "-p", os.path.join(infer_root, "pitch"), "-t", "1"]), + ]: + log(f" {name}...") + r = subprocess.run(cmd, capture_output=True, text=True, cwd=PROJECT_ROOT) + if r.returncode != 0: + log(f" Warning: {name} exited {r.returncode}") + + +def _load_tensors(file_id: str, f0_shift: float, device, infer_root: str): + from svc_data.mel_svc_dataset import ( + SAMPLE_RATE as SR, HOP_LENGTH, _build_mel_transform, _resample_to, + ) + + wav_path = os.path.join(infer_root, "waves-32k", "source", f"{file_id}.wav") + arr, sr = sf.read(wav_path, always_2d=True) + arr = arr.mean(axis=1).astype(np.float32) + if sr != SR: + arr = TAF.resample(torch.tensor(arr).unsqueeze(0), sr, SR).squeeze(0).numpy() + + mel = torch.log(_build_mel_transform()(torch.tensor(arr).unsqueeze(0)).clamp(min=1e-5)).squeeze(0).T + t_mel = mel.shape[0] + + def _npy(path, shape): + return torch.tensor(np.load(path)).float() if os.path.isfile(path) else torch.zeros(*shape) + + ppg = _resample_to(_npy(os.path.join(infer_root, "whisper", "source", f"{file_id}.ppg.npy"), (t_mel, 1280)), t_mel) + hubert = _resample_to(_npy(os.path.join(infer_root, "hubert", "source", f"{file_id}.vec.npy"), (t_mel, 256)), t_mel) + f0_raw = _npy(os.path.join(infer_root, "pitch", "source", f"{file_id}.pit.npy"), (t_mel,)) + if f0_shift != 0.0: + f0_raw = f0_raw * math.pow(2.0, f0_shift / 12.0) + f0 = _resample_to(torch.where(f0_raw > 0, torch.log(f0_raw.clamp(min=1.0)), torch.zeros_like(f0_raw)).unsqueeze(-1), t_mel) + + return (mel.unsqueeze(0).to(device), + ppg.unsqueeze(0).to(device), + hubert.unsqueeze(0).to(device), + f0.unsqueeze(0).to(device)) + + +# ────────────────────────────────────────────────────────────────────────────── +# Inference +# ────────────────────────────────────────────────────────────────────────────── + +def 
def run_inference(source_audio, ref_audio, spk_source, opensinger_singer,
                  custom_spk, steps, solver, ref_sec, f0_shift):
    """Gradio entry point: convert *source_audio* to the selected speaker.

    Returns ``(output_wav_path | None, log_text)``.  All intermediate
    features live in a throw-away temp directory that is always removed,
    even on failure.
    """
    if source_audio is None:
        return None, "Please upload a source singing audio file."

    log_lines = []

    def log(line):
        # Append and return the full log so error paths can return it directly.
        log_lines.append(line)
        return "\n".join(log_lines)

    import shutil
    infer_root = tempfile.mkdtemp(prefix="f5svc_")
    try:
        return _run_inference_inner(
            source_audio, ref_audio, spk_source, opensinger_singer,
            custom_spk, steps, solver, ref_sec, f0_shift,
            infer_root, log, log_lines,
        )
    finally:
        shutil.rmtree(infer_root, ignore_errors=True)


def _run_inference_inner(source_audio, ref_audio, spk_source, opensinger_singer,
                         custom_spk, steps, solver, ref_sec, f0_shift,
                         infer_root, log, log_lines):
    """Heavy lifting behind run_inference: features → model → ODE → Vocos."""
    dev = _device()
    log(f"Device: {dev}")

    # Features
    log("Extracting source features...")
    file_id = re.sub(r"[^\w\-.]", "_", os.path.splitext(os.path.basename(source_audio))[0])
    try:
        _extract(source_audio, file_id, log, infer_root)
    except Exception as e:
        return None, log(f"Feature extraction failed: {e}")

    # Model
    log("Loading model...")
    try:
        model = _load_model(dev)
    except Exception as e:
        return None, log(f"Model load failed: {e}")

    # Speaker embedding (identity conditioning)
    try:
        if custom_spk is not None:
            spk_arr = np.load(custom_spk)
            log("Using uploaded speaker embedding.")
        elif spk_source == "obama (example)":
            spk_arr = np.load(os.path.join(PROJECT_ROOT, "examples", "obama.spk.npy"))
            log("Using Obama example speaker.")
        elif spk_source == "opensinger (select below)" and opensinger_singer:
            spk_path = os.path.join(PROJECT_ROOT, "examples", "opensinger",
                                    f"{opensinger_singer}.spk.npy")
            # FIX: fail with a readable message instead of surfacing the raw
            # np.load traceback when the embedding file is missing.
            if not os.path.isfile(spk_path):
                return None, log(f"Speaker embedding not found: {spk_path}")
            spk_arr = np.load(spk_path)
            log(f"Using OpenSinger speaker: {opensinger_singer}")
        else:
            spk_arr = np.zeros(256, dtype=np.float32)
            log("Warning: no speaker — using zeros.")
        spk = torch.tensor(spk_arr).float().unsqueeze(0).to(dev)
    except Exception as e:
        return None, log(f"Speaker load failed: {e}")

    # Sequence assembly: [reference frames | source frames]
    try:
        from svc_data.mel_svc_dataset import SAMPLE_RATE as SR, HOP_LENGTH, N_MELS, _build_mel_transform

        source_mel, ppg, hubert, f0 = _load_tensors(file_id, float(f0_shift), dev, infer_root)
        T = source_mel.shape[1]
        log(f"Source: {T} frames ({T * HOP_LENGTH / SR:.1f}s)")

        # Reference mel — OpenSinger singers run without ref (embedding-only conditioning)
        ref_frames = 0
        ref_mel_raw = None
        rpath = ref_audio or (
            os.path.join(PROJECT_ROOT, "examples", "obama_ref.wav")
            if spk_source == "obama (example)" else None
        )
        if rpath and os.path.isfile(rpath):
            arr, sr = sf.read(rpath, always_2d=True)
            arr = arr.mean(axis=1).astype(np.float32)
            if sr != SR:
                arr = TAF.resample(torch.tensor(arr).unsqueeze(0), sr, SR).squeeze(0).numpy()
            ref_mel_raw = torch.log(
                _build_mel_transform()(torch.tensor(arr).unsqueeze(0)).clamp(min=1e-5)
            ).squeeze(0).T
            ref_frames = min(int(float(ref_sec) * SR / HOP_LENGTH), ref_mel_raw.shape[0])
            log(f"Reference: {ref_frames} frames ({ref_frames * HOP_LENGTH / SR:.1f}s)")

        T_total = ref_frames + T
        cond_mel = torch.zeros(1, T_total, N_MELS, device=dev)
        if ref_frames > 0:
            cond_mel[0, :ref_frames] = ref_mel_raw[:ref_frames].to(dev)

        def _pad(_ignored, d):
            # Zero padding to prepend in front of each conditioning stream.
            return torch.zeros(1, ref_frames, d, device=dev)

        if ref_frames > 0:
            ppg_full = torch.cat([_pad(ppg, ppg.shape[2]), ppg], dim=1)
            hubert_full = torch.cat([_pad(hubert, hubert.shape[2]), hubert], dim=1)
            f0_full = torch.cat([_pad(f0, f0.shape[2]), f0], dim=1)
            inpaint_mask = torch.zeros(1, T_total, dtype=torch.bool, device=dev)
            inpaint_mask[0, :ref_frames] = True
        else:
            ppg_full, hubert_full, f0_full = ppg, hubert, f0
            inpaint_mask = None

    except Exception as e:
        import traceback
        return None, log(f"Sequence build failed: {e}\n{traceback.format_exc()}")

    # ODE sampling
    log(f"Sampling ({solver.upper()}, {steps} steps)...")
    try:
        from infer_f5_svc import ode_sample
        pred_mel = ode_sample(
            model=model, ref_mel=cond_mel,
            ppg=ppg_full, hubert=hubert_full, f0=f0_full, spk=spk,
            inpaint_mask=inpaint_mask, steps=int(steps),
            method=solver, inpaint_mode="none", device=dev,
        )
        if ref_frames > 0:
            # Drop the reference prefix; only the converted source remains.
            pred_mel = pred_mel[:, ref_frames:]
        log(f"Output: {pred_mel.shape[1]} frames ({pred_mel.shape[1] * HOP_LENGTH / SR:.1f}s)")
    except Exception as e:
        import traceback
        return None, log(f"ODE failed: {e}\n{traceback.format_exc()}")

    # Vocos decode (log-mel → waveform)
    log("Decoding with Vocos...")
    try:
        from vocos import Vocos
        vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(dev)
        vocos.eval()
        with torch.no_grad():
            wav_out = vocos.decode(pred_mel.transpose(1, 2)).squeeze(0).cpu().numpy()
        out = os.path.join(tempfile.gettempdir(), f"f5svc_{int(time.time())}.wav")
        sf.write(out, wav_out, SR)
        log(f"Done — {len(wav_out) / SR:.1f}s")
        return out, "\n".join(log_lines)
    except Exception as e:
        import traceback
        return None, log(f"Vocos decode failed: {e}\n{traceback.format_exc()}")


# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────────────────────────────────────

with gr.Blocks(title="F5-SVC Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # F5-SVC — Singing Voice Conversion
        Upload singing audio and convert it to a target speaker's voice.
        Powered by [F5-TTS](https://github.com/SWivid/F5-TTS) backbone with LoRA adaptation.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            source_audio = gr.Audio(label="Source Singing (upload .wav)", type="filepath")
            spk_radio = gr.Radio(
                choices=["obama (example)", "opensinger (select below)", "custom (upload below)"],
                value="obama (example)",
                label="Target Speaker",
            )
            opensinger_dd = gr.Dropdown(
                choices=OPENSINGER_SINGERS,
                value=OPENSINGER_SINGERS[0] if OPENSINGER_SINGERS else None,
                label="OpenSinger Singer",
                visible=False,
            )
            ref_audio = gr.Audio(
                label="Reference Audio (leave blank to use obama example)",
                type="filepath",
            )
            custom_spk = gr.File(
                label="Custom Speaker Embedding (.spk.npy)",
                file_types=[".npy"],
                visible=False,
            )
            with gr.Row():
                solver_dd = gr.Dropdown(["euler", "heun", "rk4"], value="heun", label="Solver")
                steps_sl = gr.Slider(4, 64, step=1, value=32, label="Steps")
            with gr.Row():
                ref_sec_sl = gr.Slider(1.0, 8.0, step=0.5, value=3.0, label="Ref Seconds")
                f0_sl = gr.Slider(-12, 12, step=1, value=0, label="F0 Shift (semitones)")
            convert_btn = gr.Button("Convert", variant="primary")

        with gr.Column(scale=1):
            audio_out = gr.Audio(label="Converted Audio", type="filepath")
            log_box = gr.Textbox(label="Log", lines=16, interactive=False)

    def _update_spk_visibility(choice):
        # Show only the input widget relevant to the chosen speaker mode.
        return (
            gr.update(visible=(choice == "opensinger (select below)")),
            gr.update(visible=(choice == "custom (upload below)")),
        )

    spk_radio.change(
        fn=_update_spk_visibility,
        inputs=[spk_radio],
        outputs=[opensinger_dd, custom_spk],
    )

    convert_btn.click(
        fn=run_inference,
        inputs=[source_audio, ref_audio, spk_radio, opensinger_dd, custom_spk,
                steps_sl, solver_dd, ref_sec_sl, f0_sl],
        outputs=[audio_out, log_box],
    )

    gr.Markdown("---\n**Vocoder**: Vocos 24kHz  |  **Solver**: Heun (2 NFE/step)")

if __name__ == "__main__":
    demo.queue().launch()
0000000000000000000000000000000000000000..6eb2af050447968cc32481fcfe67b5a4c6cdc69e --- /dev/null +++ b/hubert/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Benjamin van Niekerk + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
"""HuBERT-Soft model definition (hubert/hubert_model.py).

Attribute names are frozen: released checkpoints are loaded with
``load_state_dict`` keyed on these exact names.
"""
import copy
import random
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as t_func
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present


class Hubert(nn.Module):
    """HuBERT backbone: conv feature extractor → projection → transformer."""

    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
        super().__init__()
        self._mask = mask
        self.feature_extractor = FeatureExtractor()
        self.feature_projection = FeatureProjection()
        self.positional_embedding = PositionalConvEmbedding()
        self.norm = nn.LayerNorm(768)
        self.dropout = nn.Dropout(0.1)
        self.encoder = TransformerEncoder(
            nn.TransformerEncoderLayer(
                768, 12, 3072, activation="gelu", batch_first=True
            ),
            12,
        )
        self.proj = nn.Linear(768, 256)

        self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
        self.label_embedding = nn.Embedding(num_label_embeddings, 256)

    def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # SpecAugment-style time masking — active only while training.
        mask = None
        if self.training and self._mask:
            mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
            x[mask] = self.masked_spec_embed.to(x.dtype)
        return x, mask

    def encode(
        self, x: torch.Tensor, layer: Optional[int] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode raw audio (B, 1, samples) up to *layer*; returns (x, mask)."""
        x = self.feature_extractor(x)
        x = self.feature_projection(x.transpose(1, 2))
        x, mask = self.mask(x)
        x = x + self.positional_embedding(x)
        x = self.dropout(self.norm(x))
        x = self.encoder(x, output_layer=layer)
        return x, mask

    def logits(self, x: torch.Tensor) -> torch.Tensor:
        # Cosine similarity against label embeddings, temperature 0.1.
        logits = torch.cosine_similarity(
            x.unsqueeze(2),
            self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
            dim=-1,
        )
        return logits / 0.1

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        x, mask = self.encode(x)
        x = self.proj(x)
        return self.logits(x), mask


class HubertSoft(Hubert):
    """HuBERT-Soft: exposes continuous unit extraction for inference."""

    def __init__(self):
        super().__init__()

    @torch.inference_mode()
    def units(self, wav: torch.Tensor) -> torch.Tensor:
        """Return soft units (B, T, 256) for raw audio (B, 1, samples)."""
        # Pad so the conv stack (receptive field 400, hop 320) stays centered.
        wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
        x, _ = self.encode(wav)
        return self.proj(x)


class FeatureExtractor(nn.Module):
    """Seven-layer 1-D conv stack: raw waveform → (B, 512, frames)."""

    def __init__(self):
        super().__init__()
        self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
        self.norm0 = nn.GroupNorm(512, 512)
        self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
        self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
        self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Only the first layer is group-normalized; the rest are conv + GELU.
        x = t_func.gelu(self.norm0(self.conv0(x)))
        for conv in (self.conv1, self.conv2, self.conv3,
                     self.conv4, self.conv5, self.conv6):
            x = t_func.gelu(conv(x))
        return x


class FeatureProjection(nn.Module):
    """LayerNorm → Linear(512 → 768) → Dropout bridge into the transformer."""

    def __init__(self):
        super().__init__()
        self.norm = nn.LayerNorm(512)
        self.projection = nn.Linear(512, 768)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.dropout(self.projection(self.norm(x)))


class PositionalConvEmbedding(nn.Module):
    """Grouped conv positional embedding over the 768-dim features."""

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv1d(
            768,
            768,
            kernel_size=128,
            padding=128 // 2,
            groups=16,
        )
        # NOTE(review): nn.utils.weight_norm is deprecated in recent torch,
        # but the parametrizations variant changes state-dict keys — kept
        # as-is for checkpoint compatibility.
        self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.conv(x.transpose(1, 2))
        # Even kernel + symmetric padding yields one extra frame; drop it.
        x = t_func.gelu(x[:, :, :-1])
        return x.transpose(1, 2)


class TransformerEncoder(nn.Module):
    """Stack of deep-copied encoder layers with optional early exit."""

    def __init__(
        self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
    ) -> None:
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
        )
        self.num_layers = num_layers

    def forward(
        self,
        src: torch.Tensor,
        mask: torch.Tensor = None,
        src_key_padding_mask: torch.Tensor = None,
        output_layer: Optional[int] = None,
    ) -> torch.Tensor:
        # output_layer=None runs every layer; an int truncates the stack.
        output = src
        for layer in self.layers[:output_layer]:
            output = layer(
                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
            )
        return output


def _compute_mask(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    device: torch.device,
    min_masks: int = 0,
) -> torch.Tensor:
    """Sample a SpecAugment-style boolean time mask of the given *shape*.

    Spans of *mask_length* are placed at random offsets so that roughly
    ``mask_prob`` of each sequence is masked (at least *min_masks* spans).
    """
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
        )

    # Number of masked spans per sequence; random.random() dithers rounding.
    num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
    num_masked_spans = max(num_masked_spans, min_masks)

    # Never mask more frames than the sequence holds.
    if num_masked_spans * mask_length > sequence_length:
        num_masked_spans = sequence_length // mask_length

    mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)

    # Uniform over valid start offsets (span must fit inside the sequence).
    uniform_dist = torch.ones(
        (batch_size, sequence_length - (mask_length - 1)), device=device
    )
    mask_indices = torch.multinomial(uniform_dist, num_masked_spans)

    # Expand each start index into a full span of consecutive positions.
    mask_indices = (
        mask_indices.unsqueeze(dim=-1)
        .expand((batch_size, num_masked_spans, mask_length))
        .reshape(batch_size, num_masked_spans * mask_length)
    )
    offsets = (
        torch.arange(mask_length, device=device)[None, None, :]
        .expand((batch_size, num_masked_spans, mask_length))
        .reshape(batch_size, num_masked_spans * mask_length)
    )
    mask_idxs = mask_indices + offsets

    # Scatter span positions into the boolean mask.
    mask = mask.scatter(1, mask_idxs, True)

    return mask


def hubert_soft(
    path: str,
) -> HubertSoft:
    r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
    Args:
        path (str): path of a pretrained model
    """
    hubert = HubertSoft()
    checkpoint = torch.load(path)
    consume_prefix_in_state_dict_if_present(checkpoint, "module.")
    hubert.load_state_dict(checkpoint)
    hubert.eval()
    return hubert
# ---- hubert/inference.py : command-line HuBERT-Soft unit extraction ----
# NOTE(review): reconstructed header — these imports mirror the originals
# at the top of hubert/inference.py.
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import argparse
import numpy as np
import torch

from whisper.audio import load_audio
from hubert import hubert_model

# 20 seconds of 16 kHz audio per inference chunk.
_CHUNK_SAMPLES = 20 * 16000


def load_model(path, device):
    """Load HuBERT-Soft weights from *path*; fp16 on CUDA, fp32 elsewhere."""
    model = hubert_model.hubert_soft(path)
    model.eval()
    if device == "cuda":
        model.half()
    model.to(device)
    return model


def _units_for_chunk(model, chunk, device):
    """Encode one mono numpy chunk → (T, 256) float32 units (hop = 320)."""
    feats = torch.from_numpy(chunk).to(device)[None, None, :]
    feats = feats.half() if device == "cuda" else feats.float()
    with torch.no_grad():
        return model.units(feats).squeeze().data.cpu().float().numpy()


def pred_vec(model, wavPath, vecPath, device):
    """Extract HuBERT-Soft units for *wavPath* and save them to *vecPath*.

    Audio is processed in 20 s chunks to bound memory use.  FIX: the
    original duplicated the chunk-encoding code for the final partial
    chunk; both paths now share one helper.
    """
    audio = load_audio(wavPath)
    audln = audio.shape[0]
    vec_a = []
    idx_s = 0
    while idx_s + _CHUNK_SAMPLES < audln:
        vec_a.extend(_units_for_chunk(model, audio[idx_s:idx_s + _CHUNK_SAMPLES], device))
        idx_s += _CHUNK_SAMPLES
    if idx_s < audln:
        vec_a.extend(_units_for_chunk(model, audio[idx_s:audln], device))
    np.save(vecPath, vec_a, allow_pickle=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.description = 'please enter embed parameter ...'
    parser.add_argument("-w", "--wav", help="wav", dest="wav")
    parser.add_argument("-v", "--vec", help="vec", dest="vec")
    args = parser.parse_args()
    print(args.wav)
    print(args.vec)

    wavPath = args.wav
    vecPath = args.vec

    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    hubert = load_model(os.path.join(
        "hubert_pretrain", "hubert-soft-0d54a1f4.pt"), device)
    pred_vec(hubert, wavPath, vecPath, device)


# ---- import_opensinger.py : map OpenSinger wavs into dataset_raw layout ----
import shutil
import json
from tqdm import tqdm


def import_dataset(input_dir, output_dir, use_symlink=True):
    """Import pre-segmented OpenSinger wavs into per-speaker folders.

    Singers are grouped as ``<ManRaw|WomanRaw>_<id>`` across songs, mapped
    to ASCII-safe ``singer_NNNN`` names, and linked (or copied) into
    *output_dir*.  The raw→safe mapping is written to speaker_mapping.json.
    """
    wavs = []
    print("Scanning directory for .wav files (this is much faster than glob)...")
    for root, dirs, files in os.walk(input_dir):
        for f in files:
            if f.endswith(".wav"):
                wavs.append(os.path.join(root, f))

    if not wavs:
        print(f"No .wav files found in {input_dir}.")
        return

    print(f"Found {len(wavs)} pre-segmented .wav files. Preparing to import and sanitize...")

    speaker_mapping = {}
    speaker_counter = 1

    total_clips = 0
    os.makedirs(output_dir, exist_ok=True)

    for w in tqdm(wavs):
        # OpenSinger actual structure: ManRaw/0_SongName/0_SongName_1.wav or WomanRaw/10_SongName/...
        folder_name = os.path.basename(os.path.dirname(w))
        parent_folder = os.path.basename(os.path.dirname(os.path.dirname(w)))

        # Group by "ManRaw_0", "WomanRaw_10" etc., to keep one identity per singer.
        if "_" in folder_name:
            singer_id = folder_name.split("_")[0]
            raw_speaker = f"{parent_folder}_{singer_id}"
        else:
            raw_speaker = folder_name

        if not raw_speaker or raw_speaker == os.path.basename(input_dir):
            raw_speaker = "singer_00"

        # Map to safe ASCII identities.
        if raw_speaker not in speaker_mapping:
            speaker_mapping[raw_speaker] = f"singer_{speaker_counter:04d}"
            speaker_counter += 1

        safe_speaker = speaker_mapping[raw_speaker]
        out_d = os.path.join(output_dir, safe_speaker)
        os.makedirs(out_d, exist_ok=True)

        # FIX: strip only the real extension (splitext) instead of replacing
        # the substring ".wav" anywhere in the name, then sanitize to ASCII.
        raw_base = os.path.splitext(os.path.basename(w))[0]
        safe_base = "".join(c if c.isalnum() else "_" for c in raw_base)

        out_filename = os.path.join(out_d, f"{safe_base}.wav")

        try:
            created = False
            if use_symlink:
                if not os.path.exists(out_filename):
                    os.symlink(os.path.abspath(w), out_filename)
                    created = True
            else:
                shutil.copy2(w, out_filename)
                created = True
            # FIX: only count files actually created; pre-existing symlinks
            # were previously counted as freshly transferred clips.
            if created:
                total_clips += 1
        except Exception as e:
            print(f"Failed to copy/link {w}: {e}")

    mapping_file = os.path.join(output_dir, "speaker_mapping.json")
    with open(mapping_file, "w", encoding="utf-8") as f:
        json.dump(speaker_mapping, f, ensure_ascii=False, indent=4)

    print(f"\nImport complete! Safely transferred {total_clips} pre-segmented clips.")
    print(f"Saved Chinese-to-ASCII speaker mapping to {mapping_file}")
    print(f"Check results in {output_dir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Cleanly import pre-segmented massive dataset wavs.")
    parser.add_argument("--input_dir", type=str, default="./opensinger", help="Folder containing raw dataset")
    parser.add_argument("--output_dir", type=str, default="./dataset_raw", help="Folder mapping where slices go for train prep")
    parser.add_argument("--no_symlink", action="store_true", help="Copy files instead of symlinking")
    args = parser.parse_args()

    import_dataset(args.input_dir, args.output_dir, use_symlink=not args.no_symlink)


# ---- infer.py : top-of-file imports (infer_pipeline follows this block) ----
from models.cond_encoder import CondEncoder
from models.codec_wrapper import CodecWrapper
from models.cfm import DiT
from samplers.ode import ODESampler
@torch.no_grad()
def infer_pipeline(wave_path=None, epoch=None, target_spk=None, hubert_index_path=None, hubert_blend=0.75, use_ema=True):
    """Run chunked CFM-SVC inference and write ``output_sample.wav``.

    Args:
        wave_path: source wav whose duration sets the latent frame count
            (conditioning features are read from ``data_svc_infer/``).
        epoch: checkpoint epoch to load; ``None`` loads the ``*_final`` files.
        target_spk: optional path to a target speaker ``.npy`` embedding.
        hubert_index_path: optional FAISS index for HuBERT retrieval blending.
        hubert_blend: 0.0 = pure source HuBERT, 1.0 = pure retrieved target.
        use_ema: prefer EMA-averaged weights when they exist.

    FIX vs. original: the repeated mid-function ``import os / math /
    numpy / soundfile`` statements are hoisted to one place; a no-op
    f-string prefix was removed.  Logic is otherwise unchanged.
    """
    import os
    import math
    import numpy as np
    import soundfile as sf

    if torch.cuda.is_available():
        device = torch.device('cuda')
    elif torch.backends.mps.is_available():
        device = torch.device('mps')
    else:
        device = torch.device('cpu')
    print(f"Using device {device}")

    # 1. Load trained models
    codec_wrapper = CodecWrapper(latent_dim=1024).to(device)
    cond_enc = CondEncoder(ppg_dim=1280, hubert_dim=256, f0_dim=1, spk_dim=256, cond_out_dim=1024).to(device)
    dit = DiT(in_channels=1024, cond_dim=1024, hidden_dim=512, depth=8).to(device)

    # Resolve checkpoint paths (prefer EMA weights when --ema is set)
    if epoch is not None:
        dit_path = f"chkpt_cfm/dit_epoch_{epoch}.pt"
        cond_path = f"chkpt_cfm/cond_encoder_epoch_{epoch}.pt"
        proj_path = f"chkpt_cfm/projector_epoch_{epoch}.pt"
        ema_dit_path = f"chkpt_cfm/ema_dit_epoch_{epoch}.pt"
        ema_cond_path = f"chkpt_cfm/ema_cond_encoder_epoch_{epoch}.pt"
        ema_proj_path = f"chkpt_cfm/ema_projector_epoch_{epoch}.pt"
    else:
        dit_path = "chkpt_cfm/dit_final.pt"
        cond_path = "chkpt_cfm/cond_encoder_final.pt"
        proj_path = "chkpt_cfm/projector_final.pt"
        ema_dit_path = "chkpt_cfm/ema_dit_final.pt"
        ema_cond_path = "chkpt_cfm/ema_cond_encoder_final.pt"
        ema_proj_path = "chkpt_cfm/ema_projector_final.pt"

    # Select which weights to load
    if use_ema and os.path.exists(ema_dit_path):
        load_dit, load_cond, load_proj = ema_dit_path, ema_cond_path, ema_proj_path
        print(f"Loading EMA checkpoints from {ema_dit_path}...")
    elif os.path.exists(dit_path):
        load_dit, load_cond, load_proj = dit_path, cond_path, proj_path
        if use_ema:
            print("WARNING: EMA checkpoints not found, falling back to regular weights.")
        print(f"Loading trained checkpoints from {dit_path}...")
    else:
        load_dit = None
        print(f"WARNING: Checkpoints not found at {dit_path}. Using untrained weights! Output will be noisy.")

    if load_dit is not None:
        # Strip the `_orig_mod.` prefix that torch.compile adds to state dicts.
        def clean_sd(sd):
            return {k.replace('_orig_mod.', ''): v for k, v in sd.items()}

        dit.load_state_dict(clean_sd(torch.load(load_dit, map_location=device, weights_only=True)))
        cond_enc.load_state_dict(clean_sd(torch.load(load_cond, map_location=device, weights_only=True)))
        codec_wrapper.projector.load_state_dict(clean_sd(torch.load(load_proj, map_location=device, weights_only=True)))

    # Load latent normalization statistics (identity transform when missing)
    if os.path.exists("chkpt_cfm/latent_norm.pt"):
        norm_data = torch.load("chkpt_cfm/latent_norm.pt", map_location=device, weights_only=True)
        z_mean = norm_data['mean'].to(device)
        z_std = norm_data['std'].to(device)
    else:
        z_mean = torch.zeros(1024).to(device)
        z_std = torch.ones(1024).to(device)

    codec_wrapper.eval()
    cond_enc.eval()
    dit.eval()

    # Optional FAISS index for HuBERT retrieval blending
    hubert_index = None
    target_hubert_vectors = None
    if hubert_index_path and os.path.exists(hubert_index_path):
        import faiss
        print(f"Loading target FAISS index from {hubert_index_path} with blend ratio {hubert_blend}...")
        hubert_index = faiss.read_index(hubert_index_path)
        vectors_path = hubert_index_path.replace(".index", "_vectors.npy")
        if os.path.exists(vectors_path):
            target_hubert_vectors = np.load(vectors_path)
            print(f"Loaded {target_hubert_vectors.shape[0]} target vectors for real-time inference retrieval.")
        else:
            print(f"WARNING: Source vectors {vectors_path} missing. FAISS disabled.")
            hubert_index = None

    total_T_latent = 200
    if wave_path:
        try:
            wav_data, sr = sf.read(wave_path)
            # 44.1 kHz codec with hop 512 → latent frame count.
            total_T_latent = math.ceil(len(wav_data) / sr * 44100 / 512)
            print(f"Loaded {wave_path}, calculating total T_latent={total_T_latent}")
        except Exception as e:
            print(f"Could not load wave: {e}")
            total_T_latent = 200

    # Chunking: ~400 frames per chunk with ~1.1 s crossfaded overlap
    max_frames = 400
    overlap_frames = 50
    step_frames = max_frames - overlap_frames

    final_audio = None

    print("Starting chunked inference pipeline (Heun, Overlap-Add)...")

    file_id = os.path.basename(wave_path).replace('.wav', '') if wave_path else "mock"

    for start_idx in range(0, total_T_latent, step_frames):
        T_latent = min(max_frames, total_T_latent - start_idx)
        print(f"--- Processing chunk from frame {start_idx} to {start_idx + T_latent} (Length: {T_latent}) ---")

        # 2. Extract conditioning for this chunk's time window
        time_start = start_idx * 512 / 44100.0
        time_end = (start_idx + T_latent) * 512 / 44100.0

        try:
            ppg_full = np.load(f"data_svc_infer/whisper/speaker0/{file_id}.ppg.npy")
            hubert_full = np.load(f"data_svc_infer/hubert/speaker0/{file_id}.vec.npy")
            f0_full = np.load(f"data_svc_infer/pitch/speaker0/{file_id}.pit.npy")

            if target_spk is not None:
                spk_full = np.load(target_spk)
            else:
                spk_full = np.load(f"data_svc_infer/speaker/speaker0/{file_id}.spk.npy")

            # Assumes PPG/HuBERT run at 50 fps and F0 at 100 fps — TODO confirm.
            ppg_start, ppg_end = int(time_start * 50), int(time_end * 50)
            hubert_start, hubert_end = int(time_start * 50), int(time_end * 50)
            f0_start, f0_end = int(time_start * 100), int(time_end * 100)

            ppg = torch.tensor(ppg_full[max(0, ppg_start): max(1, ppg_end)]).float().unsqueeze(0).to(device)
            hubert = torch.tensor(hubert_full[max(0, hubert_start): max(1, hubert_end)]).float().unsqueeze(0).to(device)

            f0_raw = torch.tensor(f0_full[max(0, f0_start): max(1, f0_end)]).float()
            f0 = torch.where(f0_raw > 0, torch.log(f0_raw.clamp(min=1.0)), torch.zeros_like(f0_raw)).unsqueeze(-1).unsqueeze(0).to(device)

            spk = torch.tensor(spk_full).float().unsqueeze(0).to(device)

            # Failsafe for empty feature slices
            if ppg.shape[1] == 0:
                ppg = torch.randn(1, max(1, T_latent // 2), 1280).to(device)
            if hubert.shape[1] == 0:
                hubert = torch.randn(1, max(1, T_latent // 2), 256).to(device)
            if f0.shape[1] == 0:
                f0 = torch.randn(1, T_latent, 1).to(device)

            # --- FAISS HUBERT BLEND ---
            if hubert_index is not None and target_hubert_vectors is not None:
                source_hubert_np = hubert.squeeze(0).cpu().numpy().astype(np.float32)  # (T, 256)
                # Search FAISS index top k=4 and average the neighbours.
                _, I = hubert_index.search(source_hubert_np, 4)
                nn_hubert = target_hubert_vectors[I].mean(axis=1)  # (T, 256)
                # Soft blend between retrieved target units and source units.
                blended_hubert = hubert_blend * nn_hubert + (1.0 - hubert_blend) * source_hubert_np
                hubert = torch.tensor(blended_hubert).float().unsqueeze(0).to(device)

        except FileNotFoundError:
            # No extracted features on disk: noise conditioning (debug only).
            ppg = torch.randn(1, max(1, T_latent // 2), 1280).to(device)
            hubert = torch.randn(1, max(1, T_latent // 2), 256).to(device)
            f0 = torch.randn(1, T_latent, 1).to(device)
            spk = torch.randn(1, 256).to(device)

        c = cond_enc(ppg, hubert, f0, spk, target_seq_len=T_latent)

        # 3. Sample latents via the ODE solver
        sampler = ODESampler(dit, steps=12, solver='heun')
        z_noise = torch.randn(1, T_latent, 1024).to(device)
        u_hat = sampler.sample(z_noise, c)  # (1, T_latent, 1024)

        # 4. Project (expecting normalized input to match training)
        u_hat_transposed = u_hat.transpose(1, 2)
        z_hat_norm = codec_wrapper.forward_project(u_hat_transposed)  # (1, 1024, T_latent)

        # 5. Invert normalization (back to raw latent space before decoding)
        z_hat_norm_transposed = z_hat_norm.transpose(1, 2)
        z_hat_denorm = (z_hat_norm_transposed * z_std) + z_mean
        z_hat = z_hat_denorm.transpose(1, 2)

        # 6. Decode
        wav_chunk = codec_wrapper.decode(z_hat).cpu().squeeze().numpy()

        # Overlap-add with a linear crossfade between consecutive chunks
        if final_audio is None:
            final_audio = wav_chunk
        else:
            overlap_samples = overlap_frames * 512
            if len(wav_chunk) >= overlap_samples and len(final_audio) >= overlap_samples:
                fade_in = np.linspace(0, 1, overlap_samples)
                fade_out = 1 - fade_in
                final_audio[-overlap_samples:] = final_audio[-overlap_samples:] * fade_out + wav_chunk[:overlap_samples] * fade_in
                final_audio = np.concatenate([final_audio, wav_chunk[overlap_samples:]])
            else:
                final_audio = np.concatenate([final_audio, wav_chunk])

        if T_latent < max_frames:
            break

    wav_out = final_audio
    print(f"Inference complete! Final output waveform shape: {wav_out.shape}")

    out_file = "output_sample.wav"
    sf.write(out_file, wav_out, 44100)
    print(f"Saved output to {out_file}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--wave", type=str, default=None, help="Path to input wave file")
    parser.add_argument("--epoch", type=int, default=None, help="Epoch number to load checkpoints from (e.g., 30)")
    parser.add_argument("--target_spk", type=str, default=None, help="Path to target speaker .npy array for voice conversion")
    parser.add_argument("--hubert_index", type=str, default=None, help="Path to FAISS .index file for HuBERT retrieval")
    parser.add_argument("--hubert_blend", type=float, default=0.75, help="Blend ratio for HuBERT retrieval (0.0=source, 1.0=target)")
    parser.add_argument("--ema", action="store_true", default=True, help="Use EMA-averaged weights (default: True, use --no-ema to disable)")
    parser.add_argument("--no-ema", dest="ema", action="store_false", help="Use raw training weights instead of EMA")
    args = parser.parse_args()

    infer_pipeline(args.wave, args.epoch, args.target_spk, args.hubert_index, args.hubert_blend, use_ema=args.ema)
@torch.no_grad()
def ode_sample(
    model: torch.nn.Module,
    ref_mel: torch.Tensor,   # (1, T, N_MELS) clean mel in ref region, zeros elsewhere
    ppg: torch.Tensor,       # (1, T, 1280)
    hubert: torch.Tensor,    # (1, T, 256)
    f0: torch.Tensor,        # (1, T, 1)
    spk: torch.Tensor,       # (1, 256)
    inpaint_mask: torch.Tensor | None = None,  # (1, T) boolean mask where True = keep clean ref_mel
    steps: int = 32,
    method: str = "euler",   # "euler", "heun", or "rk4"
    inpaint_mode: str = "none",  # "none", "soft", or "hard"
    device: torch.device = torch.device("cpu"),
) -> torch.Tensor:  # (1, T, N_MELS)
    """ODE solver for rectified flow.

    Integrates dx/dt = v_theta(x, t, cond) from t=0 (noise) to t=1 (data)
    with `steps` fixed-size steps of the chosen solver.

    Inpaint modes:
      - 'none': No per-step restore. The model uses cond=ref_mel for timbre.
                Ref region is initialized clean at t=0 only.
      - 'soft': Fading alpha blend: alpha = 1 - t (fully clean at t=0, free at t=1).
                Matches the training trajectory more closely.
      - 'hard': Snap ref region back to clean at every step (original F5-TTS style).
                Creates train/inference mismatch at the boundary.

    Returns the final state x at t=1 — the predicted log-mel, (1, T, N_MELS).
    """
    T = ref_mel.shape[1]
    # Start from pure Gaussian noise (the t=0 end of the rectified-flow path).
    x = torch.randn(1, T, N_MELS, device=device)

    # Initialize reference region as clean mel at t=0 (matches training)
    if inpaint_mask is not None:
        x[inpaint_mask] = ref_mel[inpaint_mask]

    dt = 1.0 / steps
    # All frames are valid during inference (no padding within the batch).
    model_mask = torch.ones(1, T, dtype=torch.bool, device=device)

    def get_v(t_val, x_val):
        # One network evaluation: velocity field at scalar time t_val.
        t_tensor = torch.full((1,), t_val, device=device)
        return model(
            x=x_val, cond=ref_mel,
            ppg=ppg, hubert=hubert, f0=f0, spk=spk,
            time=t_tensor, mask=model_mask,
        )

    def _restore(x_cur, step_idx):
        """Apply inpaint restore according to the selected mode."""
        if inpaint_mask is None or inpaint_mode == "none":
            return x_cur
        if inpaint_mode == "hard":
            x_cur[inpaint_mask] = ref_mel[inpaint_mask]
        elif inpaint_mode == "soft":
            # alpha fades from 1.0 (fully clean) at t=0 to 0.0 (free) at t=1
            alpha = 1.0 - (step_idx + 1) / steps
            x_cur[inpaint_mask] = (
                alpha * ref_mel[inpaint_mask] + (1 - alpha) * x_cur[inpaint_mask]
            )
        return x_cur

    for i in range(steps):
        t = i / steps

        if method == "euler":
            # 1 network evaluation per step.
            v = get_v(t, x)
            x = x + v * dt
        elif method == "heun":
            # 2 evaluations: trial Euler step, then average of endpoint slopes.
            v1 = get_v(t, x)
            x_test = _restore(x + v1 * dt, i)
            v2 = get_v(t + dt, x_test)
            x = x + 0.5 * (v1 + v2) * dt
        elif method == "rk4":
            # 4 evaluations: classic Runge-Kutta; intermediate states are
            # also passed through _restore so the inpaint region stays pinned.
            k1 = get_v(t, x)
            x2 = _restore(x + k1 * (dt / 2), i)
            k2 = get_v(t + dt/2, x2)
            x3 = _restore(x + k2 * (dt / 2), i)
            k3 = get_v(t + dt/2, x3)
            x4 = _restore(x + k3 * dt, i)
            k4 = get_v(t + dt, x4)
            x = x + (k1 + 2*k2 + 2*k3 + k4) * (dt / 6)
        else:
            raise ValueError(f"Unknown solver: {method}")

        x = _restore(x, i)

    return x
def load_features(source_wav: str, device: torch.device, feat_sr: float = 50.0, f0_shift: float = 0.0):
    """
    Attempt to load pre-extracted features co-located with the source wav.
    Falls back to zeros if not found (features should be extracted offline
    with the same tools used for training: Whisper, HuBERT, CREPE/pyin).

    Args:
        source_wav: path to the source .wav; its parent-dir name is used as
            the speaker name when locating features under ./data_svc/.
        device: device the returned tensors are moved to.
        feat_sr: NOTE(review): currently unused in this function — resampling
            is length-based via _resample_to; confirm before removing.
        f0_shift: pitch shift in semitones applied to raw F0 before log.

    Returns:
        (mel, ppg, hubert, f0) — each with a leading batch dim of 1, on `device`.
        f0 is log-F0 with 0.0 marking unvoiced frames.
    """
    base = os.path.splitext(source_wav)[0]
    # Speaker name is inferred from the directory containing the wav.
    spk_name = os.path.basename(os.path.dirname(source_wav))
    file_id = os.path.basename(base)

    def _try_npy(path, fallback_shape):
        # Load a .npy feature file; warn and return zeros when missing.
        if os.path.isfile(path):
            return torch.tensor(np.load(path)).float()
        print(f"Warning: {path} not found, using zeros")
        return torch.zeros(*fallback_shape)

    wav, sr = torchaudio.load(source_wav)
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)  # downmix stereo to mono
    if sr != SAMPLE_RATE:
        wav = torchaudio.functional.resample(wav, sr, SAMPLE_RATE)

    # Log-mel of the source (defines the target frame count t_mel).
    mel_tf = _build_mel_transform()
    mel = mel_tf(wav).squeeze(0).T
    mel = torch.log(mel.clamp(min=1e-5))
    t_mel = mel.shape[0]

    # Look for features in the standard data_svc directory structure
    ppg_path = f"./data_svc/whisper/{spk_name}/{file_id}.ppg.npy"
    hbt_path = f"./data_svc/hubert/{spk_name}/{file_id}.vec.npy"
    f0_path = f"./data_svc/pitch/{spk_name}/{file_id}.pit.npy"

    ppg = _try_npy(ppg_path, (t_mel, 1280))
    hubert = _try_npy(hbt_path, (t_mel, 256))
    f0_raw = _try_npy(f0_path, (t_mel,))

    # Semitone shift: multiply Hz by 2^(semitones/12).
    if f0_shift != 0.0:
        f0_raw = f0_raw * math.pow(2.0, f0_shift / 12.0)

    # Log-F0 for voiced frames; unvoiced (f0 <= 0) stays exactly 0.0.
    f0 = torch.where(f0_raw > 0,
                     torch.log(f0_raw.clamp(min=1.0)),
                     torch.zeros_like(f0_raw)).unsqueeze(-1)

    # Length-align every feature stream to the mel frame count.
    ppg = _resample_to(ppg, t_mel)
    hubert = _resample_to(hubert, t_mel)
    f0 = _resample_to(f0, t_mel)

    return (mel.unsqueeze(0).to(device),
            ppg.unsqueeze(0).to(device),
            hubert.unsqueeze(0).to(device),
            f0.unsqueeze(0).to(device))
def mel_to_audio(mel: torch.Tensor, device: torch.device) -> torch.Tensor:
    """Decode log-mel spectrogram to waveform via Vocos.

    Args:
        mel: (1, T, N_MELS) log-mel spectrogram
        device: device to run the vocoder on
    Returns:
        waveform tensor (samples,) or None if Vocos unavailable
    """
    try:
        from vocos import Vocos
    except ImportError:
        print("Vocos not installed (pip install vocos). Saving mel instead.")
        return None
    vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
    vocoder.eval()
    # Vocos.decode() expects log-mel in (B, N_MELS, T) layout
    # (same format as its feature_extractor output)
    waveform = vocoder.decode(mel.transpose(1, 2))
    return waveform.squeeze(0)
@torch.no_grad()
def infer(args):
    """End-to-end F5-SVC conversion: load checkpoints, build conditioning,
    run the ODE sampler, and decode/save the result. See module docstring."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load model: base F5-TTS weights first, then trained adapter+LoRA on top
    model = build_f5svc(
        f5tts_ckpt_path=args.f5tts_ckpt,
        lora_rank=args.lora_rank,
    ).to(device)

    # NOTE(review): weights_only=False unpickles arbitrary objects — only
    # load checkpoints you trust.
    ckpt = torch.load(args.ckpt, map_location=device, weights_only=False)
    is_stage2 = "stage2" in ckpt

    if is_stage2:
        # Two-step load: Stage 1 weights first, then Stage 2 on top
        if not args.stage1_ckpt:
            raise ValueError("Stage 2 checkpoint requires --stage1_ckpt to load Stage 1 weights first.")
        s1 = torch.load(args.stage1_ckpt, map_location=device, weights_only=False)
        s1_weights = s1.get("trainable", s1.get("model", s1))
        model.load_state_dict(s1_weights, strict=False)
        print(f"Loaded Stage 1 checkpoint: {args.stage1_ckpt}")

        # Inject A₂B₂ parameters before loading Stage 2 weights
        from models.lora_utils import inject_lora_stage2
        stage2_rank = ckpt.get("stage2_rank", 4)
        inject_lora_stage2(model.transformer, rank=stage2_rank)
        model.to(device)
        model.load_state_dict(ckpt["stage2"], strict=False)
        print(f"Loaded Stage 2 checkpoint: {args.ckpt} (rank={stage2_rank})")
    else:
        weights = ckpt.get("trainable", ckpt.get("model", ckpt))
        model.load_state_dict(weights, strict=False)
        print(f"Loaded Stage 1 checkpoint: {args.ckpt}")

    model.eval()

    # Source features (content: PPG / HuBERT / log-F0, plus the source mel
    # which is only used here for its frame count T).
    source_mel, ppg, hubert, f0 = load_features(args.source, device, f0_shift=args.f0_shift)
    T = source_mel.shape[1]

    # Target speaker embedding
    spk = torch.tensor(np.load(args.target_spk)).float().unsqueeze(0).to(device)  # (1, 256)

    # Target speaker reference mel — PREPEND before source features
    # Layout: [ref_region (clean timbre reference) | source_region (generated)]
    # This ensures the full source audio gets converted.
    ref_frames = 0
    if args.ref_audio and os.path.isfile(args.ref_audio):
        ref_wav, ref_sr = torchaudio.load(args.ref_audio)
        if ref_wav.shape[0] > 1:
            ref_wav = ref_wav.mean(0, keepdim=True)
        if ref_sr != SAMPLE_RATE:
            ref_wav = torchaudio.functional.resample(ref_wav, ref_sr, SAMPLE_RATE)
        mel_tf = _build_mel_transform()
        ref_mel_raw = torch.log(mel_tf(ref_wav).clamp(min=1e-5)).squeeze(0).T  # (T_ref, N_MELS)
        # Cap the reference at ref_sec seconds (or the clip length if shorter).
        ref_frames = min(int(args.ref_sec * SAMPLE_RATE / HOP_LENGTH), ref_mel_raw.shape[0])
        print(f"Reference: {ref_frames} frames ({ref_frames / (SAMPLE_RATE / HOP_LENGTH):.1f} sec)")

    T_total = ref_frames + T  # total sequence length

    # Build conditioning: [ref_mel | zeros]
    # During training, `ref_mel` target regions are masked with pure 0.0, NOT log-mel silence.
    cond_mel = torch.zeros(1, T_total, N_MELS, device=device)
    if ref_frames > 0:
        cond_mel[0, :ref_frames] = ref_mel_raw[:ref_frames].to(device)

    # Pad source features at the front with the reference audio's features
    # (If we use zeros, the model's self-attention receives out-of-distribution conditioning
    # for the reference region, causing artifacts/pitch glitches).
    if ref_frames > 0:
        try:
            _, ref_ppg, ref_hubert, ref_f0 = load_features(args.ref_audio, device)
            pad_ppg = ref_ppg[:, :ref_frames, :]
            pad_hbt = ref_hubert[:, :ref_frames, :]
            pad_f0 = ref_f0[:, :ref_frames, :]
            # If the loaded features are slightly shorter, pad the rest with zeros
            if pad_ppg.shape[1] < ref_frames:
                shortfall = ref_frames - pad_ppg.shape[1]
                pad_ppg = torch.cat([pad_ppg, torch.zeros(1, shortfall, 1280, device=device)], dim=1)
                pad_hbt = torch.cat([pad_hbt, torch.zeros(1, shortfall, 256, device=device)], dim=1)
                pad_f0 = torch.cat([pad_f0, torch.zeros(1, shortfall, 1, device=device)], dim=1)
        except Exception as e:
            # Best-effort: the ref features are optional; fall back to zeros.
            print(f"Warning: could not load features for ref_audio ({e}). Falling back to zeros.")
            pad_ppg = torch.zeros(1, ref_frames, ppg.shape[2], device=device)
            pad_hbt = torch.zeros(1, ref_frames, hubert.shape[2], device=device)
            pad_f0 = torch.zeros(1, ref_frames, f0.shape[2], device=device)
    else:
        pad_ppg = torch.zeros(1, 0, ppg.shape[2], device=device)
        pad_hbt = torch.zeros(1, 0, hubert.shape[2], device=device)
        pad_f0 = torch.zeros(1, 0, f0.shape[2], device=device)

    ppg_full = torch.cat([pad_ppg, ppg], dim=1)
    hubert_full = torch.cat([pad_hbt, hubert], dim=1)
    f0_full = torch.cat([pad_f0, f0], dim=1)

    # Optional "Burn-in" padding for zero-shot stability using MIRROR PADDING
    # Takes the first N frames of the actual source and mirrors them backward.
    # This gives the attention mechanism a continuous acoustic runway to stabilize
    # before the actual song starts, dissipating transition shocks.
    pad_frames = int(args.pad_start * SAMPLE_RATE / HOP_LENGTH)
    if pad_frames > 0:
        # Build reflected indices into the source: walk backwards from the
        # start, bouncing at both ends (even cycles forward, odd reversed).
        d = torch.arange(pad_frames, 0, -1, device=device)
        i = d - 1
        cycle = i // T
        rem = i % T
        pad_idx = torch.where(cycle % 2 == 0, rem, T - 1 - rem)

        pad_ppg = ppg[:, pad_idx, :]
        pad_hbt = hubert[:, pad_idx, :]
        pad_f0 = f0[:, pad_idx, :]

        # Insert the duplicated frames AFTER the ref region and BEFORE the source region
        ppg_full = torch.cat([ppg_full[:, :ref_frames], pad_ppg, ppg_full[:, ref_frames:]], dim=1)
        hubert_full = torch.cat([hubert_full[:, :ref_frames], pad_hbt, hubert_full[:, ref_frames:]], dim=1)
        f0_full = torch.cat([f0_full[:, :ref_frames], pad_f0, f0_full[:, ref_frames:]], dim=1)

        pad_cond = torch.zeros(1, pad_frames, N_MELS, device=device)
        cond_mel = torch.cat([cond_mel[:, :ref_frames], pad_cond, cond_mel[:, ref_frames:]], dim=1)

        T_total += pad_frames

    # Build inpaint mask (keep clean mel during generation)
    # The clean ref mel is located from `0` to `ref_frames`
    inpaint_mask = None
    if ref_frames > 0:
        inpaint_mask = torch.zeros(1, T_total, dtype=torch.bool, device=device)
        inpaint_mask[0, :ref_frames] = True

    # Sampling
    print(f"Sampling ({args.steps} {args.solver.upper()} steps, T_total={T_total} frames = "
          f"{ref_frames} ref + {pad_frames} pad + {T} source)...")
    pred_mel = ode_sample(model, cond_mel, ppg_full, hubert_full, f0_full, spk,
                          inpaint_mask=inpaint_mask,
                          steps=args.steps, method=args.solver,
                          inpaint_mode=args.inpaint, device=device)

    # Trim prepended reference region AND pad frames
    trim_frames = ref_frames + (pad_frames if pad_frames > 0 else 0)
    if trim_frames > 0:
        pred_mel = pred_mel[:, trim_frames:]
    print(f"Output: {pred_mel.shape[1]} frames ({pred_mel.shape[1] / (SAMPLE_RATE / HOP_LENGTH):.1f} sec)")

    # Decode; falls back to saving the raw mel when Vocos is unavailable.
    wav = mel_to_audio(pred_mel, device)  # pred_mel: (1, T, N_MELS)
    if wav is not None:
        torchaudio.save(args.output, wav.cpu().unsqueeze(0), SAMPLE_RATE)
        print(f"Saved: {args.output}")
    else:
        torch.save(pred_mel.cpu(), args.output.replace(".wav", "_mel.pt"))
        print(f"Saved mel: {args.output.replace('.wav', '_mel.pt')}")
if __name__ == "__main__":
    # CLI entry point — see the module docstring for usage examples.
    parser = argparse.ArgumentParser()
    parser.add_argument("--ckpt", type=str, required=True,
                        help="Stage 1 checkpoint, or Stage 2 speaker checkpoint (requires --stage1_ckpt)")
    parser.add_argument("--stage1_ckpt", type=str, default=None,
                        help="Stage 1 checkpoint required when --ckpt is a Stage 2 checkpoint")
    parser.add_argument("--f5tts_ckpt", type=str,
                        default="./chkpt_f5svc/model_1200000.safetensors",
                        help="Path to pretrained F5-TTS base checkpoint")
    parser.add_argument("--source", type=str, required=True,
                        help="Source singing .wav to convert")
    parser.add_argument("--target_spk", type=str, required=True,
                        help="Target speaker .spk.npy embedding")
    parser.add_argument("--ref_audio", type=str, default=None,
                        help="Short .wav of target speaker for timbre reference")
    parser.add_argument("--output", type=str, default="./converted.wav")
    parser.add_argument("--steps", type=int, default=32,
                        help="Number of ODE steps. Default 32.")
    parser.add_argument("--solver", type=str, default="heun", choices=["euler", "heun", "rk4"],
                        help="ODE solver. 'euler' is fast (1 NFE), 'heun' is better (2 NFE), 'rk4' is best but slow (4 NFE).")
    parser.add_argument("--ref_sec", type=float, default=3.0,
                        help="Seconds of ref_audio to use as timbre reference")
    parser.add_argument("--f0_shift", type=float, default=0.0,
                        help="Shift source F0 by N semitones (e.g. 12 for octave up)")
    parser.add_argument("--pad_start", type=float, default=0.0,
                        help="Seconds of mirror padding to prepend (can help stabilize early generation)")
    parser.add_argument("--inpaint", type=str, default="none",
                        choices=["none", "soft", "hard"],
                        help="Inpaint mode for ref region. 'none'=cond only (recommended), "
                             "'soft'=fading blend, 'hard'=snap every step.")
    parser.add_argument("--lora_rank", type=int, default=16)
    args = parser.parse_args()
    infer(args)
class CFMLoss(nn.Module):
    """Rectified-flow (conditional flow matching) loss plus an auxiliary
    projector loss that trains P(u) -> z_hat on stop-gradient predictions."""

    def __init__(self, lambda_proj=1.0):
        super().__init__()
        self.lambda_proj = lambda_proj
        self.mse_loss = nn.MSELoss()

    def forward(self, z_target, c, dit_model, projector_model, mask=None):
        """
        z_target: (B, T, D) - quantizer targets from frozen codec
        c: (B, T, cond_dim) - resampled conditioning features
        dit_model: v_theta(z_t, t, c)
        projector_model: P(u) -> z_hat
        mask: (B, T) - boolean or float mask where 1 is valid, 0 is padding

        Returns (total_loss, flow_loss, proj_loss).
        """
        device = z_target.device
        B, T, D = z_target.shape

        if mask is None:
            mask = torch.ones((B, T), device=device)

        # Sample the noise endpoint and a logit-normal timestep. Logit-normal
        # sampling (as in SD3 / Matcha-TTS) concentrates t around 0.3-0.7 so
        # the model must learn the mid-trajectory flow rather than shortcut
        # to predicting v ≈ z_target from conditioning alone.
        x0 = torch.randn_like(z_target)
        t = torch.sigmoid(torch.randn((B, 1), device=device))  # (B, 1) in [0, 1]
        t_bc = t.unsqueeze(-1).expand(-1, T, D)  # broadcast to (B, T, D)

        # Straight-line rectified-flow interpolant and its constant velocity.
        z_t = (1 - t_bc) * x0 + t_bc * z_target
        v_target = z_target - x0

        # DiT velocity prediction.
        v_pred = dit_model(z_t, t, c, mask=mask)

        # Masked mean over valid frames (guard against an all-padding batch).
        frame_mask = mask.unsqueeze(-1)  # (B, T, 1)
        valid_frames = frame_mask.sum()
        if valid_frames == 0:
            valid_frames = 1.0

        flow_loss = (((v_pred - v_target) ** 2) * frame_mask).sum() / (valid_frames * D)

        # One-step extrapolation to the implied data endpoint at t=1.
        z1_pred = z_t + (1 - t_bc) * v_pred

        # Projector loss on a detached copy: it must train the projector only.
        # Without detach(), the (1-t)-amplified gradients would flow back into
        # the DiT and cause gradient spikes at small t.
        z_hat = projector_model(z1_pred.detach().transpose(1, 2))
        proj_loss = (((z_hat.transpose(1, 2) - z_target) ** 2) * frame_mask).sum() / (valid_frames * D)

        total = flow_loss + (self.lambda_proj * proj_loss)
        return total, flow_loss, proj_loss
import soundfile as sf
import numpy as np

# Quick sanity script: compare a ground-truth waveform against a model
# reconstruction sample-by-sample and report mean / max absolute error.
wav_gt, _ = sf.read('test_train_gt.wav')
wav_pred, _ = sf.read('test_overfit_pe.wav')

# Truncate to the shorter file so the element-wise diff is well-defined.
min_len = min(len(wav_gt), len(wav_pred))
abs_err = np.abs(wav_gt[:min_len] - wav_pred[:min_len])

print(f"Mean Abs Diff: {abs_err.mean():.6f}")
# BUG FIX: the original second print had no braces in the f-string, so it
# printed the literal expression text instead of the computed maximum.
print(f"Max Abs Error: {abs_err.max():.6f}")
class DiTBlock(nn.Module):
    """Pre-norm transformer block: AdaLN-conditioned self-attention followed
    by an AdaLN-conditioned MLP, each with a residual connection."""

    def __init__(self, hidden_dim, num_heads, cond_dim):
        super().__init__()
        self.ada_ln_1 = AdaLN(cond_dim, hidden_dim)
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)

        self.ada_ln_2 = AdaLN(cond_dim, hidden_dim)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 4),
            nn.GELU(),
            nn.Linear(hidden_dim * 4, hidden_dim),
        )

    def forward(self, x, c, key_padding_mask=None):
        """x: (B, T, D), c: (B, T, cond_dim) conditioning for AdaLN."""
        # Self-attention sub-layer (pre-norm, residual).
        normed = self.ada_ln_1(x, c)
        attended, _ = self.attn(
            normed, normed, normed,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )
        x = x + attended

        # Feed-forward sub-layer (pre-norm, residual).
        x = x + self.mlp(self.ada_ln_2(x, c))
        return x

import math

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional table added to the input sequence
    (sin on even feature indices, cos on odd ones)."""

    def __init__(self, d_model, max_len=16000):
        super().__init__()
        positions = torch.arange(max_len).unsqueeze(1)
        inv_freq = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        table = torch.zeros(1, max_len, d_model)
        table[0, :, 0::2] = torch.sin(positions * inv_freq)
        table[0, :, 1::2] = torch.cos(positions * inv_freq)
        # Buffer (not a parameter): moves with the module, excluded from grads.
        self.register_buffer('pe', table)

    def forward(self, x):
        """x: (B, T, D) — add the first T rows of the table."""
        return x + self.pe[:, :x.size(1), :]
class SinusoidalTimeEmbedding(nn.Module):
    """
    Map a scalar timestep t in [0, 1] to a cond_dim vector: Fourier features
    at geometrically spaced frequencies, then a small SiLU MLP. Standard in
    DDPM/DiT/Matcha-TTS — a bare Linear(1, D) gives the network only linear
    rescalings of t and too little information to separate timesteps.
    """
    def __init__(self, hidden_dim, cond_dim, max_period=10000):
        super().__init__()
        assert hidden_dim % 2 == 0
        self.hidden_dim = hidden_dim
        self.max_period = max_period
        # Learned projection from sinusoidal features to cond_dim
        self.proj = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 4),
            nn.SiLU(),
            nn.Linear(hidden_dim * 4, cond_dim),
        )

    def forward(self, t):
        """t: (B, 1) in [0, 1] → (B, cond_dim)."""
        half = self.hidden_dim // 2
        # Geometric frequency ladder from 1 down to 1/max_period.
        exponent = torch.arange(half, device=t.device) / half
        freqs = torch.exp(-math.log(self.max_period) * exponent)
        phases = t * freqs.unsqueeze(0)  # (B, half)
        features = torch.cat([torch.cos(phases), torch.sin(phases)], dim=-1)
        return self.proj(features)
class DiT(nn.Module):
    """
    Diffusion Transformer predicting the velocity field v_theta(z_t, t, c) -> dz/dt
    """
    def __init__(self, in_channels=128, cond_dim=128, hidden_dim=512, depth=6, num_heads=8, use_checkpointing=False):
        super().__init__()
        self.in_proj = nn.Linear(in_channels, hidden_dim)
        self.use_checkpointing = use_checkpointing
        self.pos_emb = PositionalEncoding(hidden_dim, max_len=16000)

        # Sinusoidal time embedding — richer time conditioning than Linear(1, D)
        self.t_proj = SinusoidalTimeEmbedding(hidden_dim, cond_dim)

        self.blocks = nn.ModuleList([
            DiTBlock(hidden_dim, num_heads, cond_dim) for _ in range(depth)
        ])

        self.out_proj = nn.Linear(hidden_dim, in_channels)
        # Zero-init the output head so the predicted velocity starts at 0,
        # keeping early training stable.
        nn.init.constant_(self.out_proj.weight, 0)
        nn.init.constant_(self.out_proj.bias, 0)

    def forward(self, z_t, t, c, mask=None):
        """
        z_t: (B, T, C) - noisy latents
        t: (B, 1) - timesteps
        c: (B, T, cond_dim) - conditioning features from CondEncoder
        mask: (B, T) - 1 for valid, 0 for padded
        """
        x = self.in_proj(z_t)
        x = self.pos_emb(x)  # Add Positional Encoding!

        t_emb = self.t_proj(t)  # (B, cond_dim)

        # Combine conditioning and time
        # c is (B, T, cond_dim) -> add t_emb to it
        c = c + t_emb.unsqueeze(1)

        # PyTorch attention mask expects True for padded elements to ignore them
        key_padding_mask = (mask == 0) if mask is not None else None

        for block in self.blocks:
            if self.use_checkpointing and self.training:
                # Custom checkpoint wrapper to handle multiple arguments;
                # recomputes activations in backward to trade compute for memory.
                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(inputs[0], inputs[1], key_padding_mask=inputs[2])
                    return custom_forward
                x = checkpoint(create_custom_forward(block), x, c, key_padding_mask, use_reentrant=False)
            else:
                x = block(x, c, key_padding_mask=key_padding_mask)

        out = self.out_proj(x)
        # Zero the velocity on padded frames so they contribute nothing downstream.
        if mask is not None:
            out = out * mask.unsqueeze(-1)
        return out
class CodecWrapper(nn.Module):
    """Frozen DAC codec plus a trainable projector head P(u) -> z_hat that
    maps continuous flow predictions back onto the codec latent manifold.

    The codec itself is loaded pretrained, set to eval, and fully frozen;
    only `self.projector` holds trainable parameters.
    """

    def __init__(self, backend="dac", latent_dim=1024):
        """
        backend: codec family; only "dac" is supported.
        latent_dim: codec latent channel count (also projector width).

        Raises ValueError for an unsupported backend — previously an unknown
        backend silently left `self.codec` unset and crashed later with an
        AttributeError.
        """
        super().__init__()
        self.latent_dim = latent_dim

        print(f"Loading actual {backend.upper()} codec model...")
        if backend == "dac":
            # Prefer local weights for reproducibility and portability
            local_path = os.path.join(_PROJECT_ROOT, "codec_pretrain", "dac_44khz.pth")
            if os.path.isfile(local_path):
                print(f" Using local DAC weights: {local_path}")
                self.codec = dac.utils.load_model(load_path=local_path)
            else:
                print(f" Local weights not found at {local_path}, downloading via dac library...")
                self.codec = dac.utils.load_model(tag="latest", model_type="44khz")
        else:
            # Fail fast instead of leaving self.codec undefined.
            raise ValueError(f"Unsupported codec backend: {backend!r} (only 'dac' is supported)")

        # Freeze all codec parameters
        if self.codec is not None:
            self.codec.eval()
            for p in self.codec.parameters():
                p.requires_grad = False

        # The Projector Network P(u) -> z_hat
        self.projector = nn.Sequential(
            nn.Conv1d(latent_dim, latent_dim * 2, kernel_size=3, padding=1),
            nn.GELU(),
            nn.Conv1d(latent_dim * 2, latent_dim, kernel_size=3, padding=1)
        )

    def forward_project(self, u_hat):
        """
        Maps continuous flow prediction back to codebook manifold.
        u_hat: (B, D, T)
        """
        return self.projector(u_hat)

    @torch.no_grad()
    def decode(self, z_hat):
        """
        Decode projected latents into waveform using the frozen codec.
        z_hat: (B, D, T) -> dac takes (B, D, T) discrete mapping.
        """
        import warnings
        # Suppress codec warnings only for this call. The previous
        # warnings.filterwarnings("ignore") mutated the GLOBAL filter on
        # every decode, silencing all warnings process-wide.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return self.codec.decode(z_hat)

    @torch.no_grad()
    def encode(self, wav, sample_rate):
        """
        Encode waveform to codec latent space.
        wav: (B, 1, T) or (B, T)
        Returns z with shape (B, D, T_latent)
        """
        if wav.ndim == 2:
            wav = wav.unsqueeze(1)
        if wav.ndim != 3:
            raise ValueError(f"Expected wav to be 2D or 3D tensor, got shape {tuple(wav.shape)}")

        # DAC 44k expects 44.1k input
        # NOTE(review): linear interpolation is a crude resampler (no
        # anti-aliasing filter); consider torchaudio.functional.resample
        # if encode quality matters for non-44.1k inputs.
        if sample_rate != 44100:
            target_len = int(round(wav.shape[-1] * 44100 / sample_rate))
            wav = F.interpolate(wav, size=target_len, mode="linear", align_corners=False)
        wav = self.codec.preprocess(wav, 44100)
        z, _, _, _, _ = self.codec.encode(wav)
        return z
class CondEncoder(nn.Module):
    """Fuses PPG, HuBERT, F0 and a global speaker embedding into a single
    per-frame conditioning sequence via learned gated fusion."""

    def __init__(self, ppg_dim=1280, hubert_dim=256, f0_dim=1, spk_dim=256, cond_out_dim=1024):
        super().__init__()
        # Projections for each feature
        self.ppg_proj = nn.Linear(ppg_dim, cond_out_dim)
        self.hubert_proj = nn.Linear(hubert_dim, cond_out_dim)
        self.spk_proj = nn.Linear(spk_dim, cond_out_dim)

        # Simple f0 embedding (or continuous mapping)
        self.f0_proj = nn.Sequential(
            nn.Linear(f0_dim, 64),
            nn.GELU(),
            nn.Linear(64, cond_out_dim)
        )

        # Gated fusion
        self.gate = nn.Linear(cond_out_dim * 4, cond_out_dim * 4)
        self.combine = nn.Linear(cond_out_dim * 4, cond_out_dim)

        self.cond_out_dim = cond_out_dim

    def forward(self, ppg, hubert, f0, spk, target_seq_len):
        """
        ppg: (B, T_ppg, ppg_dim) - e.g. from Whisper ~50Hz
        hubert: (B, T_hubert, hubert_dim) - e.g. from Hubert ~50Hz
        f0: (B, T_f0, 1) - e.g. from Crepe ~100Hz
        spk: (B, spk_dim) - 1D Global embedding
        target_seq_len: int - e.g. from Codec ~86Hz

        Returns:
            c: (B, target_seq_len, cond_out_dim)
        """
        def _to_target_len(feat):
            # (B, T_in, D) -> (B, target_seq_len, D) via linear interpolation
            # along time; F.interpolate wants channels-first, so transpose.
            feat = feat.transpose(1, 2)  # (B, D, T_in)
            if feat.shape[2] != target_seq_len:
                feat = F.interpolate(feat, size=target_seq_len,
                                     mode='linear', align_corners=False)
            return feat.transpose(1, 2)  # (B, target_seq_len, D)

        # Project each stream, then align all of them to the target length.
        ppg_r = _to_target_len(self.ppg_proj(ppg))
        hubert_r = _to_target_len(self.hubert_proj(hubert))
        f0_r = _to_target_len(self.f0_proj(f0))

        # Broadcast the global speaker embedding across all frames.
        spk_r = self.spk_proj(spk).unsqueeze(1).expand(-1, target_seq_len, -1)

        # Learned gated fusion over the concatenated streams.
        stacked = torch.cat([ppg_r, hubert_r, f0_r, spk_r], dim=-1)  # (B, T, 4D)
        gated = stacked * torch.sigmoid(self.gate(stacked))
        return self.combine(gated)
Learned Gated Fusion + stacked = torch.cat([ppg_r, hubert_r, f0_r, spk_r], dim=-1) # (B, T, 4D) + + gate_weights = torch.sigmoid(self.gate(stacked)) + gated = stacked * gate_weights + + c = self.combine(gated) + return c diff --git a/models/f5_svc.py b/models/f5_svc.py new file mode 100644 index 0000000000000000000000000000000000000000..080244d86fef92e81b5d0d87bac967736b2160fa --- /dev/null +++ b/models/f5_svc.py @@ -0,0 +1,230 @@ +""" +F5SVCModel: F5-TTS with SVC conditioning replacing the text encoder. + +Architecture changes vs vanilla F5-TTS +--------------------------------------- +1. TextEmbedding (char tokens → ConvNeXt) is replaced by a passthrough shim. + The SVCCondAdapter is called externally and the result is passed as `text`. +2. LoRA adapters (rank 16 by default) are injected into every DiT attention + projection (to_q, to_k, to_v, to_out). +3. Two parameter groups for the two-stage training protocol: + Stage 1 → SVCCondAdapter + LoRA (singing adaptation) + Stage 2 → svc_adapter.spk_proj + stacked LoRA A₂B₂ (rank-4, per-speaker) + +Requires: pip install f5-tts +""" + +from __future__ import annotations + +import torch +import torch.nn as nn + +from .svc_cond_adapter import SVCCondAdapter +from .lora_utils import inject_lora, count_trainable + + +# --------------------------------------------------------------------------- +# Passthrough shim — replaces F5-TTS's internal TextEmbedding +# --------------------------------------------------------------------------- + +class _SVCTextEmbedShim(nn.Module): + """ + F5-TTS calls: self.text_embed(text_tokens, seq_len, drop_text=…) + We pre-compute the SVC conditioning externally and pass it as `text_tokens`. + This shim just returns it (or zeros on CFG drop). 
class F5SVCModel(nn.Module):
    """F5-TTS DiT wrapped for SVC: the text encoder is replaced by an SVC
    conditioning adapter and LoRA adapters are injected into the attention
    projections. See the module docstring for the two-stage training plan."""

    def __init__(
        self,
        f5tts_transformer: nn.Module,  # DiT extracted from the F5-TTS CFM wrapper
        ppg_dim: int = 1280,
        hubert_dim: int = 256,
        f0_dim: int = 1,
        spk_dim: int = 256,
        text_dim: int = 512,    # F5-TTS Base=512, Small=256
        lora_rank: int = 16,
        lora_alpha: float = 16.0,
        feat_sr: float = 50.0,
        mel_sr: float = 93.75,  # 24000 / 256
    ):
        super().__init__()
        self.transformer = f5tts_transformer

        # 1. SVC conditioning adapter (replaces text encoder)
        self.svc_adapter = SVCCondAdapter(
            ppg_dim=ppg_dim,
            hubert_dim=hubert_dim,
            f0_dim=f0_dim,
            spk_dim=spk_dim,
            out_dim=text_dim,
            feat_sr=feat_sr,
            mel_sr=mel_sr,
        )

        # 2. Replace text_embed with passthrough shim
        if not hasattr(self.transformer, "text_embed"):
            raise RuntimeError(
                "Could not find 'text_embed' in the F5-TTS transformer. "
                "Check that you are passing the DiT, not the CFM wrapper."
            )
        self.transformer.text_embed = _SVCTextEmbedShim()

        # 3. Inject LoRA into DiT attention projections
        n_lora = inject_lora(self.transformer, rank=lora_rank, alpha=lora_alpha)
        print(f"F5SVCModel: injected LoRA into {n_lora} attention layers (rank={lora_rank})")

    # ------------------------------------------------------------------
    # Parameter-group helpers
    # ------------------------------------------------------------------

    def set_stage1_trainable(self) -> None:
        """Stage 1: SVCCondAdapter + LoRA adapters only."""
        # Freeze everything, then selectively re-enable.
        for p in self.parameters():
            p.requires_grad_(False)
        for p in self.svc_adapter.parameters():
            p.requires_grad_(True)
        for name, p in self.transformer.named_parameters():
            if "lora_A" in name or "lora_B" in name:
                p.requires_grad_(True)
        print(f"Stage 1 trainable: {count_trainable(self):,} params")

    def set_stage2_trainable(self, stage2_rank: int = 4) -> None:
        """Stage 2: spk_proj + new LoRA pair (A₂B₂) for speaker adaptation.

        Freezes everything first, then:
          - Injects a second LoRA pair (A₂B₂) into every LoRALinear layer
          - Makes spk_proj + A₂B₂ trainable
        Stage 1 LoRA (A₁B₁), content_proj, and base weights remain frozen.
        """
        from models.lora_utils import inject_lora_stage2

        # Freeze everything
        for p in self.parameters():
            p.requires_grad_(False)

        # Add Stage 2 LoRA on top of existing Stage 1 LoRA layers
        n_s2 = inject_lora_stage2(self.transformer, rank=stage2_rank)
        print(f"Stage 2: injected {n_s2} second LoRA pairs (rank={stage2_rank})")

        # Make spk_proj + A₂B₂ trainable
        for p in self.svc_adapter.spk_proj.parameters():
            p.requires_grad_(True)
        for name, p in self.transformer.named_parameters():
            if "lora_A2" in name or "lora_B2" in name:
                p.requires_grad_(True)
        print(f"Stage 2 trainable: {count_trainable(self):,} params")

    # ------------------------------------------------------------------
    # Forward: compute SVC conditioning, then call DiT
    # ------------------------------------------------------------------

    def forward(
        self,
        x: torch.Tensor,       # (B, T, mel_dim) noised mel
        cond: torch.Tensor,    # (B, T, mel_dim) reference mel (zero-padded target)
        ppg: torch.Tensor,     # (B, T_feat, 1280)
        hubert: torch.Tensor,  # (B, T_feat, 256)
        f0: torch.Tensor,      # (B, T_feat, 1)
        spk: torch.Tensor,     # (B, 256)
        time: torch.Tensor,    # (B,)
        drop_audio_cond: bool = False,
        drop_text: bool = False,
        mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Compute the SVC conditioning at the mel frame rate, then run the
        DiT with it passed through the text path (via the shim)."""
        target_len = x.shape[1]
        svc_cond = self.svc_adapter(ppg, hubert, f0, spk, target_len=target_len)
        return self.transformer(
            x=x,
            cond=cond,
            text=svc_cond,
            time=time,
            drop_audio_cond=drop_audio_cond,
            drop_text=drop_text,
            mask=mask,
            cache=False,  # CRITICAL: disable text caching so gradients flow to adapter
        )
Instantiate F5-TTS DiT with Base config. + 2. Optionally load pretrained weights (text_embed keys are dropped since + we replace that module). + 3. Wrap with SVCCondAdapter + LoRA. + + Requires: pip install f5-tts + Download checkpoint: + from huggingface_hub import hf_hub_download + path = hf_hub_download("SWivid/F5-TTS", "F5TTS_Base/model_1200000.safetensors") + """ + try: + from f5_tts.model import DiT + except ImportError as e: + raise ImportError("Install F5-TTS first: pip install f5-tts") from e + + # F5-TTS Base config (matches SWivid/F5-TTS public checkpoint) + dit = DiT( + dim=1024, + depth=22, + heads=16, + ff_mult=2, + text_dim=text_dim, + conv_layers=4, + ) + + if f5tts_ckpt_path is not None: + print(f"Loading F5-TTS weights from {f5tts_ckpt_path} ...") + if f5tts_ckpt_path.endswith(".safetensors"): + from safetensors.torch import load_file + sd = load_file(f5tts_ckpt_path) + else: + raw = torch.load(f5tts_ckpt_path, map_location="cpu", weights_only=True) + sd = raw.get("model_state_dict") or raw.get("ema_model_state_dict") or raw + + # Drop text_embed — we replace that module entirely. + # Also drop mel_spec (not part of DiT) and strip common prefixes. + prefix = "ema_model.transformer." + cleaned = {} + for k, v in sd.items(): + # Strip known prefixes + if k.startswith(prefix): + k = k[len(prefix):] + elif k.startswith("ema_model."): + continue # mel_spec buffers etc. 
+ # Drop text_embed keys + if k.startswith("text_embed"): + continue + cleaned[k] = v + + missing, unexpected = dit.load_state_dict(cleaned, strict=False) + n_missing = len([k for k in missing if not k.startswith("text_embed")]) + print(f" Loaded: {len(cleaned)} keys, missing={n_missing} unexpected={len(unexpected)}") + + return F5SVCModel( + f5tts_transformer=dit, + text_dim=text_dim, + lora_rank=lora_rank, + **adapter_kwargs, + ) diff --git a/models/lora_utils.py b/models/lora_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5436e033b188c7e27f61c3b4a75e9ebae9b87a9e --- /dev/null +++ b/models/lora_utils.py @@ -0,0 +1,152 @@ +""" +LoRA utilities for injecting low-rank adapters into F5-TTS's DiT. + +Supports stacked LoRA for two-stage training: + Stage 1: W + A₁B₁ (singing adaptation) + Stage 2: W + A₁B₁ (frozen) + A₂B₂ (speaker adaptation) + +Usage: + inject_lora(dit, rank=16) # Stage 1 LoRA (A₁B₁) + inject_lora_stage2(dit, rank=4) # Stage 2 LoRA (A₂B₂) on top + get_trainable_state_dict(model) # checkpoint only trainable weights +""" + +from __future__ import annotations + +from typing import Iterable + +import torch +import torch.nn as nn + + +class LoRALinear(nn.Module): + """ + Drop-in replacement for nn.Linear with stacked low-rank bypasses: + output = W(x) + (x @ A₁ @ B₁) * s₁ [+ (x @ A₂ @ B₂) * s₂] + + Stage 1: A₁ initialized normal, B₁ zeros → bypass starts at 0. + Stage 2 (optional): A₂ initialized normal, B₂ zeros → new bypass on top. + The original weight matrix W is always frozen. 
+ """ + + def __init__(self, linear: nn.Linear, rank: int = 16, alpha: float = 16.0): + super().__init__() + self.linear = linear + self.scale = alpha / rank + + in_f, out_f = linear.in_features, linear.out_features + self.lora_A = nn.Parameter(torch.randn(in_f, rank) * (rank ** -0.5)) + self.lora_B = nn.Parameter(torch.zeros(rank, out_f)) + + linear.weight.requires_grad_(False) + if linear.bias is not None: + linear.bias.requires_grad_(False) + + # Stage 2 LoRA (added later by inject_lora_stage2) + self.lora_A2: nn.Parameter | None = None + self.lora_B2: nn.Parameter | None = None + self.scale2: float = 0.0 + + def add_stage2_lora(self, rank: int = 4, alpha: float | None = None): + """Add a second LoRA pair for Stage 2 speaker adaptation.""" + if alpha is None: + alpha = float(rank) + in_f = self.linear.in_features + out_f = self.linear.out_features + self.lora_A2 = nn.Parameter(torch.randn(in_f, rank) * (rank ** -0.5)) + self.lora_B2 = nn.Parameter(torch.zeros(rank, out_f)) + self.scale2 = alpha / rank + + def forward(self, x: torch.Tensor) -> torch.Tensor: + y = self.linear(x) + (x @ self.lora_A @ self.lora_B) * self.scale + if self.lora_A2 is not None: + y = y + (x @ self.lora_A2 @ self.lora_B2) * self.scale2 + return y + + def merge(self) -> nn.Linear: + """Return a plain Linear with all LoRA pairs baked in (for inference).""" + m = nn.Linear(self.linear.in_features, self.linear.out_features, + bias=self.linear.bias is not None, + device=self.linear.weight.device, + dtype=self.linear.weight.dtype) + w = self.linear.weight.data + (self.lora_A @ self.lora_B).T * self.scale + if self.lora_A2 is not None: + w = w + (self.lora_A2 @ self.lora_B2).T * self.scale2 + m.weight.data = w + if self.linear.bias is not None: + m.bias.data = self.linear.bias.data.clone() + return m + + +# Attention projection names used in F5-TTS's DiT blocks +_F5TTS_ATTN_TARGETS: tuple[str, ...] 
= ("to_q", "to_k", "to_v", "to_out.0") + + +def inject_lora( + model: nn.Module, + rank: int = 16, + alpha: float = 16.0, + target_suffixes: Iterable[str] = _F5TTS_ATTN_TARGETS, +) -> int: + """ + Walk all named children and replace matching nn.Linear layers with + LoRALinear wrappers in-place. Returns the count of replaced layers. + """ + replaced = 0 + suffixes = tuple(target_suffixes) + for parent_name, parent in model.named_modules(): + for child_name, child in list(parent.named_children()): + if not isinstance(child, nn.Linear): + continue + full = f"{parent_name}.{child_name}" if parent_name else child_name + if any(full.endswith(s) for s in suffixes): + setattr(parent, child_name, LoRALinear(child, rank=rank, alpha=alpha)) + replaced += 1 + return replaced + + +def inject_lora_stage2( + model: nn.Module, + rank: int = 4, + alpha: float | None = None, +) -> int: + """ + Add a second LoRA pair (A₂, B₂) to every existing LoRALinear layer. + Stage 1 LoRA (A₁, B₁) remains frozen; only A₂, B₂ are trainable. + + Returns the count of layers augmented. 
+ """ + augmented = 0 + for module in model.modules(): + if isinstance(module, LoRALinear) and module.lora_A2 is None: + module.add_stage2_lora(rank=rank, alpha=alpha) + augmented += 1 + return augmented + + +def freeze_non_lora(model: nn.Module) -> None: + """Freeze all parameters except LoRA A/B matrices.""" + for name, p in model.named_parameters(): + p.requires_grad_("lora_A" in name or "lora_B" in name) + + +def get_lora_state_dict(model: nn.Module) -> dict: + """Return only LoRA parameters — small file for sharing/resuming.""" + return {k: v for k, v in model.state_dict().items() + if "lora_A" in k or "lora_B" in k} + + +def get_trainable_state_dict(model: nn.Module) -> dict: + """Return all trainable parameters (adapter + LoRA stage 1 + LoRA stage 2).""" + return {k: v for k, v in model.state_dict().items() + if "svc_adapter" in k or "lora_A" in k or "lora_B" in k} + + +def get_stage2_state_dict(model: nn.Module) -> dict: + """Return only Stage 2 trainable weights: spk_proj + A₂B₂.""" + return {k: v for k, v in model.state_dict().items() + if "spk_proj" in k or "lora_A2" in k or "lora_B2" in k} + + +def count_trainable(model: nn.Module) -> int: + return sum(p.numel() for p in model.parameters() if p.requires_grad) diff --git a/models/svc_cond_adapter.py b/models/svc_cond_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..0fd86edf1b0b8ee3dee7fa3b814fee93dd4092dc --- /dev/null +++ b/models/svc_cond_adapter.py @@ -0,0 +1,66 @@ +""" +SVCCondAdapter: replaces F5-TTS's text conditioning pathway with SVC features. + +F5-TTS text path: char_tokens (B, T) → embed + ConvNeXt → (B, T_mel, text_dim) +SVC replacement: PPG/HuBERT/F0 (B, T_feat, D) → project → (B, T_mel, text_dim) + +The output shape matches F5-TTS's text_dim so the DiT sees no change. +Default text_dim=512 for F5-TTS Base (model_dim=1024). 
+""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SVCCondAdapter(nn.Module): + def __init__( + self, + ppg_dim: int = 1280, + hubert_dim: int = 256, + f0_dim: int = 1, + spk_dim: int = 256, + out_dim: int = 512, # must match F5-TTS text_dim + feat_sr: float = 50.0, # Hz — PPG/HuBERT input frame rate + mel_sr: float = 93.75, # Hz — F5-TTS mel frame rate (24000/256) + ): + super().__init__() + self.feat_sr = feat_sr + self.mel_sr = mel_sr + + feat_in = ppg_dim + hubert_dim + f0_dim + self.content_proj = nn.Sequential( + nn.Linear(feat_in, out_dim * 2), + nn.SiLU(), + nn.Linear(out_dim * 2, out_dim), + ) + # Speaker embedding broadcast-added to every frame + self.spk_proj = nn.Linear(spk_dim, out_dim) + + # Small-scale init on output layers so the adapter starts near-zero + # (DiT initially sees near-zero conditioning, preserving pretrained state) + # but gradients can still flow. Pure zero-init kills gradients entirely + # because ∂output/∂W = 0 when W = 0. 
+ nn.init.normal_(self.content_proj[-1].weight, std=0.01) + nn.init.zeros_(self.content_proj[-1].bias) + nn.init.normal_(self.spk_proj.weight, std=0.01) + nn.init.zeros_(self.spk_proj.bias) + + def forward( + self, + ppg: torch.Tensor, # (B, T_feat, ppg_dim) + hubert: torch.Tensor, # (B, T_feat, hubert_dim) + f0: torch.Tensor, # (B, T_feat, 1) + spk: torch.Tensor, # (B, spk_dim) + target_len: int, # number of mel frames to produce + ) -> torch.Tensor: # (B, target_len, out_dim) + feat = torch.cat([ppg, hubert, f0], dim=-1) # (B, T_feat, feat_in) + + # Resample from feature frame rate to mel frame rate + feat = feat.transpose(1, 2) # (B, feat_in, T_feat) + feat = F.interpolate(feat, size=target_len, mode="linear", align_corners=False) + feat = feat.transpose(1, 2) # (B, target_len, feat_in) + + out = self.content_proj(feat) # (B, target_len, out_dim) + out = out + self.spk_proj(spk).unsqueeze(1) # add speaker (broadcast) + return out diff --git a/packages.txt b/packages.txt new file mode 100644 index 0000000000000000000000000000000000000000..594c7ceddf80d2c8e9758e4bf5b60df6b22195aa --- /dev/null +++ b/packages.txt @@ -0,0 +1,2 @@ +ffmpeg +libsndfile1 diff --git a/parse_log.py b/parse_log.py new file mode 100644 index 0000000000000000000000000000000000000000..80a87d400ad5817aecc3e6b917562bdd5a7f30c0 --- /dev/null +++ b/parse_log.py @@ -0,0 +1,65 @@ +import re +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +import argparse + +def main(): + parser = argparse.ArgumentParser(description='Parse training log and plot losses.') + parser.add_argument('--log_file', type=str, default='logs/train_222379.out', help='Path to the log file') + args = parser.parse_args() + + log_path = args.log_file + data = [] + + # Pattern to match lines like: + # Epoch 70, Step 0, Total: 1.2872, Flow: 1.0775, Proj: 0.2097, Teacher: 0.0000 + pattern = re.compile(r'Epoch (\d+), Step (\d+), Total: ([\d\.]+), Flow: ([\d\.]+), Proj: ([\d\.]+)') + + with open(log_path, 'r') 
as f: + for line in f: + match = pattern.search(line) + if match: + epoch = int(match.group(1)) + step = int(match.group(2)) + total = float(match.group(3)) + flow = float(match.group(4)) + proj = float(match.group(5)) + data.append({ + 'epoch': epoch, + 'step': step, + 'total': total, + 'flow': flow, + 'proj': proj + }) + + if not data: + print("No valid log lines found.") + return + + df = pd.DataFrame(data) + + # Compute mean, std + stats = df[['total', 'flow', 'proj']].agg(['mean', 'std', 'min', 'max']) + print("--- Loss Statistics ---") + print(stats) + + # Plotting + plt.figure(figsize=(12, 6)) + plt.plot(df.index, df['total'], label='Total Loss', alpha=0.8, linewidth=1) + plt.plot(df.index, df['flow'], label='Flow Loss', alpha=0.8, linewidth=1) + plt.plot(df.index, df['proj'], label='Proj Loss', alpha=0.8, linewidth=1) + + plt.xlabel('Logging Steps') + plt.ylabel('Loss Value') + plt.title('Training Losses over Time') + plt.legend() + plt.grid(True, linestyle='--', alpha=0.7) + plt.tight_layout() + + plot_path = 'loss_plot.png' + plt.savefig(plot_path, dpi=300) + print(f"\nPlot successfully generated and saved to {plot_path}") + +if __name__ == '__main__': + main() diff --git a/pitch/__init__.py b/pitch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bc418142ecbe692527dfe1f1768205a50726dd2f --- /dev/null +++ b/pitch/__init__.py @@ -0,0 +1 @@ +from .inference import load_csv_pitch \ No newline at end of file diff --git a/pitch/debug.py b/pitch/debug.py new file mode 100644 index 0000000000000000000000000000000000000000..c9174d4840bf42e387a889b9af7598530e4a7abf --- /dev/null +++ b/pitch/debug.py @@ -0,0 +1,24 @@ +import argparse +import numpy as np + + +def save_csv_pitch(pitch, path): + with open(path, "w", encoding='utf-8') as pitch_file: + for i in range(len(pitch)): + t = i * 10 + minute = t // 60000 + seconds = (t - minute * 60000) // 1000 + millisecond = t % 1000 + print( + f"{minute}m {seconds}s 
def compute_f0_voice(filename, device):
    """Extract F0 with CREPE 'full' at a 10 ms hop (160 samples @ 16 kHz)."""
    audio, sr = librosa.load(filename, sr=16000)
    assert sr == 16000
    wav = torch.tensor(np.copy(audio))[None]
    # Reasonable F0 range for speech (CREPE's upper limit is 2006 Hz).
    f0 = crepe.predict(
        wav,
        sr,
        160,        # hop_length → 10 ms frames
        50,         # fmin
        1000,       # fmax
        "full",     # model capacity
        batch_size=512,
        device=device,
        return_periodicity=False,
    )
    f0 = crepe.filter.mean(f0, 5)
    return f0.squeeze(0)


def compute_f0_sing(filename, device):
    """Extract F0 with CREPE at a 20 ms hop, then upsample to 10 ms frames."""
    audio, sr = librosa.load(filename, sr=16000)
    assert sr == 16000
    wav = torch.tensor(np.copy(audio))[None]
    f0 = crepe.predict(
        wav,
        sr,
        320,        # hop_length → 20 ms frames
        50,         # fmin
        1000,       # fmax
        "full",     # model capacity
        batch_size=512,
        device=device,
        return_periodicity=False,
    )
    f0 = np.repeat(f0, 2, -1)   # 320 -> 160 * 2
    f0 = crepe.filter.mean(f0, 5)
    return f0.squeeze(0)
def load_csv_pitch(path):
    """Parse a pitch CSV written by save_csv_pitch.

    Each line ends with ",<pitch>"; the timestamp prefix is ignored.
    Returns the pitch values as a list of ints.
    """
    with open(path, "r", encoding='utf-8') as fp:
        return [int(row.strip().split(",")[-1]) for row in fp]
+""" +import os +import argparse +import numpy as np +import soundfile as sf +import torch +import torchaudio.functional as TAF # noqa: F401 used in convert_and_save +from huggingface_hub import snapshot_download + +TARGET_SR = 24000 +AUDIO_EXTS = (".wav", ".mp3", ".flac", ".ogg", ".m4a", ".opus") + + +def convert_and_save(src_path, dst_path): + arr, sr = sf.read(src_path, always_2d=True) # (T, C), float64 + arr = arr.mean(axis=1).astype(np.float32) # mono float32 + if sr != TARGET_SR: + wav = torch.tensor(arr).unsqueeze(0) # (1, T) + wav = TAF.resample(wav, sr, TARGET_SR) + arr = wav.squeeze(0).numpy() + sf.write(dst_path, arr, TARGET_SR, subtype="PCM_16") + + +def main(out_dir): + os.makedirs(out_dir, exist_ok=True) + + print("Downloading raw files from HuggingFace...") + repo_dir = snapshot_download( + repo_id="ModelsLab/Obama-Sample-Dataset", + repo_type="dataset", + ignore_patterns=["*.json", "*.md", "*.gitattributes"], + ) + print(f" Downloaded to cache: {repo_dir}") + + audio_files = sorted([ + os.path.join(root, f) + for root, _, files in os.walk(repo_dir) + for f in files + if f.lower().endswith(AUDIO_EXTS) + ]) + print(f" Found {len(audio_files)} audio files") + + saved = 0 + for i, src in enumerate(audio_files): + dst = os.path.join(out_dir, f"{i:05d}.wav") + if os.path.exists(dst): + saved += 1 + continue + try: + convert_and_save(src, dst) + saved += 1 + except Exception as e: + print(f" Warning: skipping {src}: {e}") + if (i + 1) % 20 == 0: + print(f" {i+1}/{len(audio_files)} processed") + + print(f"Done. 
def resample_wave(wav_in, wav_out, sample_rate):
    """Load `wav_in` at `sample_rate`, peak-normalize to 60% full scale,
    and write it as 16-bit PCM to `wav_out`.

    Fix: the old code first divided by the raw peak with NO guard — an
    all-zero file produced NaNs — and that first pass also made the
    `max(0.01, …)` floor of the second pass dead code.  A single guarded
    pass keeps the identical output for any input with peak >= 0.01 and
    stops near-silent files from being blown up to full scale.
    """
    wav, _ = librosa.load(wav_in, sr=sample_rate)
    peak = float(np.max(np.abs(wav))) if wav.size else 0.0
    # 32767 * 0.6 → peak lands at 60% of int16 full scale.
    wav = wav / max(0.01, peak) * 32767 * 0.6
    wavfile.write(wav_out, sample_rate, wav.astype(np.int16))
def cut_direct_content(iWave, oWave):
    """Zero the two lowest STFT bins (DC / near-DC energy) of a wav file
    and write the re-synthesized audio as 16-bit PCM."""
    n_fft, hop = 1024, 256
    window = torch.hann_window(n_fft)
    wav, sr = torchaudio.load(iWave)
    spec = torch.stft(wav, n_fft, hop, n_fft, window, return_complex=True)
    spec[:, 0, :] = 0
    spec[:, 1, :] = 0
    rebuilt = torch.istft(spec, n_fft, hop, n_fft, window).squeeze()
    # Scale to int16 range, clamping to avoid wrap-around on overflow.
    samples = (MAX_WAV_VALUE * rebuilt).clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
    write(oWave, sr, samples.short().data.cpu().detach().numpy())
"__main__": + parser = argparse.ArgumentParser() + parser.description = 'please enter parameter ...' + parser.add_argument("-i", help="input path", dest="inPath") + parser.add_argument("-o", help="output path", dest="outPath") + args = parser.parse_args() + print(args.inPath) + print(args.outPath) + os.makedirs(args.outPath, exist_ok=True) + rootPath = args.inPath + outPath = args.outPath + for spks in os.listdir(rootPath): + if (os.path.isdir(f"./{rootPath}/{spks}")): + print(f"-----------{spks}-----------") + os.makedirs(f"./{outPath}/{spks}", exist_ok=True) + for file in os.listdir(f"./{rootPath}/{spks}"): + if (file.endswith(".wav")): + iWave = f"./{rootPath}/{spks}/{file}" + oWave = f"./{outPath}/{spks}/{file}" + cut_direct_content(iWave, oWave) diff --git a/prepare/preprocess_crepe.py b/prepare/preprocess_crepe.py new file mode 100644 index 0000000000000000000000000000000000000000..ce49b497a1b607cda281aa78d55d531c59dd0519 --- /dev/null +++ b/prepare/preprocess_crepe.py @@ -0,0 +1,95 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import numpy as np +import librosa +import torch +import crepe +import argparse +from tqdm import tqdm +from multiprocessing import set_start_method +from concurrent.futures import ProcessPoolExecutor, as_completed + +def compute_f0(filename, save, device): + audio, sr = librosa.load(filename, sr=16000) + assert sr == 16000 + # Load audio + audio = torch.tensor(np.copy(audio))[None] + # Here we'll use a 10 millisecond hop length + hop_length = 160 + # Provide a sensible frequency range for your domain (upper limit is 2006 Hz) + # This would be a reasonable range for speech + fmin = 50 + fmax = 1000 + # Select a model capacity--one of "tiny" or "full" + model = "full" + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 + # Compute pitch using first gpu + pitch, periodicity = crepe.predict( + audio, + sr, + hop_length, + fmin, + fmax, + model, + 
batch_size=batch_size, + device=device, + return_periodicity=True, + ) + # CREPE was not trained on silent audio. some error on silent need filter.pitPath + periodicity = crepe.filter.median(periodicity, 9) + pitch = crepe.filter.median(pitch, 5) + pitch[periodicity < 0.05] = 0 + pitch = pitch.squeeze(0) + np.save(save, pitch, allow_pickle=False) + +def process_file(file, wavPath, spks, pitPath, device): + if file.endswith(".wav"): + file = file[:-4] + compute_f0(f"{wavPath}/{spks}/{file}.wav", f"{pitPath}/{spks}/{file}.pit", device) + +def process_files_with_process_pool(wavPath, spks, pitPath, device, process_num=None): + files = [f for f in os.listdir(f"./{wavPath}/{spks}") if f.endswith(".wav")] + + with ProcessPoolExecutor(max_workers=process_num) as executor: + futures = {executor.submit(process_file, file, wavPath, spks, pitPath, device): file for file in files} + + for future in tqdm(as_completed(futures), total=len(futures), desc='Processing files'): + future.result() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.description = 'please enter embed parameter ...' 
def select_device_and_precision():
    """Return (device, use_fp16): CUDA runs in half precision, MPS/CPU in fp32."""
    cuda_ok = torch.cuda.is_available()
    if cuda_ok:
        backend = "cuda"
    elif torch.backends.mps.is_available():
        backend = "mps"
    else:
        backend = "cpu"
    return torch.device(backend), cuda_ok
def pred_vec(model, wavPath, vecPath, device, use_fp16):
    """Extract HuBERT-soft units for one wav and save them as a .npy file.

    Output shape is (length, 256) at hop 320 — see the shape note in the
    original debug print.
    """
    audio = load_audio(wavPath)
    batch = torch.from_numpy(audio).to(device)[None, None, :]
    batch = batch.half() if use_fp16 else batch.float()
    with torch.no_grad():
        units = model.units(batch).squeeze().data.cpu().float().numpy()
    np.save(vecPath, units, allow_pickle=False)
args.thread_count + with ThreadPoolExecutor(max_workers=process_num) as executor: + futures = [executor.submit(process_file, file) for file in os.listdir(f"./{wavPath}/{spks}")] + for future in tqdm(as_completed(futures), total=len(futures)): + pass + # with Pool(processes=process_num) as pool: + # results = [pool.apply_async(process_file, (file,)) for file in os.listdir(f"./{wavPath}/{spks}")] + # for result in tqdm(results, total=len(results)): + # result.wait() diff --git a/prepare/preprocess_pack.py b/prepare/preprocess_pack.py new file mode 100644 index 0000000000000000000000000000000000000000..3e4d245106ab643d347b4ef0cdae7b7c2929f037 --- /dev/null +++ b/prepare/preprocess_pack.py @@ -0,0 +1,164 @@ +""" +Pack per-sample feature files into single .pt bundles for NFS-efficient training. + +For each clip, combines: + //.wav (audio, resampled + mel computed here) + //.ppg.npy + //.vec.npy + //.pit.npy + //.spk.npy + +Into a single file: + //.pt (1 NFS read per sample at training time) + +Reduces training I/O from 5 NFS reads/sample to 1. +Mel is precomputed here so the DataLoader skips STFT entirely. 
# Mel parameters matching F5-TTS / Vocos
_SAMPLE_RATE = 24_000
_HOP_LENGTH = 256
_N_FFT = 1_024
_WIN_LENGTH = 1_024
_N_MELS = 100
_F_MIN = 0
_F_MAX = 8_000

# NOTE(review): this is a single process-global transform shared by ALL
# worker threads, not per-thread as previously claimed.  MelSpectrogram's
# forward holds no mutable state, so concurrent use should be safe; at
# worst a benign double-init can happen on the first racing calls.
_mel_tf = None

def _get_mel_tf():
    """Lazily build and cache the shared MelSpectrogram transform."""
    global _mel_tf
    if _mel_tf is None:
        _mel_tf = T.MelSpectrogram(
            sample_rate=_SAMPLE_RATE,
            n_fft=_N_FFT,
            hop_length=_HOP_LENGTH,
            n_mels=_N_MELS,
            win_length=_WIN_LENGTH,
            f_min=_F_MIN,
            f_max=_F_MAX,
            power=1.0,
            norm="slaney",
            mel_scale="slaney",
        )
    return _mel_tf


def pack_file(file_id, spk, wav_dir, ppg_dir, hubert_dir, f0_dir, spk_dir, out_dir):
    """Bundle one clip's wav (as precomputed log-mel) + features into a .pt file.

    Returns (True, file_id) on success or skip (output already exists),
    (False, reason) when inputs are missing or packing fails.  Errors are
    reported, never raised, so one bad clip cannot kill the worker pool.
    """
    wav_path = os.path.join(wav_dir, spk, f"{file_id}.wav")
    ppg_path = os.path.join(ppg_dir, spk, f"{file_id}.ppg.npy")
    hubert_path = os.path.join(hubert_dir, spk, f"{file_id}.vec.npy")
    f0_path = os.path.join(f0_dir, spk, f"{file_id}.pit.npy")
    spk_path = os.path.join(spk_dir, spk, f"{file_id}.spk.npy")
    out_path = os.path.join(out_dir, spk, f"{file_id}.pt")

    # Skip if already packed
    if os.path.isfile(out_path):
        return True, file_id

    missing = [p for p in [wav_path, ppg_path, hubert_path, f0_path, spk_path]
               if not os.path.isfile(p)]
    if missing:
        return False, f"{file_id}: missing {missing}"

    try:
        # --- Audio → log-mel at 24 kHz ---
        wav, sr = torchaudio.load(wav_path)
        if wav.shape[0] > 1:
            # downmix multi-channel to mono before mel extraction
            wav = wav.mean(dim=0, keepdim=True)
        if sr != _SAMPLE_RATE:
            wav = torchaudio.functional.resample(wav, sr, _SAMPLE_RATE)
        mel = _get_mel_tf()(wav).squeeze(0).T  # (T_mel, N_MELS)
        # clamp before log to avoid -inf on silent frames
        mel = torch.log(mel.clamp(min=1e-5))

        # --- Features (kept as float32 tensors) ---
        ppg = torch.tensor(np.load(ppg_path)).float()  # (T_feat, 1280)
        hubert = torch.tensor(np.load(hubert_path)).float()  # (T_feat, 256)
        f0_raw = torch.tensor(np.load(f0_path)).float()  # (T_feat,)
        # voiced frames → log-F0 (clamped at 1 Hz); unvoiced stay exactly 0
        f0 = torch.where(f0_raw > 0,
                         torch.log(f0_raw.clamp(min=1.0)),
                         torch.zeros_like(f0_raw))  # log-F0, 0 = unvoiced
        spk_emb = torch.tensor(np.load(spk_path)).float()  # (256,)

        torch.save({"mel": mel, "ppg": ppg, "hubert": hubert,
                    "f0": f0, "spk": spk_emb}, out_path)
        return True, file_id

    except Exception as e:
        # best-effort: report the failed clip instead of aborting the pool
        return False, f"{file_id}: {e}"
print(f">>>>>>>>>>{spk}<<<<<<<<<< ({len(files)} files)") + + ok = fail = skip = 0 + with ThreadPoolExecutor(max_workers=n_workers) as executor: + futures = { + executor.submit( + pack_file, fid, spk, + args.wav, args.ppg, args.hubert, args.f0, args.spk, args.out + ): fid for fid in files + } + for future in tqdm(as_completed(futures), total=len(futures)): + success, info = future.result() + if success: + if info == futures[future]: # not skipped + ok += 1 + else: + skip += 1 + else: + fail += 1 + tqdm.write(f" WARN: {info}") + + print(f" packed={ok} skipped(existed)={skip} failed={fail}") + + print("Done. Update training commands to use --packed_dir instead of separate feature dirs.") diff --git a/prepare/preprocess_ppg.py b/prepare/preprocess_ppg.py new file mode 100644 index 0000000000000000000000000000000000000000..de6a2849839fa58fa29a13af40d05e0dd2c9f0f0 --- /dev/null +++ b/prepare/preprocess_ppg.py @@ -0,0 +1,87 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import numpy as np +import argparse +import torch +import random +from whisper.model import Whisper, ModelDimensions +from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram + + +def select_device_and_precision(): + if torch.cuda.is_available(): + return torch.device("cuda"), True + if torch.backends.mps.is_available(): + return torch.device("mps"), False + return torch.device("cpu"), False + + +def load_model(path) -> tuple[Whisper, bool]: + device, use_fp16 = select_device_and_precision() + checkpoint = torch.load(path, map_location="cpu") + dims = ModelDimensions(**checkpoint["dims"]) + print(dims) + model = Whisper(dims) + del model.decoder + cut = len(model.encoder.blocks) // 4 + cut = -1 * cut + del model.encoder.blocks[cut:] + model.load_state_dict(checkpoint["model_state_dict"], strict=False) + model.eval() + if use_fp16: + model.half() + else: + model.float() + model.to(device) + return model, use_fp16 + + +def pred_ppg(whisper: 
Whisper, wavPath, ppgPath, use_fp16: bool): + audio = load_audio(wavPath) + audln = audio.shape[0] + ppgln = audln // 320 + audio = pad_or_trim(audio) + mel = log_mel_spectrogram(audio) + if use_fp16: + mel = mel.half() + else: + mel = mel.float() + mel = mel.to(whisper.device) + with torch.no_grad(): + ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy() + ppg = ppg[:ppgln,] # [length, dim=1280] + # print(ppg.shape) + np.save(ppgPath, ppg, allow_pickle=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.description = 'please enter embed parameter ...' + parser.add_argument("-w", "--wav", help="wav", dest="wav") + parser.add_argument("-p", "--ppg", help="ppg", dest="ppg") + args = parser.parse_args() + print(args.wav) + print(args.ppg) + + os.makedirs(args.ppg, exist_ok=True) + wavPath = args.wav + ppgPath = args.ppg + + whisper, use_fp16 = load_model(os.path.join("whisper_pretrain", "large-v2.pt")) + print(f"device={whisper.device}, precision={'fp16' if use_fp16 else 'fp32'}") + spkPaths = os.listdir(wavPath) + random.shuffle(spkPaths) + + for spks in spkPaths: + if os.path.isdir(f"./{wavPath}/{spks}"): + os.makedirs(f"./{ppgPath}/{spks}", exist_ok=True) + print(f">>>>>>>>>>{spks}<<<<<<<<<<") + for file in os.listdir(f"./{wavPath}/{spks}"): + if file.endswith(".wav"): + # print(file) + file = file[:-4] + path_wav = f"{wavPath}/{spks}/{file}.wav" + path_ppg = f"{ppgPath}/{spks}/{file}.ppg" + if os.path.isfile(f"{path_ppg}.npy"): + continue + pred_ppg(whisper, path_wav, path_ppg, use_fp16) diff --git a/prepare/preprocess_speaker.py b/prepare/preprocess_speaker.py new file mode 100644 index 0000000000000000000000000000000000000000..e3872feed1545c83706b8d280c1e7dc7d062003e --- /dev/null +++ b/prepare/preprocess_speaker.py @@ -0,0 +1,124 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import torch +import numpy as np +import argparse +import multiprocessing + +from 
functools import partial +from multiprocessing.pool import ThreadPool +from tqdm import tqdm +from tqdm import tqdm +from argparse import RawTextHelpFormatter +from speaker.models.lstm import LSTMSpeakerEncoder +from speaker.config import SpeakerEncoderConfig +from speaker.utils.audio import AudioProcessor +from speaker.infer import read_json + + +def str2bool(value): + if isinstance(value, bool): + return value + value = value.strip().lower() + if value in {"1", "true", "yes", "y", "on"}: + return True + if value in {"0", "false", "no", "n", "off"}: + return False + raise argparse.ArgumentTypeError("Expected a boolean value.") + + +def get_spk_wavs(dataset_path, output_path): + wav_files = [] + if not os.path.exists(f"./{output_path}"): + os.makedirs(f"./{output_path}") + for spks in os.listdir(dataset_path): + if os.path.isdir(f"./{dataset_path}/{spks}"): + if not os.path.exists(f"./{output_path}/{spks}"): + os.makedirs(f"./{output_path}/{spks}") + for file in os.listdir(f"./{dataset_path}/{spks}"): + if file.endswith(".wav"): + wav_files.append(f"./{dataset_path}/{spks}/{file}") + elif spks.endswith(".wav"): + wav_files.append(f"./{dataset_path}/{spks}") + return wav_files + +def process_wav(wav_file, dataset_path, output_path, args, speaker_encoder_ap, speaker_encoder): + waveform = speaker_encoder_ap.load_wav( + wav_file, sr=speaker_encoder_ap.sample_rate + ) + spec = speaker_encoder_ap.melspectrogram(waveform) + spec = torch.from_numpy(spec.T) + if args.use_cuda: + spec = spec.cuda() + spec = spec.unsqueeze(0) + embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy() + embed = embed.squeeze() + embed_path = wav_file.replace(dataset_path, output_path) + embed_path = embed_path.replace(".wav", ".spk") + np.save(embed_path, embed, allow_pickle=False) + +def extract_speaker_embeddings(wav_files, dataset_path, output_path, args, speaker_encoder_ap, speaker_encoder, concurrency): + bound_process_wav = partial(process_wav, dataset_path=dataset_path, 
output_path=output_path, args=args, speaker_encoder_ap=speaker_encoder_ap, speaker_encoder=speaker_encoder) + + with ThreadPool(concurrency) as pool: + list(tqdm(pool.imap(bound_process_wav, wav_files), total=len(wav_files))) + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + description="""Compute embedding vectors for each wav file in a dataset.""", + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("dataset_path", type=str, help="Path to dataset waves.") + parser.add_argument( + "output_path", type=str, help="path for output speaker/speaker_wavs.npy." + ) + parser.add_argument("--use_cuda", type=str2bool, nargs="?", const=True, default=None, help="flag to set cuda") + parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) + args = parser.parse_args() + dataset_path = args.dataset_path + output_path = args.output_path + thread_count = args.thread_count + + if args.use_cuda is None: + args.use_cuda = torch.cuda.is_available() + if args.use_cuda and not torch.cuda.is_available(): + print("CUDA requested but not available. Falling back to CPU for speaker encoder.") + args.use_cuda = False + + # We will let Speaker Encoder run on CPU for now since its LSTM implementation + # may have hardcoded .cuda() calls, but CREPE is the main 10 min bottleneck. 
+ print(f"use_cuda={args.use_cuda}") + # model + args.model_path = os.path.join("speaker_pretrain", "best_model.pth.tar") + args.config_path = os.path.join("speaker_pretrain", "config.json") + # config + config_dict = read_json(args.config_path) + + # model + config = SpeakerEncoderConfig(config_dict) + config.from_dict(config_dict) + + speaker_encoder = LSTMSpeakerEncoder( + config.model_params["input_dim"], + config.model_params["proj_dim"], + config.model_params["lstm_dim"], + config.model_params["num_lstm_layers"], + ) + + speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=args.use_cuda) + + # preprocess + speaker_encoder_ap = AudioProcessor(**config.audio) + # normalize the input audio level and trim silences + speaker_encoder_ap.do_sound_norm = True + speaker_encoder_ap.do_trim_silence = True + + wav_files = get_spk_wavs(dataset_path, output_path) + + if thread_count == 0: + process_num = os.cpu_count() + else: + process_num = thread_count + + extract_speaker_embeddings(wav_files, dataset_path, output_path, args, speaker_encoder_ap, speaker_encoder, process_num) diff --git a/prepare/preprocess_speaker_ave.py b/prepare/preprocess_speaker_ave.py new file mode 100644 index 0000000000000000000000000000000000000000..61bfe0865e6b5eacca2a81d5ffdb504d2c6597c1 --- /dev/null +++ b/prepare/preprocess_speaker_ave.py @@ -0,0 +1,50 @@ +import os +import argparse +import numpy as np +from tqdm import tqdm +from concurrent.futures import ProcessPoolExecutor, as_completed + +def load_embed_file(file, data_speaker, speaker): + if file.endswith(".npy"): + source_embed = np.load( + os.path.join(data_speaker, speaker, file)) + source_embed = source_embed.astype(np.float32) + return source_embed + return None + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.description = 'please enter embed parameter ...' 
+ parser.add_argument("dataset_speaker", type=str) + parser.add_argument("dataset_singer", type=str) + parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) + + data_speaker = parser.parse_args().dataset_speaker + data_singer = parser.parse_args().dataset_singer + thread_count = parser.parse_args().thread_count + + if not os.path.exists(data_singer): + os.makedirs(data_singer) + + for speaker in os.listdir(data_speaker): + print(speaker) + subfile_num = 0 + speaker_ave = 0 + if thread_count == 0: + process_num = os.cpu_count() + else: + process_num = thread_count + + with ProcessPoolExecutor(max_workers=process_num) as executor: + futures = [executor.submit(load_embed_file, file, data_speaker, speaker) for file in os.listdir(os.path.join(data_speaker, speaker))] + for future in tqdm(as_completed(futures)): + source_embed = future.result() + if source_embed is not None: + speaker_ave = speaker_ave + source_embed + subfile_num = subfile_num + 1 + if subfile_num == 0: + continue + speaker_ave = speaker_ave / subfile_num + + np.save(os.path.join(data_singer, f"{speaker}.spk.npy"), + speaker_ave, allow_pickle=False) diff --git a/prepare/preprocess_spec.py b/prepare/preprocess_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..843747f6455a5d0b774634e5948d965bfc3e76fc --- /dev/null +++ b/prepare/preprocess_spec.py @@ -0,0 +1,63 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import torch +import argparse +import multiprocessing +from concurrent.futures import ProcessPoolExecutor, as_completed +from tqdm import tqdm +from vits import spectrogram +from vits import utils +from omegaconf import OmegaConf + + +def compute_spec(hps, filename, specname): + audio, sampling_rate = utils.load_wav_to_torch(filename) + assert sampling_rate == hps.sampling_rate, f"{sampling_rate} is not {hps.sampling_rate}" + audio_norm 
= audio / hps.max_wav_value + audio_norm = audio_norm.unsqueeze(0) + n_fft = hps.filter_length + sampling_rate = hps.sampling_rate + hop_size = hps.hop_length + win_size = hps.win_length + spec = spectrogram.spectrogram_torch( + audio_norm, n_fft, sampling_rate, hop_size, win_size, center=False) + spec = torch.squeeze(spec, 0) + torch.save(spec, specname) + + +def process_file(file, hps_data, wavPath, spks, spePath): + if file.endswith(".wav"): + file = file[:-4] + compute_spec(hps_data, f"{wavPath}/{spks}/{file}.wav", f"{spePath}/{spks}/{file}.pt") + +def process_files_with_thread_pool(wavPath, spks, max_workers, hps_data, spePath): + files = os.listdir(f"./{wavPath}/{spks}") + with ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = [executor.submit(process_file, file, hps_data, wavPath, spks, spePath) for file in files] + for future in tqdm(as_completed(futures), total=len(futures)): + future.result() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.description = 'please enter embed parameter ...' 
+ parser.add_argument("-w", "--wav", help="wav", dest="wav") + parser.add_argument("-s", "--spe", help="spe", dest="spe") + parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1) + args = parser.parse_args() + print(args.wav) + print(args.spe) + if not os.path.exists(args.spe): + os.makedirs(args.spe) + wavPath = args.wav + spePath = args.spe + hps = OmegaConf.load("./configs/base.yaml") + + for spks in os.listdir(wavPath): + if os.path.isdir(f"./{wavPath}/{spks}"): + if not os.path.exists(f"./{spePath}/{spks}"): + os.makedirs(f"./{spePath}/{spks}") + if args.thread_count == 0: + process_num = os.cpu_count() + else: + process_num = args.thread_count + process_files_with_thread_pool(wavPath, spks, process_num, hps.data, spePath) diff --git a/prepare/preprocess_train.py b/prepare/preprocess_train.py new file mode 100644 index 0000000000000000000000000000000000000000..248c0822e83b6e21fcd4b1c4104d9b4a151b9e78 --- /dev/null +++ b/prepare/preprocess_train.py @@ -0,0 +1,62 @@ +import os +import random + + +def print_error(info): + print(f"\033[31m File isn't existed: {info}\033[0m") + + +if __name__ == "__main__": + os.makedirs("./files/", exist_ok=True) + + rootPath = "./data_svc/waves-32k/" + all_items = [] + for spks in os.listdir(f"./{rootPath}"): + if not os.path.isdir(f"./{rootPath}/{spks}"): + continue + print(f"./{rootPath}/{spks}") + for file in os.listdir(f"./{rootPath}/{spks}"): + if file.endswith(".wav"): + file = file[:-4] + path_spk = f"./data_svc/speaker/{spks}/{file}.spk.npy" + path_wave = f"./data_svc/waves-32k/{spks}/{file}.wav" + path_spec = f"./data_svc/specs/{spks}/{file}.pt" + path_pitch = f"./data_svc/pitch/{spks}/{file}.pit.npy" + path_hubert = f"./data_svc/hubert/{spks}/{file}.vec.npy" + path_whisper = f"./data_svc/whisper/{spks}/{file}.ppg.npy" + has_error = 0 + if not os.path.isfile(path_spk): + print_error(path_spk) + has_error = 1 + if not 
os.path.isfile(path_wave): + print_error(path_wave) + has_error = 1 + if not os.path.isfile(path_spec): + print_error(path_spec) + has_error = 1 + if not os.path.isfile(path_pitch): + print_error(path_pitch) + has_error = 1 + if not os.path.isfile(path_hubert): + print_error(path_hubert) + has_error = 1 + if not os.path.isfile(path_whisper): + print_error(path_whisper) + has_error = 1 + if has_error == 0: + all_items.append( + f"{path_wave}|{path_spec}|{path_pitch}|{path_hubert}|{path_whisper}|{path_spk}") + + random.shuffle(all_items) + valids = all_items[:2] + valids.sort() + trains = all_items[2:] + # trains.sort() + fw = open("./files/valid.txt", "w", encoding="utf-8") + for strs in valids: + print(strs, file=fw) + fw.close() + fw = open("./files/train.txt", "w", encoding="utf-8") + for strs in trains: + print(strs, file=fw) + fw.close() diff --git a/prepare/preprocess_zzz.py b/prepare/preprocess_zzz.py new file mode 100644 index 0000000000000000000000000000000000000000..79e62a97271a9c5f14e220063900d48f09207c61 --- /dev/null +++ b/prepare/preprocess_zzz.py @@ -0,0 +1,31 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from tqdm import tqdm +from torch.utils.data import DataLoader +from omegaconf import OmegaConf +from vits.data_utils import TextAudioSpeakerSet +from vits.data_utils import TextAudioSpeakerCollate +from vits.data_utils import DistributedBucketSampler + + +hps = OmegaConf.load("./configs/base.yaml") +dataset = TextAudioSpeakerSet("files/valid.txt", hps.data) + +for _ in tqdm(dataset): + pass + + +sampler = DistributedBucketSampler( + dataset, + 4, + [150, 300, 450], + num_replicas=1, + rank=0, + shuffle=True) +collate_fn = TextAudioSpeakerCollate() +loader = DataLoader(dataset, num_workers=0, shuffle=False, pin_memory=True, + collate_fn=collate_fn, batch_sampler=sampler) + + +for _ in tqdm(loader): + pass diff --git a/preprocess_teacher.py b/preprocess_teacher.py new file mode 100644 index 
0000000000000000000000000000000000000000..197ec543c62d2c909eafddb91954db750856ee32 --- /dev/null +++ b/preprocess_teacher.py @@ -0,0 +1,122 @@ +import argparse +import glob +import os + +import numpy as np +import torch +from omegaconf import OmegaConf + +from models.codec_wrapper import CodecWrapper +from vits.models import SynthesizerInfer + + +def load_teacher_model(config_path, checkpoint_path, device): + hp = OmegaConf.load(config_path) + teacher = SynthesizerInfer( + hp.data.filter_length // 2 + 1, + hp.data.segment_size // hp.data.hop_length, + hp, + ).to(device) + + checkpoint = torch.load(checkpoint_path, map_location="cpu") + saved_state_dict = checkpoint["model_g"] if "model_g" in checkpoint else checkpoint + model_state = teacher.state_dict() + load_state = {k: saved_state_dict.get(k, v) for k, v in model_state.items()} + teacher.load_state_dict(load_state, strict=False) + teacher.eval() + for p in teacher.parameters(): + p.requires_grad = False + return teacher, int(hp.data.sampling_rate) + + +def load_cond_features(speaker_dir, file_id, data_root="./data_svc"): + ppg = np.load(f"{data_root}/whisper/{speaker_dir}/{file_id}.ppg.npy") + hubert = np.load(f"{data_root}/hubert/{speaker_dir}/{file_id}.vec.npy") + f0 = np.load(f"{data_root}/pitch/{speaker_dir}/{file_id}.pit.npy") + spk = np.load(f"{data_root}/speaker/{speaker_dir}/{file_id}.spk.npy") + + # Match so-vits inference convention: repeat 50Hz features to pitch frame rate. + ppg = np.repeat(ppg, 2, axis=0) + hubert = np.repeat(hubert, 2, axis=0) + + # Trim all to shared length. 
+ t = min(len(f0), len(ppg), len(hubert)) + f0 = f0[:t] + ppg = ppg[:t] + hubert = hubert[:t] + + return ( + torch.tensor(ppg, dtype=torch.float32), + torch.tensor(hubert, dtype=torch.float32), + torch.tensor(f0, dtype=torch.float32), + torch.tensor(spk, dtype=torch.float32), + ) + + +@torch.no_grad() +def generate_teacher_codec_targets(args): + if torch.cuda.is_available(): + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + + teacher, teacher_sr = load_teacher_model(args.teacher_config, args.teacher_ckpt, device) + codec = CodecWrapper(latent_dim=1024).to(device).eval() + + src_files = glob.glob(os.path.join(args.codec_target_dir, "**", "*.pt"), recursive=True) + if not src_files: + raise RuntimeError(f"No source codec targets found under {args.codec_target_dir}") + + os.makedirs(args.out_dir, exist_ok=True) + + processed = 0 + skipped = 0 + for src in src_files: + rel = os.path.relpath(src, args.codec_target_dir) + speaker_dir = os.path.basename(os.path.dirname(src)) + file_id = os.path.basename(src).replace(".pt", "").replace("_ztarget", "") + + out_path = os.path.join(args.out_dir, rel) + out_dir = os.path.dirname(out_path) + os.makedirs(out_dir, exist_ok=True) + + if os.path.isfile(out_path) and not args.overwrite: + continue + + try: + ppg, hubert, f0, spk = load_cond_features(speaker_dir, file_id, data_root=args.data_root) + + ppg = ppg.unsqueeze(0).to(device) + hubert = hubert.unsqueeze(0).to(device) + pit = f0.unsqueeze(0).to(device) # (1, T) + spk = spk.unsqueeze(0).to(device) + lengths = torch.LongTensor([pit.shape[1]]).to(device) + + source = teacher.pitch2source(pit) + wav_teacher = teacher.inference(ppg, hubert, pit, spk, lengths, source) + + z_teacher = codec.encode(wav_teacher, sample_rate=teacher_sr) # (1, 1024, T) + torch.save(z_teacher.cpu(), out_path) + processed += 1 + if processed % args.log_interval == 0: + print(f"Processed {processed} samples...") 
+ except Exception as e: + skipped += 1 + print(f"Skip {speaker_dir}/{file_id}: {e}") + + print(f"Teacher preprocessing done. processed={processed}, skipped={skipped}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--teacher_ckpt", type=str, required=True) + parser.add_argument("--teacher_config", type=str, default="configs/base.yaml") + parser.add_argument("--codec_target_dir", type=str, default="./data_svc/codec_targets") + parser.add_argument("--data_root", type=str, default="./data_svc") + parser.add_argument("--out_dir", type=str, default="./data_svc/teacher_codec_targets") + parser.add_argument("--overwrite", action="store_true") + parser.add_argument("--log_interval", type=int, default=20) + args = parser.parse_args() + generate_teacher_codec_targets(args) diff --git a/requirements.txt b/requirements.txt new file mode 100755 index 0000000000000000000000000000000000000000..b09e4a784e24aea320d5812b39f25b51de1f75d8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +fsspec +pyworld +matplotlib +soundfile +scikit-learn +scipy +torchcrepe +tensorboard +transformers +tqdm +librosa +omegaconf +gradio==3.36.1 +gradio_client==0.2.9 +ruamel.yaml +setuptools<81 diff --git a/run_cluster.sh b/run_cluster.sh new file mode 100644 index 0000000000000000000000000000000000000000..89dfbf773823a6df93d6bd2581a06289f0c71350 --- /dev/null +++ b/run_cluster.sh @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --job-name=cfm_distill +#SBATCH --partition=a30 +#SBATCH --gres=gpu:1 +#SBATCH --time=24:00:00 +#SBATCH --output=logs/distill_%j.out +#SBATCH --error=logs/distill_%j.err +#SBATCH --mail-type=ALL +#SBATCH --mail-user=hl3025@imperial.ac.uk + +echo "Starting Distillation Pipeline on SLURM Cluster..." 
cd /vol/bitbucket/hl3025/cfm_svc
source .venv_linux/bin/activate

export PIP_CACHE_DIR=/vol/bitbucket/hl3025/pip_cache
export TMPDIR=/vol/bitbucket/hl3025/tmp

# Prevent BLAS/OpenMP from spawning too many threads inside the multiprocessing pool
export OMP_NUM_THREADS=1
export OPENBLAS_NUM_THREADS=1
export MKL_NUM_THREADS=1
export VECLIB_MAXIMUM_THREADS=1
export NUMEXPR_NUM_THREADS=1

# Run preprocessing with 8 threads.
# FIX(comment): the SBATCH header does NOT request 8 cores (no --cpus-per-task);
# add "#SBATCH --cpus-per-task=8" above if 8 physical cores should be reserved.
python svc_preprocessing.py -t 8

python data/codec_targets.py -w ./data_svc/waves-32k -o ./data_svc/codec_targets

python preprocess_teacher.py --teacher_ckpt vits_pretrain/sovits5.0.pretrain.pth --teacher_config configs/base.yaml --codec_target_dir ./data_svc/codec_targets --data_root ./data_svc --out_dir ./data_svc/teacher_codec_targets --log_interval 200

echo "Distillation Pipeline Complete!"
diff --git a/run_distillation.sh b/run_distillation.sh new file mode 100755 index 0000000000000000000000000000000000000000..b65b1839341256c6a10fefc7cdef607f56434885 --- /dev/null +++ b/run_distillation.sh @@ -0,0 +1,17 @@
#!/bin/bash

# Navigate to the correct directory
cd /vol/bitbucket/hl3025/cfm_svc

# Activate the local python environment
source .venv_linux/bin/activate

# Execute the preprocessing and distillation scripts sequentially
# Keep the thread count reasonable to avoid running out of memory!
python svc_preprocessing.py -t 4

python data/codec_targets.py -w ./data_svc/waves-32k -o ./data_svc/codec_targets

python preprocess_teacher.py --teacher_ckpt vits_pretrain/sovits5.0.pretrain.pth --teacher_config configs/base.yaml --codec_target_dir ./data_svc/codec_targets --data_root ./data_svc --out_dir ./data_svc/teacher_codec_targets --log_interval 200

echo "Distillation Pipeline Complete!"
diff --git a/samplers/ode.py b/samplers/ode.py new file mode 100644 index 0000000000000000000000000000000000000000..9da7db84d562719d11198f28447789cdb31b8b1c --- /dev/null +++ b/samplers/ode.py @@ -0,0 +1,50 @@ +import torch + +class ODESampler: + def __init__(self, v_theta, steps=12, solver='heun'): + self.v_theta = v_theta + self.steps = steps + self.solver = solver.lower() + + @torch.no_grad() + def sample(self, z_noise, cond): + """ + cond: (B, T, c_dim) + z_noise: (B, T, D) + """ + device = z_noise.device + B, T, D = z_noise.shape + dt = 1.0 / self.steps + z = z_noise.clone() + + for step in range(self.steps): + t = torch.tensor([step * dt] * B, device=device).unsqueeze(-1) # (B, 1) + + if self.solver == 'euler': + v = self.v_theta(z, t, cond) + z = z + v * dt + + elif self.solver == 'heun': + v1 = self.v_theta(z, t, cond) + z_tmp = z + v1 * dt + + t_next = torch.tensor([(step + 1) * dt] * B, device=device).unsqueeze(-1) + v2 = self.v_theta(z_tmp, t_next, cond) + + z = z + 0.5 * (v1 + v2) * dt + + elif self.solver == 'rk4': + v1 = self.v_theta(z, t, cond) + + t_mid = torch.tensor([(step + 0.5) * dt] * B, device=device).unsqueeze(-1) + v2 = self.v_theta(z + 0.5 * v1 * dt, t_mid, cond) + v3 = self.v_theta(z + 0.5 * v2 * dt, t_mid, cond) + + t_next = torch.tensor([(step + 1) * dt] * B, device=device).unsqueeze(-1) + v4 = self.v_theta(z + v3 * dt, t_next, cond) + + z = z + (1.0/6.0) * (v1 + 2*v2 + 2*v3 + v4) * dt + else: + raise ValueError(f"Unknown solver {self.solver}") + + return z diff --git a/segment_opensinger.py b/segment_opensinger.py new file mode 100644 index 0000000000000000000000000000000000000000..291dc532bb698762b13bbd08bb89d3cdbad0d589 --- /dev/null +++ b/segment_opensinger.py @@ -0,0 +1,97 @@ +import os +import glob +import librosa +import soundfile as sf +import argparse +from tqdm import tqdm + +def process_file(in_path, out_dir, min_sec=3.0, max_sec=15.0, top_db=40, sr=44100): + os.makedirs(out_dir, exist_ok=True) + try: + # Load audio (librosa 
converts it to mono by default) + y, _ = librosa.load(in_path, sr=sr) + except Exception as e: + print(f"Failed to load {in_path}: {e}") + return + + # Split audio on silence, returns intervals of (start_idx, end_idx) + intervals = librosa.effects.split(y, top_db=top_db) + + # Merge tiny intervals dynamically to enforce min_sec and max_sec lengths + merged_intervals = [] + cur_start = None + cur_end = None + + for start, end in intervals: + if cur_start is None: + cur_start = start + cur_end = end + else: + # If we add this new interval, does it exceed the max allowed length? + if (end - cur_start) / sr > max_sec: + # We exceeded max len. Commit the current chunk and start fresh. + merged_intervals.append((cur_start, cur_end)) + cur_start = start + cur_end = end + else: + # Merge them + cur_end = end + + if cur_start is not None: + merged_intervals.append((cur_start, cur_end)) + + base_name = os.path.basename(in_path).replace(".wav", "").replace(".", "_") + + saved_chunks = 0 + for i, (start, end) in enumerate(merged_intervals): + duration = (end - start) / sr + + # If the chunk is ridiculously short, don't keep it (unless it's the only one) + if duration < min_sec and len(merged_intervals) > 1: + continue + + chunk_data = y[start:end] + out_filename = os.path.join(out_dir, f"{base_name}_{i:04d}.wav") + sf.write(out_filename, chunk_data, sr) + saved_chunks += 1 + + return saved_chunks + +def segment_dataset(input_dir, output_dir, sr=44100, top_db=40): + wavs = glob.glob(os.path.join(input_dir, "**", "*.wav"), recursive=True) + if not wavs: + print(f"No .wav files found in {input_dir}.") + return + + print(f"Found {len(wavs)} huge .wav files. 
Preparing to segment into clips...") + + total_clips = 0 + for w in tqdm(wavs): + # Determine speaker ID by reading the parent folder structure under the input_dir + rel_path = os.path.relpath(w, input_dir) + parts = rel_path.split(os.sep) + + # Usually OpenSinger is formatted as OpenSinger/Singer_XX/song_YY.wav + # Use the first sub-folder as the speaker namespace + if len(parts) > 1: + speaker_domain = parts[0] + else: + speaker_domain = "singer_00" + + out_d = os.path.join(output_dir, speaker_domain) + chunks_created = process_file(w, out_d, sr=sr, top_db=top_db) + if chunks_created: + total_clips += chunks_created + + print(f"\nSegmentation complete! Sliced into {total_clips} valid distillation chunks.") + print(f"Check results in {output_dir}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Cleanly slice continuous massive dataset wavs into optimal batch lengths.") + parser.add_argument("--input_dir", type=str, default="./opensinger", help="Folder containing raw continuous dataset") + parser.add_argument("--output_dir", type=str, default="./dataset_raw", help="Folder mapping where slices go for train prep") + parser.add_argument("--sr", type=int, default=44100, help="Universal resample rate") + parser.add_argument("--top_db", type=int, default=40, help="DB threshold for silence trimming") + args = parser.parse_args() + + segment_dataset(args.input_dir, args.output_dir, sr=args.sr, top_db=args.top_db) diff --git a/slides/gradient_flow_dag.aux b/slides/gradient_flow_dag.aux new file mode 100644 index 0000000000000000000000000000000000000000..9fd2b9917d51b63c915125d2eab6d7e1b35b5223 --- /dev/null +++ b/slides/gradient_flow_dag.aux @@ -0,0 +1,20 @@ +\relax +\providecommand\hyper@newdestlabel[2]{} +\providecommand\HyField@AuxAddToFields[1]{} +\providecommand\HyField@AuxAddToCoFields[2]{} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{1}{1/1}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {1}{1}}} 
+\@writefile{nav}{\headcommand {\slideentry {0}{0}{2}{2/2}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {2}{2}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{3}{3/3}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {3}{3}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{4}{4/4}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {4}{4}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{5}{5/5}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {5}{5}}} +\@writefile{nav}{\headcommand {\beamer@partpages {1}{5}}} +\@writefile{nav}{\headcommand {\beamer@subsectionpages {1}{5}}} +\@writefile{nav}{\headcommand {\beamer@sectionpages {1}{5}}} +\@writefile{nav}{\headcommand {\beamer@documentpages {5}}} +\@writefile{nav}{\headcommand {\gdef \inserttotalframenumber {5}}} +\gdef \@abspage@last{5} diff --git a/slides/gradient_flow_dag.log b/slides/gradient_flow_dag.log new file mode 100644 index 0000000000000000000000000000000000000000..0e21998aa7aafb1dd2fc84b5f7b335d26beb718d --- /dev/null +++ b/slides/gradient_flow_dag.log @@ -0,0 +1,1159 @@ +This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023/Debian) (preloaded format=pdflatex 2026.1.13) 5 MAR 2026 23:10 +entering extended mode + restricted \write18 enabled. + %&-line parsing enabled. 
+**gradient_flow_dag.tex +(./gradient_flow_dag.tex +LaTeX2e <2023-11-01> patch level 1 +L3 programming layer <2024-01-22> +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamer.cls +Document Class: beamer 2024/01/06 v3.71 A class for typesetting presentations +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasemodes.sty +(/usr/share/texlive/texmf-dist/tex/latex/etoolbox/etoolbox.sty +Package: etoolbox 2020/10/05 v2.5k e-TeX tools for LaTeX (JAW) +\etb@tempcnta=\count187 +) +\beamer@tempbox=\box51 +\beamer@tempcount=\count188 +\c@beamerpauses=\count189 + +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasedecode.sty +\beamer@slideinframe=\count190 +\beamer@minimum=\count191 +\beamer@decode@box=\box52 +) +\beamer@commentbox=\box53 +\beamer@modecount=\count192 +) +(/usr/share/texlive/texmf-dist/tex/generic/iftex/iftex.sty +Package: iftex 2022/02/03 v1.0f TeX engine tests +) +\headdp=\dimen140 +\footheight=\dimen141 +\sidebarheight=\dimen142 +\beamer@tempdim=\dimen143 +\beamer@finalheight=\dimen144 +\beamer@animht=\dimen145 +\beamer@animdp=\dimen146 +\beamer@animwd=\dimen147 +\beamer@leftmargin=\dimen148 +\beamer@rightmargin=\dimen149 +\beamer@leftsidebar=\dimen150 +\beamer@rightsidebar=\dimen151 +\beamer@boxsize=\dimen152 +\beamer@vboxoffset=\dimen153 +\beamer@descdefault=\dimen154 +\beamer@descriptionwidth=\dimen155 +\beamer@lastskip=\skip48 +\beamer@areabox=\box54 +\beamer@animcurrent=\box55 +\beamer@animshowbox=\box56 +\beamer@sectionbox=\box57 +\beamer@logobox=\box58 +\beamer@linebox=\box59 +\beamer@sectioncount=\count193 +\beamer@subsubsectionmax=\count194 +\beamer@subsectionmax=\count195 +\beamer@sectionmax=\count196 +\beamer@totalheads=\count197 +\beamer@headcounter=\count198 +\beamer@partstartpage=\count199 +\beamer@sectionstartpage=\count266 +\beamer@subsectionstartpage=\count267 +\beamer@animationtempa=\count268 +\beamer@animationtempb=\count269 +\beamer@xpos=\count270 +\beamer@ypos=\count271 +\beamer@ypos@offset=\count272 
+\beamer@showpartnumber=\count273 +\beamer@currentsubsection=\count274 +\beamer@coveringdepth=\count275 +\beamer@sectionadjust=\count276 +\beamer@toclastsection=\count277 +\beamer@tocsectionnumber=\count278 + +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseoptions.sty +(/usr/share/texlive/texmf-dist/tex/latex/graphics/keyval.sty +Package: keyval 2022/05/29 v1.15 key=value parser (DPC) +\KV@toks@=\toks17 +)) +\beamer@paperwidth=\skip49 +\beamer@paperheight=\skip50 + +(/usr/share/texlive/texmf-dist/tex/latex/geometry/geometry.sty +Package: geometry 2020/01/02 v5.9 Page Geometry + +(/usr/share/texlive/texmf-dist/tex/generic/iftex/ifvtex.sty +Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead. +) +\Gm@cnth=\count279 +\Gm@cntv=\count280 +\c@Gm@tempcnt=\count281 +\Gm@bindingoffset=\dimen156 +\Gm@wd@mp=\dimen157 +\Gm@odd@mp=\dimen158 +\Gm@even@mp=\dimen159 +\Gm@layoutwidth=\dimen160 +\Gm@layoutheight=\dimen161 +\Gm@layouthoffset=\dimen162 +\Gm@layoutvoffset=\dimen163 +\Gm@dimlist=\toks18 +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/math/pgfmath.sty +(/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.tex +\pgfutil@everybye=\toks19 +\pgfutil@tempdima=\dimen164 +\pgfutil@tempdimb=\dimen165 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def +\pgfutil@abb=\box60 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgf/pgf.revision.tex) +Package: pgfrcs 2023-01-15 v3.1.10 (3.1.10) +)) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex +\pgfkeys@pathtoks=\toks20 +\pgfkeys@temptoks=\toks21 + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfkeyslibraryfiltered +.code.tex +\pgfkeys@tmptoks=\toks22 +))) 
+(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex +\pgf@x=\dimen166 +\pgf@xa=\dimen167 +\pgf@xb=\dimen168 +\pgf@xc=\dimen169 +\pgf@y=\dimen170 +\pgf@ya=\dimen171 +\pgf@yb=\dimen172 +\pgf@yc=\dimen173 +\c@pgf@counta=\count282 +\c@pgf@countb=\count283 +\c@pgf@countc=\count284 +\c@pgf@countd=\count285 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex +\pgfmath@dimen=\dimen174 +\pgfmath@count=\count286 +\pgfmath@box=\box61 +\pgfmath@toks=\toks23 +\pgfmath@stack@operand=\toks24 +\pgfmath@stack@operation=\toks25 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic.code +.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigonomet +ric.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.random.cod +e.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.comparison +.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.code. +tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round.code +.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.code. 
+tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integerari +thmetics.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex +\c@pgfmathroundto@lastzeros=\count287 +))) +(/usr/share/texlive/texmf-dist/tex/latex/base/size11.clo +File: size11.clo 2023/05/17 v1.4n Standard LaTeX file (size option) +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty +(/usr/share/texlive/texmf-dist/tex/latex/graphics/graphicx.sty +Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR) + +(/usr/share/texlive/texmf-dist/tex/latex/graphics/graphics.sty +Package: graphics 2022/03/10 v1.4e Standard LaTeX Graphics (DPC,SPQR) + +(/usr/share/texlive/texmf-dist/tex/latex/graphics/trig.sty +Package: trig 2021/08/11 v1.11 sin cos tan (DPC) +) +(/usr/share/texlive/texmf-dist/tex/latex/graphics-cfg/graphics.cfg +File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration +) +Package graphics Info: Driver file: pdftex.def on input line 107. 
+ +(/usr/share/texlive/texmf-dist/tex/latex/graphics-def/pdftex.def +File: pdftex.def 2022/09/22 v1.2b Graphics/color driver for pdftex +)) +\Gin@req@height=\dimen175 +\Gin@req@width=\dimen176 +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex +Package: pgfsys 2023-01-15 v3.1.10 (3.1.10) +\pgf@x=\dimen177 +\pgf@y=\dimen178 +\pgf@xa=\dimen179 +\pgf@ya=\dimen180 +\pgf@xb=\dimen181 +\pgf@yb=\dimen182 +\pgf@xc=\dimen183 +\pgf@yc=\dimen184 +\pgf@xd=\dimen185 +\pgf@yd=\dimen186 +\w@pgf@writea=\write3 +\r@pgf@reada=\read2 +\c@pgf@counta=\count288 +\c@pgf@countb=\count289 +\c@pgf@countc=\count290 +\c@pgf@countd=\count291 +\t@pgf@toka=\toks26 +\t@pgf@tokb=\toks27 +\t@pgf@tokc=\toks28 +\pgf@sys@id@count=\count292 + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg +File: pgf.cfg 2023-01-15 v3.1.10 (3.1.10) +) +Driver file for pgf: pgfsys-pdftex.def + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-pdftex.def +File: pgfsys-pdftex.def 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-pdf.de +f +File: pgfsys-common-pdf.def 2023-01-15 v3.1.10 (3.1.10) +))) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code. +tex +File: pgfsyssoftpath.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfsyssoftpath@smallbuffer@items=\count293 +\pgfsyssoftpath@bigbuffer@items=\count294 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code. +tex +File: pgfsysprotocol.code.tex 2023-01-15 v3.1.10 (3.1.10) +)) (/usr/share/texlive/texmf-dist/tex/latex/xcolor/xcolor.sty +Package: xcolor 2023/11/15 v3.01 LaTeX color extensions (UK) + +(/usr/share/texlive/texmf-dist/tex/latex/graphics-cfg/color.cfg +File: color.cfg 2016/01/02 v1.6 sample color configuration +) +Package xcolor Info: Driver file: pdftex.def on input line 274. 
+ +(/usr/share/texlive/texmf-dist/tex/latex/graphics/mathcolor.ltx) +Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1350. +Package xcolor Info: Model `hsb' substituted by `rgb' on input line 1354. +Package xcolor Info: Model `RGB' extended on input line 1366. +Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1368. +Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1369. +Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1370. +Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1371. +Package xcolor Info: Model `Gray' substituted by `gray' on input line 1372. +Package xcolor Info: Model `wave' substituted by `hsb' on input line 1373. +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex +Package: pgfcore 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfint.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.code.te +x +File: pgfcorepoints.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@picminx=\dimen187 +\pgf@picmaxx=\dimen188 +\pgf@picminy=\dimen189 +\pgf@picmaxy=\dimen190 +\pgf@pathminx=\dimen191 +\pgf@pathmaxx=\dimen192 +\pgf@pathminy=\dimen193 +\pgf@pathmaxy=\dimen194 +\pgf@xx=\dimen195 +\pgf@xy=\dimen196 +\pgf@yx=\dimen197 +\pgf@yy=\dimen198 +\pgf@zx=\dimen199 +\pgf@zy=\dimen256 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconstruct. 
+code.tex +File: pgfcorepathconstruct.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@path@lastx=\dimen257 +\pgf@path@lasty=\dimen258 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage.code +.tex +File: pgfcorepathusage.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@shorten@end@additional=\dimen259 +\pgf@shorten@start@additional=\dimen260 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.code.te +x +File: pgfcorescopes.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfpic=\box62 +\pgf@hbox=\box63 +\pgf@layerbox@main=\box64 +\pgf@picture@serial@count=\count295 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicstate.c +ode.tex +File: pgfcoregraphicstate.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgflinewidth=\dimen261 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransformation +s.code.tex +File: pgfcoretransformations.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@pt@x=\dimen262 +\pgf@pt@y=\dimen263 +\pgf@pt@temp=\dimen264 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.code.tex +File: pgfcorequick.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.code.t +ex +File: pgfcoreobjects.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathprocessing +.code.tex +File: pgfcorepathprocessing.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.code.te +x +File: pgfcorearrows.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfarrowsep=\dimen265 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.code.tex +File: pgfcoreshade.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@max=\dimen266 +\pgf@sys@shading@range@num=\count296 +\pgf@shadingcount=\count297 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.code.tex +File: pgfcoreimage.code.tex 2023-01-15 v3.1.10 
(3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.code. +tex +File: pgfcoreexternal.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfexternal@startupbox=\box65 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.code.te +x +File: pgfcorelayers.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransparency.c +ode.tex +File: pgfcoretransparency.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.code. +tex +File: pgfcorepatterns.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.tex +File: pgfcorerdf.code.tex 2023-01-15 v3.1.10 (3.1.10) +))) (/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/xxcolor.sty +Package: xxcolor 2003/10/24 ver 0.1 +\XC@nummixins=\count298 +\XC@countmixins=\count299 +) +(/usr/share/texlive/texmf-dist/tex/latex/base/atbegshi-ltx.sty +Package: atbegshi-ltx 2021/01/10 v1.0c Emulation of the original atbegshi +package with kernel methods +) +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/hyperref.sty +Package: hyperref 2024-01-20 v7.01h Hypertext links for LaTeX + +(/usr/share/texlive/texmf-dist/tex/latex/kvsetkeys/kvsetkeys.sty +Package: kvsetkeys 2022-10-05 v1.19 Key value parser (HO) +) +(/usr/share/texlive/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty +Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO) +) +(/usr/share/texlive/texmf-dist/tex/generic/pdfescape/pdfescape.sty +Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO) + +(/usr/share/texlive/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty +Package: ltxcmds 2023-12-04 v1.26 LaTeX kernel commands for general use (HO) +) +(/usr/share/texlive/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty +Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO +) + 
+(/usr/share/texlive/texmf-dist/tex/generic/infwarerr/infwarerr.sty +Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO) +) +Package pdftexcmds Info: \pdf@primitive is available. +Package pdftexcmds Info: \pdf@ifprimitive is available. +Package pdftexcmds Info: \pdfdraftmode found. +)) +(/usr/share/texlive/texmf-dist/tex/latex/hycolor/hycolor.sty +Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO) +) +(/usr/share/texlive/texmf-dist/tex/latex/auxhook/auxhook.sty +Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO) +) +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/nameref.sty +Package: nameref 2023-11-26 v2.56 Cross-referencing by name of section + +(/usr/share/texlive/texmf-dist/tex/latex/refcount/refcount.sty +Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO) +) +(/usr/share/texlive/texmf-dist/tex/generic/gettitlestring/gettitlestring.sty +Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO) + +(/usr/share/texlive/texmf-dist/tex/latex/kvoptions/kvoptions.sty +Package: kvoptions 2022-06-15 v3.15 Key value format for package options (HO) +)) +\c@section@level=\count300 +) +\@linkdim=\dimen267 +\Hy@linkcounter=\count301 +\Hy@pagecounter=\count302 + +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/pd1enc.def +File: pd1enc.def 2024-01-20 v7.01h Hyperref: PDFDocEncoding definition (HO) +Now handling font encoding PD1 ... +... no UTF-8 mapping file for font encoding PD1 +) +(/usr/share/texlive/texmf-dist/tex/generic/intcalc/intcalc.sty +Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO) +) +\Hy@SavedSpaceFactor=\count303 + +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/puenc.def +File: puenc.def 2024-01-20 v7.01h Hyperref: PDF Unicode definition (HO) +Now handling font encoding PU ... +... no UTF-8 mapping file for font encoding PU +) +Package hyperref Info: Option `bookmarks' set `true' on input line 4062. 
+Package hyperref Info: Option `bookmarksopen' set `true' on input line 4062. +Package hyperref Info: Option `implicit' set `false' on input line 4062. +Package hyperref Info: Hyper figures OFF on input line 4179. +Package hyperref Info: Link nesting OFF on input line 4184. +Package hyperref Info: Hyper index ON on input line 4187. +Package hyperref Info: Plain pages OFF on input line 4194. +Package hyperref Info: Backreferencing OFF on input line 4199. +Package hyperref Info: Implicit mode OFF; no redefinition of LaTeX internals. +Package hyperref Info: Bookmarks ON on input line 4446. +\c@Hy@tempcnt=\count304 + +(/usr/share/texlive/texmf-dist/tex/latex/url/url.sty +\Urlmuskip=\muskip16 +Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc. +) +LaTeX Info: Redefining \url on input line 4784. +\XeTeXLinkMargin=\dimen268 + +(/usr/share/texlive/texmf-dist/tex/generic/bitset/bitset.sty +Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO) + +(/usr/share/texlive/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty +Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO +) +)) +\Fld@menulength=\count305 +\Field@Width=\dimen269 +\Fld@charsize=\dimen270 +Package hyperref Info: Hyper figures OFF on input line 6063. +Package hyperref Info: Link nesting OFF on input line 6068. +Package hyperref Info: Hyper index ON on input line 6071. +Package hyperref Info: backreferencing OFF on input line 6078. +Package hyperref Info: Link coloring OFF on input line 6083. +Package hyperref Info: Link coloring with OCG OFF on input line 6088. +Package hyperref Info: PDF/A mode OFF on input line 6093. +\Hy@abspage=\count306 + + +Package hyperref Message: Stopped early. + +) +Package hyperref Info: Driver (autodetected): hpdftex. 
+ (/usr/share/texlive/texmf-dist/tex/latex/hyperref/hpdftex.def +File: hpdftex.def 2024-01-20 v7.01h Hyperref driver for pdfTeX + +(/usr/share/texlive/texmf-dist/tex/latex/base/atveryend-ltx.sty +Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atveryend pac +kage +with kernel methods +) +\Fld@listcount=\count307 +\c@bookmark@seq@number=\count308 + +(/usr/share/texlive/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty +Package: rerunfilecheck 2022-07-10 v1.10 Rerun checks for auxiliary files (HO) + +(/usr/share/texlive/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty +Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO) +) +Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 2 +85. +)) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaserequires.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasecompatibility.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasefont.sty +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/amssymb.sty +Package: amssymb 2013/01/14 v3.01 AMS font symbols + +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/amsfonts.sty +Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support +\@emptytoks=\toks29 +\symAMSa=\mathgroup4 +\symAMSb=\mathgroup5 +LaTeX Font Info: Redeclaring math symbol \hbar on input line 98. +LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold' +(Font) U/euf/m/n --> U/euf/b/n on input line 106. 
+)) +(/usr/share/texlive/texmf-dist/tex/latex/sansmathaccent/sansmathaccent.sty +Package: sansmathaccent 2020/01/31 + +(/usr/share/texlive/texmf-dist/tex/latex/koma-script/scrlfile.sty +Package: scrlfile 2023/07/07 v3.41 KOMA-Script package (file load hooks) + +(/usr/share/texlive/texmf-dist/tex/latex/koma-script/scrlfile-hook.sty +Package: scrlfile-hook 2023/07/07 v3.41 KOMA-Script package (using LaTeX hooks) + + +(/usr/share/texlive/texmf-dist/tex/latex/koma-script/scrlogo.sty +Package: scrlogo 2023/07/07 v3.41 KOMA-Script package (logo) +))))) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetranslator.sty +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator.sty +Package: translator 2021-05-31 v1.12d Easy translation of strings in LaTeX +)) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasemisc.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetwoscreens.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseoverlay.sty +\beamer@argscount=\count309 +\beamer@lastskipcover=\skip51 +\beamer@trivlistdepth=\count310 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetitle.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasesection.sty +\c@lecture=\count311 +\c@part=\count312 +\c@section=\count313 +\c@subsection=\count314 +\c@subsubsection=\count315 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseframe.sty +\beamer@framebox=\box66 +\beamer@frametitlebox=\box67 +\beamer@zoombox=\box68 +\beamer@zoomcount=\count316 +\beamer@zoomframecount=\count317 +\beamer@frametextheight=\dimen271 +\c@subsectionslide=\count318 +\beamer@frametopskip=\skip52 +\beamer@framebottomskip=\skip53 +\beamer@frametopskipautobreak=\skip54 +\beamer@framebottomskipautobreak=\skip55 +\beamer@envbody=\toks30 +\framewidth=\dimen272 +\c@framenumber=\count319 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseverbatim.sty +\beamer@verbatimfileout=\write4 +) 
+(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseframesize.sty +\beamer@splitbox=\box69 +\beamer@autobreakcount=\count320 +\beamer@autobreaklastheight=\dimen273 +\beamer@frametitletoks=\toks31 +\beamer@framesubtitletoks=\toks32 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseframecomponents.sty +\beamer@footins=\box70 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasecolor.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasenotes.sty +\beamer@frameboxcopy=\box71 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetoc.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetemplates.sty +\beamer@sbttoks=\toks33 + +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseauxtemplates.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseboxes.sty +\bmb@box=\box72 +\bmb@colorbox=\box73 +\bmb@boxwidth=\dimen274 +\bmb@boxheight=\dimen275 +\bmb@prevheight=\dimen276 +\bmb@temp=\dimen277 +\bmb@dima=\dimen278 +\bmb@dimb=\dimen279 +\bmb@prevheight=\dimen280 +) +\beamer@blockheadheight=\dimen281 +)) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaselocalstructure.sty +(/usr/share/texlive/texmf-dist/tex/latex/tools/enumerate.sty +Package: enumerate 2023/07/04 v3.00 enumerate extensions (DPC) +\@enLab=\toks34 +) +\beamer@bibiconwidth=\skip56 +\c@figure=\count321 +\c@table=\count322 +\abovecaptionskip=\skip57 +\belowcaptionskip=\skip58 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasenavigation.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasenavigationsymbols.tex +) +\beamer@section@min@dim=\dimen282 +) (/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetheorems.sty +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsmath.sty +Package: amsmath 2023/05/13 v2.17o AMS math features +\@mathmargin=\skip59 + +For additional information on amsmath, use the `?' option. 
+(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amstext.sty +Package: amstext 2021/08/26 v2.01 AMS text + +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsgen.sty +File: amsgen.sty 1999/11/30 v2.0 generic functions +\@emptytoks=\toks35 +\ex@=\dimen283 +)) +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsbsy.sty +Package: amsbsy 1999/11/29 v1.2d Bold Symbols +\pmbraise@=\dimen284 +) +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsopn.sty +Package: amsopn 2022/04/08 v2.04 operator names +) +\inf@bad=\count323 +LaTeX Info: Redefining \frac on input line 234. +\uproot@=\count324 +\leftroot@=\count325 +LaTeX Info: Redefining \overline on input line 399. +LaTeX Info: Redefining \colon on input line 410. +\classnum@=\count326 +\DOTSCASE@=\count327 +LaTeX Info: Redefining \ldots on input line 496. +LaTeX Info: Redefining \dots on input line 499. +LaTeX Info: Redefining \cdots on input line 620. +\Mathstrutbox@=\box74 +\strutbox@=\box75 +LaTeX Info: Redefining \big on input line 722. +LaTeX Info: Redefining \Big on input line 723. +LaTeX Info: Redefining \bigg on input line 724. +LaTeX Info: Redefining \Bigg on input line 725. +\big@size=\dimen285 +LaTeX Font Info: Redeclaring font encoding OML on input line 743. +LaTeX Font Info: Redeclaring font encoding OMS on input line 744. +\macc@depth=\count328 +LaTeX Info: Redefining \bmod on input line 905. +LaTeX Info: Redefining \pmod on input line 910. +LaTeX Info: Redefining \smash on input line 940. +LaTeX Info: Redefining \relbar on input line 970. +LaTeX Info: Redefining \Relbar on input line 971. 
+\c@MaxMatrixCols=\count329 +\dotsspace@=\muskip17 +\c@parentequation=\count330 +\dspbrk@lvl=\count331 +\tag@help=\toks36 +\row@=\count332 +\column@=\count333 +\maxfields@=\count334 +\andhelp@=\toks37 +\eqnshift@=\dimen286 +\alignsep@=\dimen287 +\tagshift@=\dimen288 +\tagwidth@=\dimen289 +\totwidth@=\dimen290 +\lineht@=\dimen291 +\@envbody=\toks38 +\multlinegap=\skip60 +\multlinetaggap=\skip61 +\mathdisplay@stack=\toks39 +LaTeX Info: Redefining \[ on input line 2953. +LaTeX Info: Redefining \] on input line 2954. +) +(/usr/share/texlive/texmf-dist/tex/latex/amscls/amsthm.sty +Package: amsthm 2020/05/29 v2.20.6 +\thm@style=\toks40 +\thm@bodyfont=\toks41 +\thm@headfont=\toks42 +\thm@notefont=\toks43 +\thm@headpunct=\toks44 +\thm@preskip=\skip62 +\thm@postskip=\skip63 +\thm@headsep=\skip64 +\dth@everypar=\toks45 +) +\c@theorem=\count335 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasethemes.sty)) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerthemedefault.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerfontthemedefault.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamercolorthemedefault.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerinnerthemedefault.sty +\beamer@dima=\dimen292 +\beamer@dimb=\dimen293 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerouterthemedefault.sty))) +(/usr/share/texlive/texmf-dist/tex/latex/beamertheme-metropolis/beamerthememetr +opolis.sty +Package: beamerthememetropolis 2017/01/23 v1.2 Metropolis Beamer theme + (/usr/share/texlive/texmf-dist/tex/latex/pgfopts/pgfopts.sty +Package: pgfopts 2014/07/10 v2.1a LaTeX package options with pgfkeys +\pgfopts@list@add@a@toks=\toks46 +\pgfopts@list@add@b@toks=\toks47 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamertheme-metropolis/beamerinnerthem +emetropolis.sty +Package: beamerinnerthememetropolis 2017/01/23 Metropolis inner theme + (/usr/share/texlive/texmf-dist/tex/latex/tools/calc.sty +Package: calc 2023/07/08 v4.3 Infix arithmetic 
(KKT,FJ) +\calc@Acount=\count336 +\calc@Bcount=\count337 +\calc@Adimen=\dimen294 +\calc@Bdimen=\dimen295 +\calc@Askip=\skip65 +\calc@Bskip=\skip66 +LaTeX Info: Redefining \setlength on input line 80. +LaTeX Info: Redefining \addtolength on input line 81. +\calc@Ccount=\count338 +\calc@Cskip=\skip67 +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty +(/usr/share/texlive/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty +Package: pgf 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.code.tex +File: pgfmoduleshapes.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfnodeparttextbox=\box76 +) (/usr/share/texlive/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.tex +File: pgfmoduleplot.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-0-65 +.sty +Package: pgfcomp-version-0-65 2023-01-15 v3.1.10 (3.1.10) +\pgf@nodesepstart=\dimen296 +\pgf@nodesepend=\dimen297 +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-1-18 +.sty +Package: pgfcomp-version-1-18 2023-01-15 v3.1.10 (3.1.10) +)) (/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/pgffor.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex +Package: pgffor 2023-01-15 v3.1.10 (3.1.10) +\pgffor@iter=\dimen298 +\pgffor@skip=\dimen299 +\pgffor@stack=\toks48 +\pgffor@toks=\toks49 +)) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex +Package: tikz 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothandlers +.code.tex +File: pgflibraryplothandlers.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@plot@mark@count=\count339 +\pgfplotmarksize=\dimen300 +) +\tikz@lastx=\dimen301 +\tikz@lasty=\dimen302 +\tikz@lastxsaved=\dimen303 +\tikz@lastysaved=\dimen304 +\tikz@lastmovetox=\dimen305 +\tikz@lastmovetoy=\dimen306 +\tikzleveldistance=\dimen307 +\tikzsiblingdistance=\dimen308 
+\tikz@figbox=\box77 +\tikz@figbox@bg=\box78 +\tikz@tempbox=\box79 +\tikz@tempbox@bg=\box80 +\tikztreelevel=\count340 +\tikznumberofchildren=\count341 +\tikznumberofcurrentchild=\count342 +\tikz@fig@count=\count343 + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.code.tex +File: pgfmodulematrix.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfmatrixcurrentrow=\count344 +\pgfmatrixcurrentcolumn=\count345 +\pgf@matrix@numberofcolumns=\count346 +) +\tikz@expandcount=\count347 + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarytopaths.code.tex +File: tikzlibrarytopaths.code.tex 2023-01-15 v3.1.10 (3.1.10) +))) +\metropolis@titleseparator@linewidth=\skip68 +\metropolis@progressonsectionpage=\skip69 +\metropolis@progressonsectionpage@linewidth=\skip70 +\metropolis@blocksep=\skip71 +\metropolis@blockadjust=\skip72 +\metropolis@parskip=\skip73 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamertheme-metropolis/beamerouterthem +emetropolis.sty +Package: beamerouterthememetropolis 2017/01/23 Metropolis outer theme +\metropolis@frametitle@padding=\skip74 +\metropolis@progressinheadfoot=\skip75 +\metropolis@progressinheadfoot@linewidth=\skip76 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamertheme-metropolis/beamercolorthem +emetropolis.sty +Package: beamercolorthememetropolis 2017/01/23 Metropolis color theme +) +(/usr/share/texlive/texmf-dist/tex/latex/beamertheme-metropolis/beamerfonttheme +metropolis.sty +Package: beamerfontthememetropolis 2017/01/23 Metropolis font theme + (/usr/share/texlive/texmf-dist/tex/generic/iftex/ifxetex.sty +Package: ifxetex 2019/10/25 v0.7 ifxetex legacy package. Use iftex instead. +) +(/usr/share/texlive/texmf-dist/tex/generic/iftex/ifluatex.sty +Package: ifluatex 2019/10/25 v1.5 ifluatex legacy package. Use iftex instead. +) + +Package beamerthememetropolis Warning: You need to compile with XeLaTeX or LuaL +aTeX to use the Fira fonts on input line 95. 
+ +)) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/libraries/pgflibraryarrows.meta. +code.tex +File: pgflibraryarrows.meta.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfarrowinset=\dimen309 +\pgfarrowlength=\dimen310 +\pgfarrowwidth=\dimen311 +\pgfarrowlinewidth=\dimen312 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarypositioning.code.tex +File: tikzlibrarypositioning.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarycalc.code.tex +File: tikzlibrarycalc.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarydecorations.pathmorphing.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarydecorations.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgf/modules/pgfmoduledecorations.cod +e.tex +\pgfdecoratedcompleteddistance=\dimen313 +\pgfdecoratedremainingdistance=\dimen314 +\pgfdecoratedinputsegmentcompleteddistance=\dimen315 +\pgfdecoratedinputsegmentremainingdistance=\dimen316 +\pgf@decorate@distancetomove=\dimen317 +\pgf@decorate@repeatstate=\count348 +\pgfdecorationsegmentamplitude=\dimen318 +\pgfdecorationsegmentlength=\dimen319 +) +\tikz@lib@dec@box=\box81 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/libraries/decorations/pgflibrary +decorations.pathmorphing.code.tex)) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarybackgrounds.code.tex +File: tikzlibrarybackgrounds.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@layerbox@background=\box82 +\pgf@layerboxsaved@background=\box83 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibraryfit.code.tex +File: tikzlibraryfit.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def +File: l3backend-pdftex.def 2024-01-04 L3 backend support: PDF 
output (pdfTeX) +\l__color_backend_stack_int=\count349 +\l__pdf_internal_box=\box84 +) +No file gradient_flow_dag.aux. +\openout1 = `gradient_flow_dag.aux'. + +LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 20. +LaTeX Font Info: ... okay on input line 20. +LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 20. +LaTeX Font Info: ... okay on input line 20. +LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 20. +LaTeX Font Info: ... okay on input line 20. +LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 20. +LaTeX Font Info: ... okay on input line 20. +LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 20. +LaTeX Font Info: ... okay on input line 20. +LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 20. +LaTeX Font Info: ... okay on input line 20. +LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 20. +LaTeX Font Info: ... okay on input line 20. +LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 20. +LaTeX Font Info: ... okay on input line 20. +LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 20. +LaTeX Font Info: ... okay on input line 20. 
+*geometry* driver: auto-detecting +*geometry* detected driver: pdftex +*geometry* verbose mode - [ preamble ] result: +* driver: pdftex +* paper: custom +* layout: +* layoutoffset:(h,v)=(0.0pt,0.0pt) +* modes: includehead includefoot +* h-part:(L,W,R)=(28.45274pt, 398.3386pt, 28.45274pt) +* v-part:(T,H,B)=(0.0pt, 256.0748pt, 0.0pt) +* \paperwidth=455.24408pt +* \paperheight=256.0748pt +* \textwidth=398.3386pt +* \textheight=227.62207pt +* \oddsidemargin=-43.81725pt +* \evensidemargin=-43.81725pt +* \topmargin=-72.26999pt +* \headheight=14.22636pt +* \headsep=0.0pt +* \topskip=11.0pt +* \footskip=14.22636pt +* \marginparwidth=4.0pt +* \marginparsep=10.0pt +* \columnsep=10.0pt +* \skip\footins=10.0pt plus 4.0pt minus 2.0pt +* \hoffset=0.0pt +* \voffset=0.0pt +* \mag=1000 +* \@twocolumnfalse +* \@twosidefalse +* \@mparswitchfalse +* \@reversemarginfalse +* (1in=72.27pt=25.4mm, 1cm=28.453pt) + +(/usr/share/texlive/texmf-dist/tex/context/base/mkii/supp-pdf.mkii +[Loading MPS to PDF converter (version 2006.09.02).] +\scratchcounter=\count350 +\scratchdimen=\dimen320 +\scratchbox=\box85 +\nofMPsegments=\count351 +\nofMParguments=\count352 +\everyMPshowfont=\toks50 +\MPscratchCnt=\count353 +\MPscratchDim=\dimen321 +\MPnumerator=\count354 +\makeMPintoPDFobject=\count355 +\everyMPtoPDFconversion=\toks51 +) (/usr/share/texlive/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty +Package: epstopdf-base 2020-01-24 v2.11 Base part for package epstopdf +Package epstopdf-base Info: Redefining graphics rule for `.eps' on input line 4 +85. + +(/usr/share/texlive/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg +File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Liv +e +)) +Package hyperref Info: Link coloring OFF on input line 20. +\@outlinefile=\write5 +\openout5 = `gradient_flow_dag.out'. + +LaTeX Font Info: Overwriting symbol font `operators' in version `normal' +(Font) OT1/cmr/m/n --> OT1/cmss/m/n on input line 20. 
+LaTeX Font Info: Overwriting symbol font `operators' in version `bold' +(Font) OT1/cmr/bx/n --> OT1/cmss/b/n on input line 20. +\symnumbers=\mathgroup6 +\sympureletters=\mathgroup7 +LaTeX Font Info: Overwriting math alphabet `\mathrm' in version `normal' +(Font) OT1/cmss/m/n --> OT1/cmr/m/n on input line 20. +LaTeX Font Info: Redeclaring math alphabet \mathbf on input line 20. +LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `normal' +(Font) OT1/cmr/bx/n --> OT1/cmss/b/n on input line 20. +LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `bold' +(Font) OT1/cmr/bx/n --> OT1/cmss/b/n on input line 20. +LaTeX Font Info: Redeclaring math alphabet \mathsf on input line 20. +LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `normal' +(Font) OT1/cmss/m/n --> OT1/cmss/m/n on input line 20. +LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `bold' +(Font) OT1/cmss/bx/n --> OT1/cmss/m/n on input line 20. +LaTeX Font Info: Redeclaring math alphabet \mathit on input line 20. +LaTeX Font Info: Overwriting math alphabet `\mathit' in version `normal' +(Font) OT1/cmr/m/it --> OT1/cmss/m/it on input line 20. +LaTeX Font Info: Overwriting math alphabet `\mathit' in version `bold' +(Font) OT1/cmr/bx/it --> OT1/cmss/m/it on input line 20. +LaTeX Font Info: Redeclaring math alphabet \mathtt on input line 20. +LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `normal' +(Font) OT1/cmtt/m/n --> OT1/cmtt/m/n on input line 20. +LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `bold' +(Font) OT1/cmtt/m/n --> OT1/cmtt/m/n on input line 20. +LaTeX Font Info: Overwriting symbol font `numbers' in version `bold' +(Font) OT1/cmss/m/n --> OT1/cmss/b/n on input line 20. +LaTeX Font Info: Overwriting symbol font `pureletters' in version `bold' +(Font) OT1/cmss/m/it --> OT1/cmss/b/it on input line 20. 
+LaTeX Font Info: Overwriting math alphabet `\mathrm' in version `bold' +(Font) OT1/cmss/b/n --> OT1/cmr/b/n on input line 20. +LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `bold' +(Font) OT1/cmss/b/n --> OT1/cmss/b/n on input line 20. +LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `bold' +(Font) OT1/cmss/m/n --> OT1/cmss/b/n on input line 20. +LaTeX Font Info: Overwriting math alphabet `\mathit' in version `bold' +(Font) OT1/cmss/m/it --> OT1/cmss/b/it on input line 20. +LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `bold' +(Font) OT1/cmtt/m/n --> OT1/cmtt/b/n on input line 20. +LaTeX Font Info: Redeclaring symbol font `pureletters' on input line 20. +LaTeX Font Info: Overwriting symbol font `pureletters' in version `normal' +(Font) OT1/cmss/m/it --> OT1/mathkerncmss/m/sl on input line 2 +0. +LaTeX Font Info: Overwriting symbol font `pureletters' in version `bold' +(Font) OT1/cmss/b/it --> OT1/mathkerncmss/m/sl on input line 2 +0. +LaTeX Font Info: Overwriting symbol font `pureletters' in version `bold' +(Font) OT1/mathkerncmss/m/sl --> OT1/mathkerncmss/bx/sl on inp +ut line 20. 
+ +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-basic-dictionary +-English.dict +Dictionary: translator-basic-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-bibliography-dic +tionary-English.dict +Dictionary: translator-bibliography-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-environment-dict +ionary-English.dict +Dictionary: translator-environment-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-months-dictionar +y-English.dict +Dictionary: translator-months-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-numbers-dictiona +ry-English.dict +Dictionary: translator-numbers-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-theorem-dictiona +ry-English.dict +Dictionary: translator-theorem-dictionary, Language: English +) +No file gradient_flow_dag.nav. + +Overfull \vbox (44.55656pt too high) detected at line 24 + [] + +[1 + +{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}] +LaTeX Font Info: Trying to load font information for U+msa on input line 89. + + +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/umsa.fd +File: umsa.fd 2013/01/14 v3.01 AMS symbols A +) +LaTeX Font Info: Trying to load font information for U+msb on input line 89. + + +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/umsb.fd +File: umsb.fd 2013/01/14 v3.01 AMS symbols B +) +LaTeX Font Info: Trying to load font information for OT1+mathkerncmss on inp +ut line 89. + +(/usr/share/texlive/texmf-dist/tex/latex/sansmathaccent/ot1mathkerncmss.fd +File: ot1mathkerncmss.fd 2020/01/31 Fontinst v1.933 font definitions for OT1/ma +thkerncmss. 
+) +Overfull \hbox (97.0257pt too wide) in paragraph at lines 89--89 + [][] + [] + + +Overfull \vbox (117.03772pt too high) detected at line 89 + [] + +[2 + +] +Overfull \hbox (3.32748pt too wide) in paragraph at lines 152--152 + [][] + [] + + +Overfull \vbox (71.17813pt too high) detected at line 152 + [] + +[3 + +] +LaTeX Font Info: Font shape `OT1/cmtt/bx/n' in size <10> not available +(Font) Font shape `OT1/cmtt/m/n' tried instead on input line 218. + +Overfull \hbox (10.8974pt too wide) in paragraph at lines 218--218 + [][] + [] + +LaTeX Font Info: Font shape `OT1/cmtt/bx/n' in size <12> not available +(Font) Font shape `OT1/cmtt/m/n' tried instead on input line 218. + +Overfull \vbox (108.27122pt too high) detected at line 218 + [] + +[4 + +] +LaTeX Font Info: Font shape `OT1/cmtt/bx/n' in size <10.95> not available +(Font) Font shape `OT1/cmtt/m/n' tried instead on input line 296. + +Overfull \hbox (2.03583pt too wide) in paragraph at lines 296--296 + [][] + [] + + +Overfull \vbox (81.08545pt too high) detected at line 296 + [] + +[5 + +] +\tf@nav=\write6 +\openout6 = `gradient_flow_dag.nav'. + +\tf@toc=\write7 +\openout7 = `gradient_flow_dag.toc'. + +\tf@snm=\write8 +\openout8 = `gradient_flow_dag.snm'. + + (./gradient_flow_dag.aux) + *********** +LaTeX2e <2023-11-01> patch level 1 +L3 programming layer <2024-01-22> + *********** + + +Package rerunfilecheck Warning: File `gradient_flow_dag.out' has changed. +(rerunfilecheck) Rerun to get outlines right +(rerunfilecheck) or use package `bookmark'. + +Package rerunfilecheck Info: Checksums for `gradient_flow_dag.out': +(rerunfilecheck) Before: +(rerunfilecheck) After: D41D8CD98F00B204E9800998ECF8427E;0. 
+ ) +Here is how much of TeX's memory you used: + 26279 strings out of 475495 + 537242 string characters out of 5782356 + 1941975 words of memory out of 5000000 + 47782 multiletter control sequences out of 15000+600000 + 577642 words of font info for 107 fonts, out of 8000000 for 9000 + 497 hyphenation exceptions out of 8191 + 128i,15n,123p,1884b,967s stack positions out of 10000i,1000n,20000p,200000b,200000s + + +Output written on gradient_flow_dag.pdf (5 pages, 173735 bytes). +PDF statistics: + 130 PDF objects out of 1000 (max. 8388607) + 82 compressed objects within 1 object stream + 11 named destinations out of 1000 (max. 500000) + 43 words of extra memory for PDF output out of 10000 (max. 10000000) + diff --git a/slides/gradient_flow_dag.nav b/slides/gradient_flow_dag.nav new file mode 100644 index 0000000000000000000000000000000000000000..369124e5b097d4fdf560681cac6ed2da8c3c1caa --- /dev/null +++ b/slides/gradient_flow_dag.nav @@ -0,0 +1,15 @@ +\headcommand {\slideentry {0}{0}{1}{1/1}{}{0}} +\headcommand {\beamer@framepages {1}{1}} +\headcommand {\slideentry {0}{0}{2}{2/2}{}{0}} +\headcommand {\beamer@framepages {2}{2}} +\headcommand {\slideentry {0}{0}{3}{3/3}{}{0}} +\headcommand {\beamer@framepages {3}{3}} +\headcommand {\slideentry {0}{0}{4}{4/4}{}{0}} +\headcommand {\beamer@framepages {4}{4}} +\headcommand {\slideentry {0}{0}{5}{5/5}{}{0}} +\headcommand {\beamer@framepages {5}{5}} +\headcommand {\beamer@partpages {1}{5}} +\headcommand {\beamer@subsectionpages {1}{5}} +\headcommand {\beamer@sectionpages {1}{5}} +\headcommand {\beamer@documentpages {5}} +\headcommand {\gdef \inserttotalframenumber {5}} diff --git a/slides/gradient_flow_dag.out b/slides/gradient_flow_dag.out new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/slides/gradient_flow_dag.snm b/slides/gradient_flow_dag.snm new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/slides/gradient_flow_dag.tex b/slides/gradient_flow_dag.tex new file mode 100644 index 0000000000000000000000000000000000000000..72c3b9e0dc2da316558c57f1981071c817ae70d8 --- /dev/null +++ b/slides/gradient_flow_dag.tex @@ -0,0 +1,298 @@ +\documentclass[aspectratio=169]{beamer} +\usetheme{metropolis} +\usepackage{tikz} +\usetikzlibrary{arrows.meta, positioning, calc, decorations.pathmorphing, backgrounds, fit} +\usepackage{amsmath, amssymb} + +\definecolor{ditblue}{RGB}{66, 133, 244} +\definecolor{projgreen}{RGB}{52, 168, 83} +\definecolor{lossred}{RGB}{234, 67, 53} +\definecolor{gradorange}{RGB}{251, 188, 4} +\definecolor{noisecolor}{RGB}{160, 160, 160} +\definecolor{dangerred}{RGB}{220, 50, 50} +\definecolor{safegray}{RGB}{180, 180, 180} +\definecolor{stopred}{RGB}{200, 30, 30} + +\title{Backward DAG: Gradient Flow in CFM Loss} +\subtitle{Why \texttt{.detach()} Prevents Gradient Explosion} +\date{} + +\begin{document} + +\begin{frame} +\titlepage +\end{frame} + +% ============================================================ +% SLIDE 1: Forward Pass (shared by both old and new) +% ============================================================ +\begin{frame}{Forward Computation Graph} +\centering +\begin{tikzpicture}[ + node distance=1.2cm and 1.8cm, + box/.style={rectangle, rounded corners=3pt, draw, minimum width=2.2cm, minimum height=0.7cm, font=\small\bfseries}, + data/.style={box, fill=blue!8, draw=ditblue!60}, + model/.style={box, fill=ditblue!15, draw=ditblue}, + op/.style={box, fill=gray!10, draw=gray!60, minimum width=1.8cm}, + loss/.style={box, fill=lossred!15, draw=lossred, minimum width=2cm}, + arr/.style={-{Stealth[length=5pt]}, thick}, +] + % Inputs + \node[data] (znoise) {$z_{\text{noise}}$}; + \node[data, right=1.5cm of znoise] (ztarget) {$z_{\text{target}}$}; + \node[data, right=1.5cm of ztarget] (t) {$t$}; + \node[data, right=1.5cm of t] (c) {$c$}; + + % 
Interpolation + \node[op, below=0.9cm of $(znoise)!0.5!(ztarget)$] (interp) {$z_t = (1{-}t)\,z_0 + t\,z_1$}; + \draw[arr] (znoise) -- (interp); + \draw[arr] (ztarget) -- (interp); + + % Target velocity + \node[op, left=0.5cm of interp] (vtarget) {$v^* = z_1 - z_0$}; + \draw[arr, gray] (znoise.south) -- ++(0,-0.3) -| (vtarget.north); + \draw[arr, gray] (ztarget.south) -- ++(0,-0.15) -| ([xshift=4mm]vtarget.north); + + % DiT + \node[model, below=0.9cm of interp] (dit) {\textcolor{ditblue}{DiT}\; $v_\theta(z_t, t, c)$}; + \draw[arr] (interp) -- (dit); + \draw[arr] (t) -- ++(0,-0.5) -| ([xshift=3mm]dit.north); + \draw[arr] (c) -- ++(0,-0.8) -| ([xshift=8mm]dit.north); + + % v_pred output + \node[data, below=0.8cm of dit] (vpred) {$v_{\text{pred}}$}; + \draw[arr, ditblue] (dit) -- (vpred); + + % Flow loss + \node[loss, left=2.5cm of vpred] (flowloss) {$\mathcal{L}_{\text{flow}} = \|v_{\text{pred}} - v^*\|^2$}; + \draw[arr] (vpred) -- (flowloss); + \draw[arr] (vtarget.south) |- (flowloss.east); + + % Implied target + \node[op, below=0.8cm of vpred] (z1pred) {$\hat{z}_1 = z_t + (1{-}t)\,v_{\text{pred}}$}; + \draw[arr] (vpred) -- (z1pred); + + % Projector + \node[model, below=0.8cm of z1pred] (proj) {\textcolor{projgreen}{Projector}\; $P(\hat{z}_1)$}; + \draw[arr] (z1pred) -- (proj); + + % Proj loss + \node[loss, below=0.8cm of proj] (projloss) {$\mathcal{L}_{\text{proj}} = \|P(\hat{z}_1) - z_1\|^2$}; + \draw[arr] (proj) -- (projloss); + + % Total loss + \node[loss, below left=0.6cm and 1.5cm of projloss] (total) {$\mathcal{L} = \mathcal{L}_{\text{flow}} + \lambda\,\mathcal{L}_{\text{proj}}$}; + \draw[arr] (flowloss.south) |- (total.west); + \draw[arr] (projloss) -| (total.east); + +\end{tikzpicture} +\end{frame} + +% ============================================================ +% SLIDE 2: OLD backward pass (no detach — explosion) +% ============================================================ +\begin{frame}{Old Backward Pass: \textcolor{dangerred}{Gradient Explosion 
Path}} +\centering +\begin{tikzpicture}[ + node distance=1.0cm and 1.5cm, + box/.style={rectangle, rounded corners=3pt, draw, minimum width=2cm, minimum height=0.65cm, font=\small\bfseries}, + model/.style={box, fill=ditblue!15, draw=ditblue}, + op/.style={box, fill=gray!10, draw=gray!60}, + loss/.style={box, fill=lossred!15, draw=lossred}, + grad/.style={{Stealth[length=5pt]}-, thick, gradorange}, + badgrad/.style={{Stealth[length=5pt]}-, very thick, dangerred}, + goodgrad/.style={{Stealth[length=5pt]}-, thick, ditblue!70}, +] + % Loss + \node[loss] (total) {$\mathcal{L}_{\text{total}}$}; + + % Two loss branches + \node[loss, above left=1.0cm and 2cm of total] (flowloss) {$\mathcal{L}_{\text{flow}}$}; + \node[loss, above right=1.0cm and 2cm of total] (projloss) {$\mathcal{L}_{\text{proj}}$}; + \draw[grad] (total) -- (flowloss); + \draw[grad] (total) -- (projloss); + + % Flow loss path (safe) + \node[op, above=0.8cm of flowloss] (vpred1) {$v_{\text{pred}}$}; + \draw[goodgrad] (flowloss) -- node[left, font=\scriptsize, text=ditblue!70] {$\frac{\partial \mathcal{L}_{\text{flow}}}{\partial v_{\text{pred}}}$} (vpred1); + + \node[model, above=0.8cm of vpred1] (dit1) {\textcolor{ditblue}{DiT}\; $\theta$}; + \draw[goodgrad] (vpred1) -- node[left, font=\scriptsize, text=ditblue!70] {stable} (dit1); + + % Proj loss path (EXPLODING) + \node[op, above=0.8cm of projloss] (proj) {\textcolor{projgreen}{Projector}}; + \draw[badgrad] (projloss) -- node[right, font=\scriptsize, text=dangerred] {$\frac{\partial \mathcal{L}_{\text{proj}}}{\partial P}$} (proj); + + \node[op, above=0.8cm of proj] (z1pred) {$\hat{z}_1 = z_t + (1{-}t)\,v_{\text{pred}}$}; + \draw[badgrad] (proj) -- node[right, font=\scriptsize, text=dangerred] {$\nabla_{\hat{z}_1}$} (z1pred); + + \node[op, above=0.8cm of z1pred] (vpred2) {$v_{\text{pred}}$}; + \draw[badgrad] (z1pred) -- node[right, font=\scriptsize, text=dangerred] {$\times\;(1{-}t)$} (vpred2); + + \node[model, above=0.8cm of vpred2] (dit2) 
{\textcolor{ditblue}{DiT}\; $\theta$}; + \draw[badgrad] (vpred2) -- node[right, font=\scriptsize, text=dangerred] {\textbf{AMPLIFIED!}} (dit2); + + % Danger annotation + \node[below right=0.1cm and 0.3cm of z1pred, font=\small, text=dangerred] (warn) { + \begin{tabular}{l} + When $t \approx 0$: \\ + $(1{-}t) \approx 1.0$ \\ + $\Rightarrow$ \textbf{full gradient}\\ + \textbf{amplification!} + \end{tabular} + }; + + % Connect dit1 and dit2 to show they are the same + \draw[dashed, thick, gray] (dit1.east) -- ++(0.3,0) |- node[above, midway, font=\scriptsize, text=gray] {same parameters $\theta$} (dit2.west); + + % Big red X + \node[font=\Huge, text=dangerred, below=0.1cm of total] {\textbf{UNSTABLE}}; + +\end{tikzpicture} +\end{frame} + +% ============================================================ +% SLIDE 3: NEW backward pass (with detach — stable) +% ============================================================ +\begin{frame}{New Backward Pass: \textcolor{projgreen}{Stable with \texttt{.detach()}}} +\centering +\begin{tikzpicture}[ + node distance=1.0cm and 1.5cm, + box/.style={rectangle, rounded corners=3pt, draw, minimum width=2cm, minimum height=0.65cm, font=\small\bfseries}, + model/.style={box, fill=ditblue!15, draw=ditblue}, + op/.style={box, fill=gray!10, draw=gray!60}, + loss/.style={box, fill=lossred!15, draw=lossred}, + grad/.style={{Stealth[length=5pt]}-, thick, gradorange}, + goodgrad/.style={{Stealth[length=5pt]}-, thick, ditblue!70}, + projgrad/.style={{Stealth[length=5pt]}-, thick, projgreen}, + stopped/.style={thick, dangerred, dashed}, +] + % Loss + \node[loss] (total) {$\mathcal{L}_{\text{total}}$}; + + % Two loss branches + \node[loss, above left=1.0cm and 2cm of total] (flowloss) {$\mathcal{L}_{\text{flow}}$}; + \node[loss, above right=1.0cm and 2cm of total] (projloss) {$\mathcal{L}_{\text{proj}}$}; + \draw[grad] (total) -- (flowloss); + \draw[grad] (total) -- (projloss); + + % Flow loss path (safe — unchanged) + \node[op, above=0.8cm of 
flowloss] (vpred1) {$v_{\text{pred}}$}; + \draw[goodgrad] (flowloss) -- node[left, font=\scriptsize, text=ditblue!70] {$\frac{\partial \mathcal{L}_{\text{flow}}}{\partial v_{\text{pred}}}$} (vpred1); + + \node[model, above=0.8cm of vpred1] (dit1) {\textcolor{ditblue}{DiT}\; $\theta$}; + \draw[goodgrad] (vpred1) -- node[left, font=\scriptsize, text=ditblue!70] {stable} (dit1); + + % Proj loss path (SAFE — detached) + \node[op, above=0.8cm of projloss] (proj) {\textcolor{projgreen}{Projector}}; + \draw[projgrad] (projloss) -- node[right, font=\scriptsize, text=projgreen] {$\frac{\partial \mathcal{L}_{\text{proj}}}{\partial P}$} (proj); + + % DETACH barrier + \node[op, above=0.8cm of proj, fill=stopred!15, draw=stopred, very thick] (detach) {\texttt{.detach()} $\quad\hat{z}_1$}; + \draw[projgrad] (proj) -- node[right, font=\scriptsize, text=projgreen] {trains $P$ only} (detach); + + % Blocked path + \node[op, above=0.8cm of detach, fill=gray!5, draw=safegray] (z1pred) {$\hat{z}_1 = z_t + (1{-}t)\,v_{\text{pred}}$}; + \draw[stopped] (detach) -- node[right, font=\scriptsize, text=stopred] {\textbf{BLOCKED}} (z1pred); + + \node[op, above=0.8cm of z1pred, fill=gray!5, draw=safegray] (vpred2) {$v_{\text{pred}}$}; + \draw[stopped] (z1pred) -- node[right, font=\scriptsize, text=safegray] {no gradient} (vpred2); + + \node[model, above=0.8cm of vpred2, fill=gray!5, draw=safegray] (dit2) {\textcolor{safegray}{DiT}\; $\theta$}; + \draw[stopped] (vpred2) -- (dit2); + + % Annotation + \node[below right=0.1cm and 0.3cm of detach, font=\small, text=projgreen] { + \begin{tabular}{l} + $\mathcal{L}_{\text{proj}}$ gradient \\ + stops at \texttt{detach()}\\[2pt] + $\Rightarrow$ DiT trained \textbf{only}\\ + by $\mathcal{L}_{\text{flow}}$ + \end{tabular} + }; + + % Big green check + \node[font=\Huge, text=projgreen, below=0.1cm of total] {\textbf{STABLE} \checkmark}; + +\end{tikzpicture} +\end{frame} + +% ============================================================ +% SLIDE 4: Summary of 
gradient magnitudes +% ============================================================ +\begin{frame}{Gradient Magnitude Comparison} + +\begin{columns}[T] +\begin{column}{0.48\textwidth} +\centering +\textbf{\textcolor{dangerred}{Without \texttt{.detach()}}} + +\vspace{0.3cm} +\begin{tikzpicture} +\draw[thick, ->] (0,0) -- (5.5,0) node[right, font=\small] {$t$}; +\draw[thick, ->] (0,0) -- (0,3.5) node[above, font=\small] {$\|\nabla_\theta \mathcal{L}_{\text{proj}}\|$}; + +% Axis labels +\node[below, font=\scriptsize] at (0,0) {0}; +\node[below, font=\scriptsize] at (2.5,0) {0.5}; +\node[below, font=\scriptsize] at (5,0) {1}; + +% Curve: (1-t) amplification — goes to infinity as t→0 +\draw[very thick, dangerred, domain=0.05:0.95, samples=50] + plot ({5*\x}, {2.5*(1-\x)}); + +% Danger zone +\fill[dangerred, opacity=0.1] (0,0) rectangle (1.5,3.5); +\node[font=\scriptsize, text=dangerred, rotate=90] at (0.3, 1.8) {DANGER ZONE}; + +% Label +\node[font=\scriptsize, text=dangerred] at (3, 2.5) {$\propto (1{-}t)$}; +\end{tikzpicture} + +\vspace{0.2cm} +{\small Gradient $\to \infty$ as $t \to 0$} +\end{column} + +\begin{column}{0.48\textwidth} +\centering +\textbf{\textcolor{projgreen}{With \texttt{.detach()}}} + +\vspace{0.3cm} +\begin{tikzpicture} +\draw[thick, ->] (0,0) -- (5.5,0) node[right, font=\small] {$t$}; +\draw[thick, ->] (0,0) -- (0,3.5) node[above, font=\small] {$\|\nabla_\theta \mathcal{L}\|$}; + +% Axis labels +\node[below, font=\scriptsize] at (0,0) {0}; +\node[below, font=\scriptsize] at (2.5,0) {0.5}; +\node[below, font=\scriptsize] at (5,0) {1}; + +% Constant line — only flow loss gradient, well-behaved +\draw[very thick, projgreen, domain=0.05:0.95, samples=50] + plot ({5*\x}, {1.2 + 0.3*sin(360*\x)}); + +% Label +\node[font=\scriptsize, text=projgreen] at (3.5, 2.2) {$\nabla_\theta \mathcal{L}_{\text{flow}}$ only}; +\node[font=\scriptsize, text=safegray] at (3.5, 0.4) {$\nabla_\theta \mathcal{L}_{\text{proj}} = 0$}; +\end{tikzpicture} + +\vspace{0.2cm} 
+{\small Bounded gradient $\forall\, t$} +\end{column} +\end{columns} + +\vspace{0.5cm} +\centering +\begin{tabular}{lcc} +\hline +& \textbf{Old (no detach)} & \textbf{New (with detach)} \\ +\hline +DiT gradient from $\mathcal{L}_{\text{flow}}$ & \checkmark & \checkmark \\ +DiT gradient from $\mathcal{L}_{\text{proj}}$ & \textcolor{dangerred}{\checkmark\ (amplified by $(1{-}t)$)} & \textcolor{projgreen}{$\times$\ (blocked)} \\ +Projector gradient from $\mathcal{L}_{\text{proj}}$ & \checkmark & \checkmark \\ +Gradient explosion risk & \textcolor{dangerred}{HIGH} & \textcolor{projgreen}{NONE} \\ +\hline +\end{tabular} +\end{frame} + +\end{document} diff --git a/slides/gradient_flow_dag.toc b/slides/gradient_flow_dag.toc new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/slides/presentation.aux b/slides/presentation.aux new file mode 100644 index 0000000000000000000000000000000000000000..9640dc9550fe9824f0e99566ce7d0f46c21f441d --- /dev/null +++ b/slides/presentation.aux @@ -0,0 +1,44 @@ +\relax +\providecommand\hyper@newdestlabel[2]{} +\providecommand\HyField@AuxAddToFields[1]{} +\providecommand\HyField@AuxAddToCoFields[2]{} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{1}{1/1}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {1}{1}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{2}{2/2}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {2}{2}}} +\@writefile{toc}{\beamer@sectionintoc {1}{So-VITS-SVC (Baseline)}{3}{0}{1}} +\@writefile{nav}{\headcommand {\beamer@sectionpages {1}{2}}} +\@writefile{nav}{\headcommand {\beamer@subsectionpages {1}{2}}} +\@writefile{nav}{\headcommand {\sectionentry {1}{So-VITS-SVC (Baseline)}{3}{So-VITS-SVC (Baseline)}{0}}} +\@writefile{nav}{\headcommand {\slideentry {1}{0}{1}{3/3}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {3}{3}}} +\@writefile{toc}{\beamer@sectionintoc {2}{V1 Architecture: DAC \& Flow Matching}{4}{0}{2}} 
+\@writefile{nav}{\headcommand {\beamer@sectionpages {3}{3}}} +\@writefile{nav}{\headcommand {\beamer@subsectionpages {3}{3}}} +\@writefile{nav}{\headcommand {\sectionentry {2}{V1 Architecture: DAC \& Flow Matching}{4}{V1 Architecture: DAC \& Flow Matching}{0}}} +\@writefile{nav}{\headcommand {\slideentry {2}{0}{1}{4/4}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {4}{4}}} +\@writefile{toc}{\beamer@sectionintoc {3}{Better V1: Latent Compression}{5}{0}{3}} +\@writefile{nav}{\headcommand {\beamer@sectionpages {4}{4}}} +\@writefile{nav}{\headcommand {\beamer@subsectionpages {4}{4}}} +\@writefile{nav}{\headcommand {\sectionentry {3}{Better V1: Latent Compression}{5}{Better V1: Latent Compression}{0}}} +\@writefile{nav}{\headcommand {\slideentry {3}{0}{1}{5/5}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {5}{5}}} +\@writefile{toc}{\beamer@sectionintoc {4}{V2: F5-SVC (Current Implementation)}{6}{0}{4}} +\@writefile{nav}{\headcommand {\beamer@sectionpages {5}{5}}} +\@writefile{nav}{\headcommand {\beamer@subsectionpages {5}{5}}} +\@writefile{nav}{\headcommand {\sectionentry {4}{V2: F5-SVC (Current Implementation)}{6}{V2: F5-SVC (Current Implementation)}{0}}} +\@writefile{nav}{\headcommand {\slideentry {4}{0}{1}{6/6}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {6}{6}}} +\@writefile{toc}{\beamer@sectionintoc {5}{Architectural Comparison Summary}{7}{0}{5}} +\@writefile{nav}{\headcommand {\beamer@sectionpages {6}{6}}} +\@writefile{nav}{\headcommand {\beamer@subsectionpages {6}{6}}} +\@writefile{nav}{\headcommand {\sectionentry {5}{Architectural Comparison Summary}{7}{Architectural Comparison Summary}{0}}} +\@writefile{nav}{\headcommand {\slideentry {5}{0}{1}{7/7}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {7}{7}}} +\@writefile{nav}{\headcommand {\beamer@partpages {1}{7}}} +\@writefile{nav}{\headcommand {\beamer@subsectionpages {7}{7}}} +\@writefile{nav}{\headcommand {\beamer@sectionpages {7}{7}}} 
+\@writefile{nav}{\headcommand {\beamer@documentpages {7}}} +\@writefile{nav}{\headcommand {\gdef \inserttotalframenumber {7}}} +\gdef \@abspage@last{7} diff --git a/slides/presentation.log b/slides/presentation.log new file mode 100644 index 0000000000000000000000000000000000000000..0c3c31f36f9af9b09145161f842c72e0745e2284 --- /dev/null +++ b/slides/presentation.log @@ -0,0 +1,1033 @@ +This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023/Debian) (preloaded format=pdflatex 2026.1.13) 6 MAR 2026 07:29 +entering extended mode + restricted \write18 enabled. + %&-line parsing enabled. +**presentation.tex +(./presentation.tex +LaTeX2e <2023-11-01> patch level 1 +L3 programming layer <2024-01-22> +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamer.cls +Document Class: beamer 2024/01/06 v3.71 A class for typesetting presentations +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasemodes.sty +(/usr/share/texlive/texmf-dist/tex/latex/etoolbox/etoolbox.sty +Package: etoolbox 2020/10/05 v2.5k e-TeX tools for LaTeX (JAW) +\etb@tempcnta=\count187 +) +\beamer@tempbox=\box51 +\beamer@tempcount=\count188 +\c@beamerpauses=\count189 + +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasedecode.sty +\beamer@slideinframe=\count190 +\beamer@minimum=\count191 +\beamer@decode@box=\box52 +) +\beamer@commentbox=\box53 +\beamer@modecount=\count192 +) +(/usr/share/texlive/texmf-dist/tex/generic/iftex/iftex.sty +Package: iftex 2022/02/03 v1.0f TeX engine tests +) +\headdp=\dimen140 +\footheight=\dimen141 +\sidebarheight=\dimen142 +\beamer@tempdim=\dimen143 +\beamer@finalheight=\dimen144 +\beamer@animht=\dimen145 +\beamer@animdp=\dimen146 +\beamer@animwd=\dimen147 +\beamer@leftmargin=\dimen148 +\beamer@rightmargin=\dimen149 +\beamer@leftsidebar=\dimen150 +\beamer@rightsidebar=\dimen151 +\beamer@boxsize=\dimen152 +\beamer@vboxoffset=\dimen153 +\beamer@descdefault=\dimen154 +\beamer@descriptionwidth=\dimen155 +\beamer@lastskip=\skip48 +\beamer@areabox=\box54 
+\beamer@animcurrent=\box55 +\beamer@animshowbox=\box56 +\beamer@sectionbox=\box57 +\beamer@logobox=\box58 +\beamer@linebox=\box59 +\beamer@sectioncount=\count193 +\beamer@subsubsectionmax=\count194 +\beamer@subsectionmax=\count195 +\beamer@sectionmax=\count196 +\beamer@totalheads=\count197 +\beamer@headcounter=\count198 +\beamer@partstartpage=\count199 +\beamer@sectionstartpage=\count266 +\beamer@subsectionstartpage=\count267 +\beamer@animationtempa=\count268 +\beamer@animationtempb=\count269 +\beamer@xpos=\count270 +\beamer@ypos=\count271 +\beamer@ypos@offset=\count272 +\beamer@showpartnumber=\count273 +\beamer@currentsubsection=\count274 +\beamer@coveringdepth=\count275 +\beamer@sectionadjust=\count276 +\beamer@toclastsection=\count277 +\beamer@tocsectionnumber=\count278 + +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseoptions.sty +(/usr/share/texlive/texmf-dist/tex/latex/graphics/keyval.sty +Package: keyval 2022/05/29 v1.15 key=value parser (DPC) +\KV@toks@=\toks17 +)) +\beamer@paperwidth=\skip49 +\beamer@paperheight=\skip50 + +(/usr/share/texlive/texmf-dist/tex/latex/geometry/geometry.sty +Package: geometry 2020/01/02 v5.9 Page Geometry + +(/usr/share/texlive/texmf-dist/tex/generic/iftex/ifvtex.sty +Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead. 
+) +\Gm@cnth=\count279 +\Gm@cntv=\count280 +\c@Gm@tempcnt=\count281 +\Gm@bindingoffset=\dimen156 +\Gm@wd@mp=\dimen157 +\Gm@odd@mp=\dimen158 +\Gm@even@mp=\dimen159 +\Gm@layoutwidth=\dimen160 +\Gm@layoutheight=\dimen161 +\Gm@layouthoffset=\dimen162 +\Gm@layoutvoffset=\dimen163 +\Gm@dimlist=\toks18 +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/math/pgfmath.sty +(/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.tex +\pgfutil@everybye=\toks19 +\pgfutil@tempdima=\dimen164 +\pgfutil@tempdimb=\dimen165 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def +\pgfutil@abb=\box60 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgf/pgf.revision.tex) +Package: pgfrcs 2023-01-15 v3.1.10 (3.1.10) +)) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex +\pgfkeys@pathtoks=\toks20 +\pgfkeys@temptoks=\toks21 + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfkeyslibraryfiltered +.code.tex +\pgfkeys@tmptoks=\toks22 +))) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex +\pgf@x=\dimen166 +\pgf@xa=\dimen167 +\pgf@xb=\dimen168 +\pgf@xc=\dimen169 +\pgf@y=\dimen170 +\pgf@ya=\dimen171 +\pgf@yb=\dimen172 +\pgf@yc=\dimen173 +\c@pgf@counta=\count282 +\c@pgf@countb=\count283 +\c@pgf@countc=\count284 +\c@pgf@countd=\count285 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex +\pgfmath@dimen=\dimen174 +\pgfmath@count=\count286 +\pgfmath@box=\box61 +\pgfmath@toks=\toks23 +\pgfmath@stack@operand=\toks24 +\pgfmath@stack@operation=\toks25 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.tex) 
+(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic.code +.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigonomet +ric.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.random.cod +e.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.comparison +.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.code. +tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round.code +.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.code. +tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integerari +thmetics.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex +\c@pgfmathroundto@lastzeros=\count287 +))) +(/usr/share/texlive/texmf-dist/tex/latex/base/size11.clo +File: size11.clo 2023/05/17 v1.4n Standard LaTeX file (size option) +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty +(/usr/share/texlive/texmf-dist/tex/latex/graphics/graphicx.sty +Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR) + +(/usr/share/texlive/texmf-dist/tex/latex/graphics/graphics.sty +Package: graphics 2022/03/10 v1.4e Standard LaTeX Graphics (DPC,SPQR) + +(/usr/share/texlive/texmf-dist/tex/latex/graphics/trig.sty +Package: trig 2021/08/11 v1.11 sin cos tan (DPC) +) +(/usr/share/texlive/texmf-dist/tex/latex/graphics-cfg/graphics.cfg +File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration +) +Package graphics Info: Driver file: pdftex.def on input line 107. 
+ +(/usr/share/texlive/texmf-dist/tex/latex/graphics-def/pdftex.def +File: pdftex.def 2022/09/22 v1.2b Graphics/color driver for pdftex +)) +\Gin@req@height=\dimen175 +\Gin@req@width=\dimen176 +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex +Package: pgfsys 2023-01-15 v3.1.10 (3.1.10) +\pgf@x=\dimen177 +\pgf@y=\dimen178 +\pgf@xa=\dimen179 +\pgf@ya=\dimen180 +\pgf@xb=\dimen181 +\pgf@yb=\dimen182 +\pgf@xc=\dimen183 +\pgf@yc=\dimen184 +\pgf@xd=\dimen185 +\pgf@yd=\dimen186 +\w@pgf@writea=\write3 +\r@pgf@reada=\read2 +\c@pgf@counta=\count288 +\c@pgf@countb=\count289 +\c@pgf@countc=\count290 +\c@pgf@countd=\count291 +\t@pgf@toka=\toks26 +\t@pgf@tokb=\toks27 +\t@pgf@tokc=\toks28 +\pgf@sys@id@count=\count292 + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg +File: pgf.cfg 2023-01-15 v3.1.10 (3.1.10) +) +Driver file for pgf: pgfsys-pdftex.def + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-pdftex.def +File: pgfsys-pdftex.def 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-pdf.de +f +File: pgfsys-common-pdf.def 2023-01-15 v3.1.10 (3.1.10) +))) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code. +tex +File: pgfsyssoftpath.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfsyssoftpath@smallbuffer@items=\count293 +\pgfsyssoftpath@bigbuffer@items=\count294 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code. +tex +File: pgfsysprotocol.code.tex 2023-01-15 v3.1.10 (3.1.10) +)) (/usr/share/texlive/texmf-dist/tex/latex/xcolor/xcolor.sty +Package: xcolor 2023/11/15 v3.01 LaTeX color extensions (UK) + +(/usr/share/texlive/texmf-dist/tex/latex/graphics-cfg/color.cfg +File: color.cfg 2016/01/02 v1.6 sample color configuration +) +Package xcolor Info: Driver file: pdftex.def on input line 274. 
+ +(/usr/share/texlive/texmf-dist/tex/latex/graphics/mathcolor.ltx) +Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1350. +Package xcolor Info: Model `hsb' substituted by `rgb' on input line 1354. +Package xcolor Info: Model `RGB' extended on input line 1366. +Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1368. +Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1369. +Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1370. +Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1371. +Package xcolor Info: Model `Gray' substituted by `gray' on input line 1372. +Package xcolor Info: Model `wave' substituted by `hsb' on input line 1373. +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex +Package: pgfcore 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfint.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.code.te +x +File: pgfcorepoints.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@picminx=\dimen187 +\pgf@picmaxx=\dimen188 +\pgf@picminy=\dimen189 +\pgf@picmaxy=\dimen190 +\pgf@pathminx=\dimen191 +\pgf@pathmaxx=\dimen192 +\pgf@pathminy=\dimen193 +\pgf@pathmaxy=\dimen194 +\pgf@xx=\dimen195 +\pgf@xy=\dimen196 +\pgf@yx=\dimen197 +\pgf@yy=\dimen198 +\pgf@zx=\dimen199 +\pgf@zy=\dimen256 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconstruct. 
+code.tex +File: pgfcorepathconstruct.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@path@lastx=\dimen257 +\pgf@path@lasty=\dimen258 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage.code +.tex +File: pgfcorepathusage.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@shorten@end@additional=\dimen259 +\pgf@shorten@start@additional=\dimen260 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.code.te +x +File: pgfcorescopes.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfpic=\box62 +\pgf@hbox=\box63 +\pgf@layerbox@main=\box64 +\pgf@picture@serial@count=\count295 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicstate.c +ode.tex +File: pgfcoregraphicstate.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgflinewidth=\dimen261 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransformation +s.code.tex +File: pgfcoretransformations.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@pt@x=\dimen262 +\pgf@pt@y=\dimen263 +\pgf@pt@temp=\dimen264 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.code.tex +File: pgfcorequick.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.code.t +ex +File: pgfcoreobjects.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathprocessing +.code.tex +File: pgfcorepathprocessing.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.code.te +x +File: pgfcorearrows.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfarrowsep=\dimen265 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.code.tex +File: pgfcoreshade.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@max=\dimen266 +\pgf@sys@shading@range@num=\count296 +\pgf@shadingcount=\count297 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.code.tex +File: pgfcoreimage.code.tex 2023-01-15 v3.1.10 
(3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.code. +tex +File: pgfcoreexternal.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfexternal@startupbox=\box65 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.code.te +x +File: pgfcorelayers.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransparency.c +ode.tex +File: pgfcoretransparency.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.code. +tex +File: pgfcorepatterns.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.tex +File: pgfcorerdf.code.tex 2023-01-15 v3.1.10 (3.1.10) +))) (/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/xxcolor.sty +Package: xxcolor 2003/10/24 ver 0.1 +\XC@nummixins=\count298 +\XC@countmixins=\count299 +) +(/usr/share/texlive/texmf-dist/tex/latex/base/atbegshi-ltx.sty +Package: atbegshi-ltx 2021/01/10 v1.0c Emulation of the original atbegshi +package with kernel methods +) +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/hyperref.sty +Package: hyperref 2024-01-20 v7.01h Hypertext links for LaTeX + +(/usr/share/texlive/texmf-dist/tex/latex/kvsetkeys/kvsetkeys.sty +Package: kvsetkeys 2022-10-05 v1.19 Key value parser (HO) +) +(/usr/share/texlive/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty +Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO) +) +(/usr/share/texlive/texmf-dist/tex/generic/pdfescape/pdfescape.sty +Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO) + +(/usr/share/texlive/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty +Package: ltxcmds 2023-12-04 v1.26 LaTeX kernel commands for general use (HO) +) +(/usr/share/texlive/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty +Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO +) + 
+(/usr/share/texlive/texmf-dist/tex/generic/infwarerr/infwarerr.sty +Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO) +) +Package pdftexcmds Info: \pdf@primitive is available. +Package pdftexcmds Info: \pdf@ifprimitive is available. +Package pdftexcmds Info: \pdfdraftmode found. +)) +(/usr/share/texlive/texmf-dist/tex/latex/hycolor/hycolor.sty +Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO) +) +(/usr/share/texlive/texmf-dist/tex/latex/auxhook/auxhook.sty +Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO) +) +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/nameref.sty +Package: nameref 2023-11-26 v2.56 Cross-referencing by name of section + +(/usr/share/texlive/texmf-dist/tex/latex/refcount/refcount.sty +Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO) +) +(/usr/share/texlive/texmf-dist/tex/generic/gettitlestring/gettitlestring.sty +Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO) + +(/usr/share/texlive/texmf-dist/tex/latex/kvoptions/kvoptions.sty +Package: kvoptions 2022-06-15 v3.15 Key value format for package options (HO) +)) +\c@section@level=\count300 +) +\@linkdim=\dimen267 +\Hy@linkcounter=\count301 +\Hy@pagecounter=\count302 + +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/pd1enc.def +File: pd1enc.def 2024-01-20 v7.01h Hyperref: PDFDocEncoding definition (HO) +Now handling font encoding PD1 ... +... no UTF-8 mapping file for font encoding PD1 +) +(/usr/share/texlive/texmf-dist/tex/generic/intcalc/intcalc.sty +Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO) +) +\Hy@SavedSpaceFactor=\count303 + +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/puenc.def +File: puenc.def 2024-01-20 v7.01h Hyperref: PDF Unicode definition (HO) +Now handling font encoding PU ... +... no UTF-8 mapping file for font encoding PU +) +Package hyperref Info: Option `bookmarks' set `true' on input line 4062. 
+Package hyperref Info: Option `bookmarksopen' set `true' on input line 4062. +Package hyperref Info: Option `implicit' set `false' on input line 4062. +Package hyperref Info: Hyper figures OFF on input line 4179. +Package hyperref Info: Link nesting OFF on input line 4184. +Package hyperref Info: Hyper index ON on input line 4187. +Package hyperref Info: Plain pages OFF on input line 4194. +Package hyperref Info: Backreferencing OFF on input line 4199. +Package hyperref Info: Implicit mode OFF; no redefinition of LaTeX internals. +Package hyperref Info: Bookmarks ON on input line 4446. +\c@Hy@tempcnt=\count304 + +(/usr/share/texlive/texmf-dist/tex/latex/url/url.sty +\Urlmuskip=\muskip16 +Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc. +) +LaTeX Info: Redefining \url on input line 4784. +\XeTeXLinkMargin=\dimen268 + +(/usr/share/texlive/texmf-dist/tex/generic/bitset/bitset.sty +Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO) + +(/usr/share/texlive/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty +Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO +) +)) +\Fld@menulength=\count305 +\Field@Width=\dimen269 +\Fld@charsize=\dimen270 +Package hyperref Info: Hyper figures OFF on input line 6063. +Package hyperref Info: Link nesting OFF on input line 6068. +Package hyperref Info: Hyper index ON on input line 6071. +Package hyperref Info: backreferencing OFF on input line 6078. +Package hyperref Info: Link coloring OFF on input line 6083. +Package hyperref Info: Link coloring with OCG OFF on input line 6088. +Package hyperref Info: PDF/A mode OFF on input line 6093. +\Hy@abspage=\count306 + + +Package hyperref Message: Stopped early. + +) +Package hyperref Info: Driver (autodetected): hpdftex. 
+ (/usr/share/texlive/texmf-dist/tex/latex/hyperref/hpdftex.def +File: hpdftex.def 2024-01-20 v7.01h Hyperref driver for pdfTeX + +(/usr/share/texlive/texmf-dist/tex/latex/base/atveryend-ltx.sty +Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atveryend pac +kage +with kernel methods +) +\Fld@listcount=\count307 +\c@bookmark@seq@number=\count308 + +(/usr/share/texlive/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty +Package: rerunfilecheck 2022-07-10 v1.10 Rerun checks for auxiliary files (HO) + +(/usr/share/texlive/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty +Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO) +) +Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 2 +85. +)) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaserequires.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasecompatibility.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasefont.sty +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/amssymb.sty +Package: amssymb 2013/01/14 v3.01 AMS font symbols + +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/amsfonts.sty +Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support +\@emptytoks=\toks29 +\symAMSa=\mathgroup4 +\symAMSb=\mathgroup5 +LaTeX Font Info: Redeclaring math symbol \hbar on input line 98. +LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold' +(Font) U/euf/m/n --> U/euf/b/n on input line 106. 
+)) +(/usr/share/texlive/texmf-dist/tex/latex/sansmathaccent/sansmathaccent.sty +Package: sansmathaccent 2020/01/31 + +(/usr/share/texlive/texmf-dist/tex/latex/koma-script/scrlfile.sty +Package: scrlfile 2023/07/07 v3.41 KOMA-Script package (file load hooks) + +(/usr/share/texlive/texmf-dist/tex/latex/koma-script/scrlfile-hook.sty +Package: scrlfile-hook 2023/07/07 v3.41 KOMA-Script package (using LaTeX hooks) + + +(/usr/share/texlive/texmf-dist/tex/latex/koma-script/scrlogo.sty +Package: scrlogo 2023/07/07 v3.41 KOMA-Script package (logo) +))))) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetranslator.sty +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator.sty +Package: translator 2021-05-31 v1.12d Easy translation of strings in LaTeX +)) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasemisc.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetwoscreens.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseoverlay.sty +\beamer@argscount=\count309 +\beamer@lastskipcover=\skip51 +\beamer@trivlistdepth=\count310 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetitle.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasesection.sty +\c@lecture=\count311 +\c@part=\count312 +\c@section=\count313 +\c@subsection=\count314 +\c@subsubsection=\count315 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseframe.sty +\beamer@framebox=\box66 +\beamer@frametitlebox=\box67 +\beamer@zoombox=\box68 +\beamer@zoomcount=\count316 +\beamer@zoomframecount=\count317 +\beamer@frametextheight=\dimen271 +\c@subsectionslide=\count318 +\beamer@frametopskip=\skip52 +\beamer@framebottomskip=\skip53 +\beamer@frametopskipautobreak=\skip54 +\beamer@framebottomskipautobreak=\skip55 +\beamer@envbody=\toks30 +\framewidth=\dimen272 +\c@framenumber=\count319 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseverbatim.sty +\beamer@verbatimfileout=\write4 +) 
+(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseframesize.sty +\beamer@splitbox=\box69 +\beamer@autobreakcount=\count320 +\beamer@autobreaklastheight=\dimen273 +\beamer@frametitletoks=\toks31 +\beamer@framesubtitletoks=\toks32 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseframecomponents.sty +\beamer@footins=\box70 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasecolor.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasenotes.sty +\beamer@frameboxcopy=\box71 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetoc.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetemplates.sty +\beamer@sbttoks=\toks33 + +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseauxtemplates.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseboxes.sty +\bmb@box=\box72 +\bmb@colorbox=\box73 +\bmb@boxwidth=\dimen274 +\bmb@boxheight=\dimen275 +\bmb@prevheight=\dimen276 +\bmb@temp=\dimen277 +\bmb@dima=\dimen278 +\bmb@dimb=\dimen279 +\bmb@prevheight=\dimen280 +) +\beamer@blockheadheight=\dimen281 +)) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaselocalstructure.sty +(/usr/share/texlive/texmf-dist/tex/latex/tools/enumerate.sty +Package: enumerate 2023/07/04 v3.00 enumerate extensions (DPC) +\@enLab=\toks34 +) +\beamer@bibiconwidth=\skip56 +\c@figure=\count321 +\c@table=\count322 +\abovecaptionskip=\skip57 +\belowcaptionskip=\skip58 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasenavigation.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasenavigationsymbols.tex +) +\beamer@section@min@dim=\dimen282 +) (/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetheorems.sty +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsmath.sty +Package: amsmath 2023/05/13 v2.17o AMS math features +\@mathmargin=\skip59 + +For additional information on amsmath, use the `?' option. 
+(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amstext.sty +Package: amstext 2021/08/26 v2.01 AMS text + +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsgen.sty +File: amsgen.sty 1999/11/30 v2.0 generic functions +\@emptytoks=\toks35 +\ex@=\dimen283 +)) +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsbsy.sty +Package: amsbsy 1999/11/29 v1.2d Bold Symbols +\pmbraise@=\dimen284 +) +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsopn.sty +Package: amsopn 2022/04/08 v2.04 operator names +) +\inf@bad=\count323 +LaTeX Info: Redefining \frac on input line 234. +\uproot@=\count324 +\leftroot@=\count325 +LaTeX Info: Redefining \overline on input line 399. +LaTeX Info: Redefining \colon on input line 410. +\classnum@=\count326 +\DOTSCASE@=\count327 +LaTeX Info: Redefining \ldots on input line 496. +LaTeX Info: Redefining \dots on input line 499. +LaTeX Info: Redefining \cdots on input line 620. +\Mathstrutbox@=\box74 +\strutbox@=\box75 +LaTeX Info: Redefining \big on input line 722. +LaTeX Info: Redefining \Big on input line 723. +LaTeX Info: Redefining \bigg on input line 724. +LaTeX Info: Redefining \Bigg on input line 725. +\big@size=\dimen285 +LaTeX Font Info: Redeclaring font encoding OML on input line 743. +LaTeX Font Info: Redeclaring font encoding OMS on input line 744. +\macc@depth=\count328 +LaTeX Info: Redefining \bmod on input line 905. +LaTeX Info: Redefining \pmod on input line 910. +LaTeX Info: Redefining \smash on input line 940. +LaTeX Info: Redefining \relbar on input line 970. +LaTeX Info: Redefining \Relbar on input line 971. 
+\c@MaxMatrixCols=\count329 +\dotsspace@=\muskip17 +\c@parentequation=\count330 +\dspbrk@lvl=\count331 +\tag@help=\toks36 +\row@=\count332 +\column@=\count333 +\maxfields@=\count334 +\andhelp@=\toks37 +\eqnshift@=\dimen286 +\alignsep@=\dimen287 +\tagshift@=\dimen288 +\tagwidth@=\dimen289 +\totwidth@=\dimen290 +\lineht@=\dimen291 +\@envbody=\toks38 +\multlinegap=\skip60 +\multlinetaggap=\skip61 +\mathdisplay@stack=\toks39 +LaTeX Info: Redefining \[ on input line 2953. +LaTeX Info: Redefining \] on input line 2954. +) +(/usr/share/texlive/texmf-dist/tex/latex/amscls/amsthm.sty +Package: amsthm 2020/05/29 v2.20.6 +\thm@style=\toks40 +\thm@bodyfont=\toks41 +\thm@headfont=\toks42 +\thm@notefont=\toks43 +\thm@headpunct=\toks44 +\thm@preskip=\skip62 +\thm@postskip=\skip63 +\thm@headsep=\skip64 +\dth@everypar=\toks45 +) +\c@theorem=\count335 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasethemes.sty)) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerthemedefault.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerfontthemedefault.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamercolorthemedefault.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerinnerthemedefault.sty +\beamer@dima=\dimen292 +\beamer@dimb=\dimen293 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerouterthemedefault.sty))) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerthemeMadrid.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamercolorthemewhale.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamercolorthemeorchid.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerinnerthemerounded.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerouterthemeinfolines.sty)) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty +(/usr/share/texlive/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty +Package: pgf 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.code.tex 
+File: pgfmoduleshapes.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfnodeparttextbox=\box76 +) (/usr/share/texlive/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.tex +File: pgfmoduleplot.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-0-65 +.sty +Package: pgfcomp-version-0-65 2023-01-15 v3.1.10 (3.1.10) +\pgf@nodesepstart=\dimen294 +\pgf@nodesepend=\dimen295 +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-1-18 +.sty +Package: pgfcomp-version-1-18 2023-01-15 v3.1.10 (3.1.10) +)) (/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/pgffor.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex +Package: pgffor 2023-01-15 v3.1.10 (3.1.10) +\pgffor@iter=\dimen296 +\pgffor@skip=\dimen297 +\pgffor@stack=\toks46 +\pgffor@toks=\toks47 +)) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex +Package: tikz 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothandlers +.code.tex +File: pgflibraryplothandlers.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@plot@mark@count=\count336 +\pgfplotmarksize=\dimen298 +) +\tikz@lastx=\dimen299 +\tikz@lasty=\dimen300 +\tikz@lastxsaved=\dimen301 +\tikz@lastysaved=\dimen302 +\tikz@lastmovetox=\dimen303 +\tikz@lastmovetoy=\dimen304 +\tikzleveldistance=\dimen305 +\tikzsiblingdistance=\dimen306 +\tikz@figbox=\box77 +\tikz@figbox@bg=\box78 +\tikz@tempbox=\box79 +\tikz@tempbox@bg=\box80 +\tikztreelevel=\count337 +\tikznumberofchildren=\count338 +\tikznumberofcurrentchild=\count339 +\tikz@fig@count=\count340 + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.code.tex +File: pgfmodulematrix.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfmatrixcurrentrow=\count341 +\pgfmatrixcurrentcolumn=\count342 +\pgf@matrix@numberofcolumns=\count343 +) +\tikz@expandcount=\count344 + 
+(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarytopaths.code.tex +File: tikzlibrarytopaths.code.tex 2023-01-15 v3.1.10 (3.1.10) +))) +(/usr/share/texlive/texmf-dist/tex/latex/booktabs/booktabs.sty +Package: booktabs 2020/01/12 v1.61803398 Publication quality tables +\heavyrulewidth=\dimen307 +\lightrulewidth=\dimen308 +\cmidrulewidth=\dimen309 +\belowrulesep=\dimen310 +\belowbottomsep=\dimen311 +\aboverulesep=\dimen312 +\abovetopsep=\dimen313 +\cmidrulesep=\dimen314 +\cmidrulekern=\dimen315 +\defaultaddspace=\dimen316 +\@cmidla=\count345 +\@cmidlb=\count346 +\@aboverulesep=\dimen317 +\@belowrulesep=\dimen318 +\@thisruleclass=\count347 +\@lastruleclass=\count348 +\@thisrulewidth=\dimen319 +) +(/usr/share/texlive/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def +File: l3backend-pdftex.def 2024-01-04 L3 backend support: PDF output (pdfTeX) +\l__color_backend_stack_int=\count349 +\l__pdf_internal_box=\box81 +) +No file presentation.aux. +\openout1 = `presentation.aux'. + +LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 17. +LaTeX Font Info: ... okay on input line 17. +LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 17. +LaTeX Font Info: ... okay on input line 17. +LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 17. +LaTeX Font Info: ... okay on input line 17. +LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 17. +LaTeX Font Info: ... okay on input line 17. +LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 17. +LaTeX Font Info: ... okay on input line 17. +LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 17. +LaTeX Font Info: ... okay on input line 17. +LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 17. +LaTeX Font Info: ... okay on input line 17. +LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 17. +LaTeX Font Info: ... okay on input line 17. 
+LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 17. +LaTeX Font Info: ... okay on input line 17. +*geometry* driver: auto-detecting +*geometry* detected driver: pdftex +*geometry* verbose mode - [ preamble ] result: +* driver: pdftex +* paper: custom +* layout: +* layoutoffset:(h,v)=(0.0pt,0.0pt) +* modes: includehead includefoot +* h-part:(L,W,R)=(10.95003pt, 433.34402pt, 10.95003pt) +* v-part:(T,H,B)=(0.0pt, 256.0748pt, 0.0pt) +* \paperwidth=455.24408pt +* \paperheight=256.0748pt +* \textwidth=433.34402pt +* \textheight=227.62207pt +* \oddsidemargin=-61.31996pt +* \evensidemargin=-61.31996pt +* \topmargin=-72.26999pt +* \headheight=14.22636pt +* \headsep=0.0pt +* \topskip=11.0pt +* \footskip=14.22636pt +* \marginparwidth=4.0pt +* \marginparsep=10.0pt +* \columnsep=10.0pt +* \skip\footins=10.0pt plus 4.0pt minus 2.0pt +* \hoffset=0.0pt +* \voffset=0.0pt +* \mag=1000 +* \@twocolumnfalse +* \@twosidefalse +* \@mparswitchfalse +* \@reversemarginfalse +* (1in=72.27pt=25.4mm, 1cm=28.453pt) + +(/usr/share/texlive/texmf-dist/tex/context/base/mkii/supp-pdf.mkii +[Loading MPS to PDF converter (version 2006.09.02).] +\scratchcounter=\count350 +\scratchdimen=\dimen320 +\scratchbox=\box82 +\nofMPsegments=\count351 +\nofMParguments=\count352 +\everyMPshowfont=\toks48 +\MPscratchCnt=\count353 +\MPscratchDim=\dimen321 +\MPnumerator=\count354 +\makeMPintoPDFobject=\count355 +\everyMPtoPDFconversion=\toks49 +) (/usr/share/texlive/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty +Package: epstopdf-base 2020-01-24 v2.11 Base part for package epstopdf +Package epstopdf-base Info: Redefining graphics rule for `.eps' on input line 4 +85. + +(/usr/share/texlive/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg +File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Liv +e +)) +Package hyperref Info: Link coloring OFF on input line 17. +\@outlinefile=\write5 +\openout5 = `presentation.out'. 
+ +LaTeX Font Info: Overwriting symbol font `operators' in version `normal' +(Font) OT1/cmr/m/n --> OT1/cmss/m/n on input line 17. +LaTeX Font Info: Overwriting symbol font `operators' in version `bold' +(Font) OT1/cmr/bx/n --> OT1/cmss/b/n on input line 17. +\symnumbers=\mathgroup6 +\sympureletters=\mathgroup7 +LaTeX Font Info: Overwriting math alphabet `\mathrm' in version `normal' +(Font) OT1/cmss/m/n --> OT1/cmr/m/n on input line 17. +LaTeX Font Info: Redeclaring math alphabet \mathbf on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `normal' +(Font) OT1/cmr/bx/n --> OT1/cmss/b/n on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `bold' +(Font) OT1/cmr/bx/n --> OT1/cmss/b/n on input line 17. +LaTeX Font Info: Redeclaring math alphabet \mathsf on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `normal' +(Font) OT1/cmss/m/n --> OT1/cmss/m/n on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `bold' +(Font) OT1/cmss/bx/n --> OT1/cmss/m/n on input line 17. +LaTeX Font Info: Redeclaring math alphabet \mathit on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathit' in version `normal' +(Font) OT1/cmr/m/it --> OT1/cmss/m/it on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathit' in version `bold' +(Font) OT1/cmr/bx/it --> OT1/cmss/m/it on input line 17. +LaTeX Font Info: Redeclaring math alphabet \mathtt on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `normal' +(Font) OT1/cmtt/m/n --> OT1/cmtt/m/n on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `bold' +(Font) OT1/cmtt/m/n --> OT1/cmtt/m/n on input line 17. +LaTeX Font Info: Overwriting symbol font `numbers' in version `bold' +(Font) OT1/cmss/m/n --> OT1/cmss/b/n on input line 17. 
+LaTeX Font Info: Overwriting symbol font `pureletters' in version `bold' +(Font) OT1/cmss/m/it --> OT1/cmss/b/it on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathrm' in version `bold' +(Font) OT1/cmss/b/n --> OT1/cmr/b/n on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `bold' +(Font) OT1/cmss/b/n --> OT1/cmss/b/n on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `bold' +(Font) OT1/cmss/m/n --> OT1/cmss/b/n on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathit' in version `bold' +(Font) OT1/cmss/m/it --> OT1/cmss/b/it on input line 17. +LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `bold' +(Font) OT1/cmtt/m/n --> OT1/cmtt/b/n on input line 17. +LaTeX Font Info: Redeclaring symbol font `pureletters' on input line 17. +LaTeX Font Info: Overwriting symbol font `pureletters' in version `normal' +(Font) OT1/cmss/m/it --> OT1/mathkerncmss/m/sl on input line 1 +7. +LaTeX Font Info: Overwriting symbol font `pureletters' in version `bold' +(Font) OT1/cmss/b/it --> OT1/mathkerncmss/m/sl on input line 1 +7. +LaTeX Font Info: Overwriting symbol font `pureletters' in version `bold' +(Font) OT1/mathkerncmss/m/sl --> OT1/mathkerncmss/bx/sl on inp +ut line 17. 
+ +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-basic-dictionary +-English.dict +Dictionary: translator-basic-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-bibliography-dic +tionary-English.dict +Dictionary: translator-bibliography-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-environment-dict +ionary-English.dict +Dictionary: translator-environment-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-months-dictionar +y-English.dict +Dictionary: translator-months-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-numbers-dictiona +ry-English.dict +Dictionary: translator-numbers-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-theorem-dictiona +ry-English.dict +Dictionary: translator-theorem-dictionary, Language: English +) +No file presentation.nav. +[1 + +{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}] +No file presentation.toc. +[2 + +] +LaTeX Font Info: Trying to load font information for U+msa on input line 44. + + (/usr/share/texlive/texmf-dist/tex/latex/amsfonts/umsa.fd +File: umsa.fd 2013/01/14 v3.01 AMS symbols A +) +LaTeX Font Info: Trying to load font information for U+msb on input line 44. + + +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/umsb.fd +File: umsb.fd 2013/01/14 v3.01 AMS symbols B +) +LaTeX Font Info: Trying to load font information for OT1+mathkerncmss on inp +ut line 44. + +(/usr/share/texlive/texmf-dist/tex/latex/sansmathaccent/ot1mathkerncmss.fd +File: ot1mathkerncmss.fd 2020/01/31 Fontinst v1.933 font definitions for OT1/ma +thkerncmss. +) +[3 + +] [4 + +] [5 + +] [6 + +] [7 + +] +\tf@nav=\write6 +\openout6 = `presentation.nav'. + +\tf@toc=\write7 +\openout7 = `presentation.toc'. + +\tf@snm=\write8 +\openout8 = `presentation.snm'. 
+ + (./presentation.aux) + *********** +LaTeX2e <2023-11-01> patch level 1 +L3 programming layer <2024-01-22> + *********** + + +Package rerunfilecheck Warning: File `presentation.out' has changed. +(rerunfilecheck) Rerun to get outlines right +(rerunfilecheck) or use package `bookmark'. + +Package rerunfilecheck Info: Checksums for `presentation.out': +(rerunfilecheck) Before: +(rerunfilecheck) After: 2D39CDAEFFC696FCD00160D97059AF7F;1042. + ) +Here is how much of TeX's memory you used: + 24731 strings out of 475495 + 492143 string characters out of 5782356 + 1936975 words of memory out of 5000000 + 46265 multiletter control sequences out of 15000+600000 + 569725 words of font info for 78 fonts, out of 8000000 for 9000 + 497 hyphenation exceptions out of 8191 + 128i,15n,123p,387b,599s stack positions out of 10000i,1000n,20000p,200000b,200000s + +Output written on presentation.pdf (7 pages, 118824 bytes). +PDF statistics: + 280 PDF objects out of 1000 (max. 8388607) + 229 compressed objects within 3 object streams + 20 named destinations out of 1000 (max. 500000) + 91 words of extra memory for PDF output out of 10000 (max. 
10000000) + diff --git a/slides/presentation.nav b/slides/presentation.nav new file mode 100644 index 0000000000000000000000000000000000000000..4bc08d8478299e25ccd5c1213a10dfed6e715f2e --- /dev/null +++ b/slides/presentation.nav @@ -0,0 +1,34 @@ +\headcommand {\slideentry {0}{0}{1}{1/1}{}{0}} +\headcommand {\beamer@framepages {1}{1}} +\headcommand {\slideentry {0}{0}{2}{2/2}{}{0}} +\headcommand {\beamer@framepages {2}{2}} +\headcommand {\beamer@sectionpages {1}{2}} +\headcommand {\beamer@subsectionpages {1}{2}} +\headcommand {\sectionentry {1}{So-VITS-SVC (Baseline)}{3}{So-VITS-SVC (Baseline)}{0}} +\headcommand {\slideentry {1}{0}{1}{3/3}{}{0}} +\headcommand {\beamer@framepages {3}{3}} +\headcommand {\beamer@sectionpages {3}{3}} +\headcommand {\beamer@subsectionpages {3}{3}} +\headcommand {\sectionentry {2}{V1 Architecture: DAC \& Flow Matching}{4}{V1 Architecture: DAC \& Flow Matching}{0}} +\headcommand {\slideentry {2}{0}{1}{4/4}{}{0}} +\headcommand {\beamer@framepages {4}{4}} +\headcommand {\beamer@sectionpages {4}{4}} +\headcommand {\beamer@subsectionpages {4}{4}} +\headcommand {\sectionentry {3}{Better V1: Latent Compression}{5}{Better V1: Latent Compression}{0}} +\headcommand {\slideentry {3}{0}{1}{5/5}{}{0}} +\headcommand {\beamer@framepages {5}{5}} +\headcommand {\beamer@sectionpages {5}{5}} +\headcommand {\beamer@subsectionpages {5}{5}} +\headcommand {\sectionentry {4}{V2: F5-SVC (Current Implementation)}{6}{V2: F5-SVC (Current Implementation)}{0}} +\headcommand {\slideentry {4}{0}{1}{6/6}{}{0}} +\headcommand {\beamer@framepages {6}{6}} +\headcommand {\beamer@sectionpages {6}{6}} +\headcommand {\beamer@subsectionpages {6}{6}} +\headcommand {\sectionentry {5}{Architectural Comparison Summary}{7}{Architectural Comparison Summary}{0}} +\headcommand {\slideentry {5}{0}{1}{7/7}{}{0}} +\headcommand {\beamer@framepages {7}{7}} +\headcommand {\beamer@partpages {1}{7}} +\headcommand {\beamer@subsectionpages {7}{7}} +\headcommand {\beamer@sectionpages {7}{7}} 
+\headcommand {\beamer@documentpages {7}} +\headcommand {\gdef \inserttotalframenumber {7}} diff --git a/slides/presentation.out b/slides/presentation.out new file mode 100644 index 0000000000000000000000000000000000000000..1f8cb0ec328679827f46dbfaa1264cc867cfccf7 --- /dev/null +++ b/slides/presentation.out @@ -0,0 +1,5 @@ +\BOOKMARK [2][]{Outline0.1}{\376\377\000S\000o\000-\000V\000I\000T\000S\000-\000S\000V\000C\000\040\000\050\000B\000a\000s\000e\000l\000i\000n\000e\000\051}{}% 1 +\BOOKMARK [2][]{Outline0.2}{\376\377\000V\0001\000\040\000A\000r\000c\000h\000i\000t\000e\000c\000t\000u\000r\000e\000:\000\040\000D\000A\000C\000\040\000\046\000\040\000F\000l\000o\000w\000\040\000M\000a\000t\000c\000h\000i\000n\000g}{}% 2 +\BOOKMARK [2][]{Outline0.3}{\376\377\000B\000e\000t\000t\000e\000r\000\040\000V\0001\000:\000\040\000L\000a\000t\000e\000n\000t\000\040\000C\000o\000m\000p\000r\000e\000s\000s\000i\000o\000n}{}% 3 +\BOOKMARK [2][]{Outline0.4}{\376\377\000V\0002\000:\000\040\000F\0005\000-\000S\000V\000C\000\040\000\050\000C\000u\000r\000r\000e\000n\000t\000\040\000I\000m\000p\000l\000e\000m\000e\000n\000t\000a\000t\000i\000o\000n\000\051}{}% 4 +\BOOKMARK [2][]{Outline0.5}{\376\377\000A\000r\000c\000h\000i\000t\000e\000c\000t\000u\000r\000a\000l\000\040\000C\000o\000m\000p\000a\000r\000i\000s\000o\000n\000\040\000S\000u\000m\000m\000a\000r\000y}{}% 5 diff --git a/slides/presentation.pdf b/slides/presentation.pdf new file mode 100644 index 0000000000000000000000000000000000000000..ac70ea2510b28f51585dba45cb5e5a9738c65b24 --- /dev/null +++ b/slides/presentation.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd6c5cfba099094eb4766dba99898d2a7aaa87d0013b70e5b0b4394c382ff6a0 +size 118824 diff --git a/slides/presentation.snm b/slides/presentation.snm new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/slides/presentation.tex b/slides/presentation.tex new file mode 100644 
index 0000000000000000000000000000000000000000..38871c6c5557f9256edb097a9e49d7df3c2c094d --- /dev/null +++ b/slides/presentation.tex @@ -0,0 +1,133 @@ +\documentclass[aspectratio=169]{beamer} + +\usetheme{Madrid} +\usecolortheme{default} + +% Packages +\usepackage{tikz} +\usepackage{booktabs} +\usepackage{graphicx} +\usepackage{amsmath} + +\title{Singing Voice Conversion Architecture Evolution} +\subtitle{From So-VITS-SVC to F5-SVC} +\author{F5-SVC Development Team} +\date{\today} + +\begin{document} + +\begin{frame} + \titlepage +\end{frame} + +\begin{frame}{Overview of Architectures} + \tableofcontents +\end{frame} + +% Section 1: So-VITS-SVC +\section{So-VITS-SVC (Baseline)} +\begin{frame}{So-VITS-SVC (The Baseline)} + \textbf{Core Concept}: Use a Normalizing Flow (NF) in a comparatively low-dimensional space, directly generating waveforms using a learned vocoder. + \vspace{0.5cm} + + \begin{itemize} + \item \textbf{Input features}: PPG (Content) + F0 (Pitch) + Spk (Timbre) + \item \textbf{Acoustic Model}: Normalizing Flow maps noise to $z$ (192-dim space) + \item \textbf{Vocoder}: Learned HiFi-GAN (VITS Decoder) translates $z$ to Waveform + \item \textbf{Fine-tuning}: Full model fine-tuning for new speakers + \end{itemize} + \vspace{0.2cm} + + \textbf{Pipeline}:\\ + \centering + \texttt{PPG/F0/Spk $\rightarrow$ NF Flow (192-dim) $\rightarrow$ Learned HiFi-GAN $\rightarrow$ Audio} +\end{frame} + +% Section 2: V1 +\section{V1 Architecture: DAC \& Flow Matching} +\begin{frame}{V1: High-Fidelity Flow Matching} + \textbf{Core Concept}: Scale up using a powerful Discrete Audio Codec (DAC) space and Continuous Normalizing Flow (Flow Matching) via Diffusion Transformers (DiT). 
+ \vspace{0.5cm} + + \begin{itemize} + \item \textbf{Input features}: PPG / HuBERT / F0 + Spk Embedding + \item \textbf{Acoustic Model}: DiT Flow Matching in 1024-dim DAC continuous space + \item \textbf{Vocoder}: Frozen DAC Decoder translates 1024-dim $z$ to Waveform + \item \textbf{Challenge}: High-dimensional flow matching (1024-dim) is computationally expensive and difficult to converge efficiently. + \end{itemize} + + \vspace{0.5cm} + \textbf{Pipeline}:\\ + \centering + \texttt{PPG/HuBERT/F0 $\rightarrow$ DiT Flow (1024-dim) $\rightarrow$ Frozen DAC Decode} +\end{frame} + +% Section 3: Better V1 (Proposed AE) +\section{Better V1: Latent Compression} +\begin{frame}{Better V1: Flow Matching with Latent Compression} + \textbf{Core Concept}: Pre-train an Autoencoder (AE) to compress the 1024-dim DAC latent space down to 256-dim, thereby making Flow Matching much easier. + \vspace{0.3cm} + + \begin{itemize} + \item \textbf{Step 1 (Pre-train AE)}: Compress DAC $z$ (1024) $\rightarrow$ $z_{comp}$ (256) $\rightarrow$ Recon $z$ (1024)\\ + \textit{(Trained with reconstruction + perceptual loss)} + \item \textbf{Step 2 (Flow Matching)}: DiT operates in the compressed 256-dim space + \item \textbf{Step 3 (Decoding)}: Project back to 1024-dim using AE Decoder, then pass to frozen DAC Decoder + \end{itemize} + + \vspace{0.3cm} + \textbf{Pipeline}:\\ + \centering + \texttt{PPG/\dots $\rightarrow$ DiT Flow (256-dim) $\rightarrow$ AE Upsample (1024) $\rightarrow$ DAC Decode} +\end{frame} + +% Section 4: V2 (F5-TTS pipeline) +\section{V2: F5-SVC (Current Implementation)} +\begin{frame}{V2: F5-SVC (Zero-Shot / Few-Shot TTS Base)} + \textbf{Core Concept}: Leverage massive pre-trained TTS foundations (F5-TTS) generating Mel-spectrograms, overriding TTS text conditioning with SVC representations. 
+ \vspace{0.3cm} + + \begin{itemize} + \item \textbf{Input}: PPG / HuBERT / F0 $\rightarrow$ \textit{SVCCondAdapter} (projected to text\_dim) + \item \textbf{Acoustic Model}: Pre-trained frozen F5-DiT with Stacked LoRA + \begin{itemize} + \item \textbf{Stage 1}: Rank-16 LoRA (singing adaptation, $W + A_1 B_1$) + \item \textbf{Stage 2}: Rank-4 LoRA (speaker adaptation, $W + A_1 B_1 + A_2 B_2$) + \end{itemize} + \item \textbf{Target Space}: 100-dim Mel-spectrograms (24kHz format) + \item \textbf{Vocoder}: Frozen Vocos (Mel $\rightarrow$ Audio) + \end{itemize} + + \vspace{0.1cm} + \textbf{Pipeline}:\\ + \centering + \texttt{PPG/\dots $\rightarrow$ F5-DiT (+ Stacked LoRA) (100-dim Mel) $\rightarrow$ Vocos $\rightarrow$ Audio} +\end{frame} + +% Section 5: Architecture Comparison +\section{Architectural Comparison Summary} +\begin{frame}{Target Space \& Dimensionality Comparison} + \begin{table}[] + \centering + \begin{tabular}{l l l l} + \toprule + \textbf{Architecture} & \textbf{Decoding Space} & \textbf{Space Dim.} & \textbf{Vocoder} \\ + \midrule + So-VITS-SVC & Latent $z$ (VITS) & 192 & Learned HiFi-GAN \\ + V1 & Latent $z$ (DAC) & 1024 & Frozen DAC \\ + Better V1 (AE) & Compressed $z_{comp}$ & 256 & AE + Frozen DAC \\ + V2 (F5-SVC) & Mel-Spectrogram & 100 & Frozen Vocos \\ + \bottomrule + \end{tabular} + \end{table} + + \vspace{0.5cm} + \textbf{Takeaways}: + \begin{itemize} + \item \textbf{V1} suffered from massive dimensionality (1024-dim flows are hard). + \item \textbf{Better V1} solves this by compressing the target to 256-dim. + \item \textbf{V2} (Current) leverages a massive pre-trained model (F5-TTS) in an already highly-compressed, well-understood space (100-dim Log-Mel) with PEFT (LoRA). 
+ \end{itemize} +\end{frame} + +\end{document} diff --git a/slides/presentation.toc b/slides/presentation.toc new file mode 100644 index 0000000000000000000000000000000000000000..6d7c55633ed5e3134714ccce963064041b9a3b17 --- /dev/null +++ b/slides/presentation.toc @@ -0,0 +1,5 @@ +\beamer@sectionintoc {1}{So-VITS-SVC (Baseline)}{3}{0}{1} +\beamer@sectionintoc {2}{V1 Architecture: DAC \& Flow Matching}{4}{0}{2} +\beamer@sectionintoc {3}{Better V1: Latent Compression}{5}{0}{3} +\beamer@sectionintoc {4}{V2: F5-SVC (Current Implementation)}{6}{0}{4} +\beamer@sectionintoc {5}{Architectural Comparison Summary}{7}{0}{5} diff --git a/slides/time_embedding_analysis.aux b/slides/time_embedding_analysis.aux new file mode 100644 index 0000000000000000000000000000000000000000..9995403862ba26998fbdbaf0dc32e982d51ae473 --- /dev/null +++ b/slides/time_embedding_analysis.aux @@ -0,0 +1,32 @@ +\relax +\providecommand\hyper@newdestlabel[2]{} +\providecommand\HyField@AuxAddToFields[1]{} +\providecommand\HyField@AuxAddToCoFields[2]{} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{1}{1/1}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {1}{1}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{2}{2/2}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {2}{2}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{3}{3/3}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {3}{3}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{4}{4/4}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {4}{4}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{5}{5/5}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {5}{5}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{6}{6/6}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {6}{6}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{7}{7/7}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {7}{7}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{8}{8/8}{}{0}}} 
+\@writefile{nav}{\headcommand {\beamer@framepages {8}{8}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{9}{9/9}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {9}{9}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{10}{10/10}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {10}{10}}} +\@writefile{nav}{\headcommand {\slideentry {0}{0}{11}{11/11}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {11}{11}}} +\@writefile{nav}{\headcommand {\beamer@partpages {1}{11}}} +\@writefile{nav}{\headcommand {\beamer@subsectionpages {1}{11}}} +\@writefile{nav}{\headcommand {\beamer@sectionpages {1}{11}}} +\@writefile{nav}{\headcommand {\beamer@documentpages {11}}} +\@writefile{nav}{\headcommand {\gdef \inserttotalframenumber {11}}} +\gdef \@abspage@last{11} diff --git a/slides/time_embedding_analysis.log b/slides/time_embedding_analysis.log new file mode 100644 index 0000000000000000000000000000000000000000..73ae881b4f8760e625aed980c9cb1125fcb071b6 --- /dev/null +++ b/slides/time_embedding_analysis.log @@ -0,0 +1,1315 @@ +This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023/Debian) (preloaded format=pdflatex 2026.1.13) 5 MAR 2026 15:54 +entering extended mode + restricted \write18 enabled. + %&-line parsing enabled. 
+**time_embedding_analysis.tex +(./time_embedding_analysis.tex +LaTeX2e <2023-11-01> patch level 1 +L3 programming layer <2024-01-22> +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamer.cls +Document Class: beamer 2024/01/06 v3.71 A class for typesetting presentations +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasemodes.sty +(/usr/share/texlive/texmf-dist/tex/latex/etoolbox/etoolbox.sty +Package: etoolbox 2020/10/05 v2.5k e-TeX tools for LaTeX (JAW) +\etb@tempcnta=\count187 +) +\beamer@tempbox=\box51 +\beamer@tempcount=\count188 +\c@beamerpauses=\count189 + +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasedecode.sty +\beamer@slideinframe=\count190 +\beamer@minimum=\count191 +\beamer@decode@box=\box52 +) +\beamer@commentbox=\box53 +\beamer@modecount=\count192 +) +(/usr/share/texlive/texmf-dist/tex/generic/iftex/iftex.sty +Package: iftex 2022/02/03 v1.0f TeX engine tests +) +\headdp=\dimen140 +\footheight=\dimen141 +\sidebarheight=\dimen142 +\beamer@tempdim=\dimen143 +\beamer@finalheight=\dimen144 +\beamer@animht=\dimen145 +\beamer@animdp=\dimen146 +\beamer@animwd=\dimen147 +\beamer@leftmargin=\dimen148 +\beamer@rightmargin=\dimen149 +\beamer@leftsidebar=\dimen150 +\beamer@rightsidebar=\dimen151 +\beamer@boxsize=\dimen152 +\beamer@vboxoffset=\dimen153 +\beamer@descdefault=\dimen154 +\beamer@descriptionwidth=\dimen155 +\beamer@lastskip=\skip48 +\beamer@areabox=\box54 +\beamer@animcurrent=\box55 +\beamer@animshowbox=\box56 +\beamer@sectionbox=\box57 +\beamer@logobox=\box58 +\beamer@linebox=\box59 +\beamer@sectioncount=\count193 +\beamer@subsubsectionmax=\count194 +\beamer@subsectionmax=\count195 +\beamer@sectionmax=\count196 +\beamer@totalheads=\count197 +\beamer@headcounter=\count198 +\beamer@partstartpage=\count199 +\beamer@sectionstartpage=\count266 +\beamer@subsectionstartpage=\count267 +\beamer@animationtempa=\count268 +\beamer@animationtempb=\count269 +\beamer@xpos=\count270 +\beamer@ypos=\count271 +\beamer@ypos@offset=\count272 
+\beamer@showpartnumber=\count273 +\beamer@currentsubsection=\count274 +\beamer@coveringdepth=\count275 +\beamer@sectionadjust=\count276 +\beamer@toclastsection=\count277 +\beamer@tocsectionnumber=\count278 + +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseoptions.sty +(/usr/share/texlive/texmf-dist/tex/latex/graphics/keyval.sty +Package: keyval 2022/05/29 v1.15 key=value parser (DPC) +\KV@toks@=\toks17 +)) +\beamer@paperwidth=\skip49 +\beamer@paperheight=\skip50 + +(/usr/share/texlive/texmf-dist/tex/latex/geometry/geometry.sty +Package: geometry 2020/01/02 v5.9 Page Geometry + +(/usr/share/texlive/texmf-dist/tex/generic/iftex/ifvtex.sty +Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead. +) +\Gm@cnth=\count279 +\Gm@cntv=\count280 +\c@Gm@tempcnt=\count281 +\Gm@bindingoffset=\dimen156 +\Gm@wd@mp=\dimen157 +\Gm@odd@mp=\dimen158 +\Gm@even@mp=\dimen159 +\Gm@layoutwidth=\dimen160 +\Gm@layoutheight=\dimen161 +\Gm@layouthoffset=\dimen162 +\Gm@layoutvoffset=\dimen163 +\Gm@dimlist=\toks18 +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/math/pgfmath.sty +(/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.tex +\pgfutil@everybye=\toks19 +\pgfutil@tempdima=\dimen164 +\pgfutil@tempdimb=\dimen165 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def +\pgfutil@abb=\box60 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgf/pgf.revision.tex) +Package: pgfrcs 2023-01-15 v3.1.10 (3.1.10) +)) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex +\pgfkeys@pathtoks=\toks20 +\pgfkeys@temptoks=\toks21 + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgfkeyslibraryfiltered +.code.tex +\pgfkeys@tmptoks=\toks22 +))) 
+(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex +\pgf@x=\dimen166 +\pgf@xa=\dimen167 +\pgf@xb=\dimen168 +\pgf@xc=\dimen169 +\pgf@y=\dimen170 +\pgf@ya=\dimen171 +\pgf@yb=\dimen172 +\pgf@yc=\dimen173 +\c@pgf@counta=\count282 +\c@pgf@countb=\count283 +\c@pgf@countc=\count284 +\c@pgf@countd=\count285 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex +\pgfmath@dimen=\dimen174 +\pgfmath@count=\count286 +\pgfmath@box=\box61 +\pgfmath@toks=\toks23 +\pgfmath@stack@operand=\toks24 +\pgfmath@stack@operation=\toks25 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic.code +.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigonomet +ric.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.random.cod +e.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.comparison +.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.code. +tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round.code +.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.code. 
+tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integerari +thmetics.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex +\c@pgfmathroundto@lastzeros=\count287 +))) +(/usr/share/texlive/texmf-dist/tex/latex/base/size11.clo +File: size11.clo 2023/05/17 v1.4n Standard LaTeX file (size option) +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty +(/usr/share/texlive/texmf-dist/tex/latex/graphics/graphicx.sty +Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR) + +(/usr/share/texlive/texmf-dist/tex/latex/graphics/graphics.sty +Package: graphics 2022/03/10 v1.4e Standard LaTeX Graphics (DPC,SPQR) + +(/usr/share/texlive/texmf-dist/tex/latex/graphics/trig.sty +Package: trig 2021/08/11 v1.11 sin cos tan (DPC) +) +(/usr/share/texlive/texmf-dist/tex/latex/graphics-cfg/graphics.cfg +File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration +) +Package graphics Info: Driver file: pdftex.def on input line 107. 
+ +(/usr/share/texlive/texmf-dist/tex/latex/graphics-def/pdftex.def +File: pdftex.def 2022/09/22 v1.2b Graphics/color driver for pdftex +)) +\Gin@req@height=\dimen175 +\Gin@req@width=\dimen176 +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex +Package: pgfsys 2023-01-15 v3.1.10 (3.1.10) +\pgf@x=\dimen177 +\pgf@y=\dimen178 +\pgf@xa=\dimen179 +\pgf@ya=\dimen180 +\pgf@xb=\dimen181 +\pgf@yb=\dimen182 +\pgf@xc=\dimen183 +\pgf@yc=\dimen184 +\pgf@xd=\dimen185 +\pgf@yd=\dimen186 +\w@pgf@writea=\write3 +\r@pgf@reada=\read2 +\c@pgf@counta=\count288 +\c@pgf@countb=\count289 +\c@pgf@countc=\count290 +\c@pgf@countd=\count291 +\t@pgf@toka=\toks26 +\t@pgf@tokb=\toks27 +\t@pgf@tokc=\toks28 +\pgf@sys@id@count=\count292 + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg +File: pgf.cfg 2023-01-15 v3.1.10 (3.1.10) +) +Driver file for pgf: pgfsys-pdftex.def + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-pdftex.def +File: pgfsys-pdftex.def 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-pdf.de +f +File: pgfsys-common-pdf.def 2023-01-15 v3.1.10 (3.1.10) +))) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.code. +tex +File: pgfsyssoftpath.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfsyssoftpath@smallbuffer@items=\count293 +\pgfsyssoftpath@bigbuffer@items=\count294 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.code. +tex +File: pgfsysprotocol.code.tex 2023-01-15 v3.1.10 (3.1.10) +)) (/usr/share/texlive/texmf-dist/tex/latex/xcolor/xcolor.sty +Package: xcolor 2023/11/15 v3.01 LaTeX color extensions (UK) + +(/usr/share/texlive/texmf-dist/tex/latex/graphics-cfg/color.cfg +File: color.cfg 2016/01/02 v1.6 sample color configuration +) +Package xcolor Info: Driver file: pdftex.def on input line 274. 
+ +(/usr/share/texlive/texmf-dist/tex/latex/graphics/mathcolor.ltx) +Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1350. +Package xcolor Info: Model `hsb' substituted by `rgb' on input line 1354. +Package xcolor Info: Model `RGB' extended on input line 1366. +Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1368. +Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1369. +Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1370. +Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1371. +Package xcolor Info: Model `Gray' substituted by `gray' on input line 1372. +Package xcolor Info: Model `wave' substituted by `hsb' on input line 1373. +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex +Package: pgfcore 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/math/pgfint.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.code.te +x +File: pgfcorepoints.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@picminx=\dimen187 +\pgf@picmaxx=\dimen188 +\pgf@picminy=\dimen189 +\pgf@picmaxy=\dimen190 +\pgf@pathminx=\dimen191 +\pgf@pathmaxx=\dimen192 +\pgf@pathminy=\dimen193 +\pgf@pathmaxy=\dimen194 +\pgf@xx=\dimen195 +\pgf@xy=\dimen196 +\pgf@yx=\dimen197 +\pgf@yy=\dimen198 +\pgf@zx=\dimen199 +\pgf@zy=\dimen256 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconstruct. 
+code.tex +File: pgfcorepathconstruct.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@path@lastx=\dimen257 +\pgf@path@lasty=\dimen258 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage.code +.tex +File: pgfcorepathusage.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@shorten@end@additional=\dimen259 +\pgf@shorten@start@additional=\dimen260 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.code.te +x +File: pgfcorescopes.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfpic=\box62 +\pgf@hbox=\box63 +\pgf@layerbox@main=\box64 +\pgf@picture@serial@count=\count295 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicstate.c +ode.tex +File: pgfcoregraphicstate.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgflinewidth=\dimen261 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransformation +s.code.tex +File: pgfcoretransformations.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@pt@x=\dimen262 +\pgf@pt@y=\dimen263 +\pgf@pt@temp=\dimen264 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.code.tex +File: pgfcorequick.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.code.t +ex +File: pgfcoreobjects.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathprocessing +.code.tex +File: pgfcorepathprocessing.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.code.te +x +File: pgfcorearrows.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfarrowsep=\dimen265 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.code.tex +File: pgfcoreshade.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@max=\dimen266 +\pgf@sys@shading@range@num=\count296 +\pgf@shadingcount=\count297 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.code.tex +File: pgfcoreimage.code.tex 2023-01-15 v3.1.10 
(3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.code. +tex +File: pgfcoreexternal.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfexternal@startupbox=\box65 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.code.te +x +File: pgfcorelayers.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransparency.c +ode.tex +File: pgfcoretransparency.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.code. +tex +File: pgfcorepatterns.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.tex +File: pgfcorerdf.code.tex 2023-01-15 v3.1.10 (3.1.10) +))) (/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/xxcolor.sty +Package: xxcolor 2003/10/24 ver 0.1 +\XC@nummixins=\count298 +\XC@countmixins=\count299 +) +(/usr/share/texlive/texmf-dist/tex/latex/base/atbegshi-ltx.sty +Package: atbegshi-ltx 2021/01/10 v1.0c Emulation of the original atbegshi +package with kernel methods +) +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/hyperref.sty +Package: hyperref 2024-01-20 v7.01h Hypertext links for LaTeX + +(/usr/share/texlive/texmf-dist/tex/latex/kvsetkeys/kvsetkeys.sty +Package: kvsetkeys 2022-10-05 v1.19 Key value parser (HO) +) +(/usr/share/texlive/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty +Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO) +) +(/usr/share/texlive/texmf-dist/tex/generic/pdfescape/pdfescape.sty +Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO) + +(/usr/share/texlive/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty +Package: ltxcmds 2023-12-04 v1.26 LaTeX kernel commands for general use (HO) +) +(/usr/share/texlive/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty +Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO +) + 
+(/usr/share/texlive/texmf-dist/tex/generic/infwarerr/infwarerr.sty +Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO) +) +Package pdftexcmds Info: \pdf@primitive is available. +Package pdftexcmds Info: \pdf@ifprimitive is available. +Package pdftexcmds Info: \pdfdraftmode found. +)) +(/usr/share/texlive/texmf-dist/tex/latex/hycolor/hycolor.sty +Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO) +) +(/usr/share/texlive/texmf-dist/tex/latex/auxhook/auxhook.sty +Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO) +) +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/nameref.sty +Package: nameref 2023-11-26 v2.56 Cross-referencing by name of section + +(/usr/share/texlive/texmf-dist/tex/latex/refcount/refcount.sty +Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO) +) +(/usr/share/texlive/texmf-dist/tex/generic/gettitlestring/gettitlestring.sty +Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO) + +(/usr/share/texlive/texmf-dist/tex/latex/kvoptions/kvoptions.sty +Package: kvoptions 2022-06-15 v3.15 Key value format for package options (HO) +)) +\c@section@level=\count300 +) +\@linkdim=\dimen267 +\Hy@linkcounter=\count301 +\Hy@pagecounter=\count302 + +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/pd1enc.def +File: pd1enc.def 2024-01-20 v7.01h Hyperref: PDFDocEncoding definition (HO) +Now handling font encoding PD1 ... +... no UTF-8 mapping file for font encoding PD1 +) +(/usr/share/texlive/texmf-dist/tex/generic/intcalc/intcalc.sty +Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO) +) +\Hy@SavedSpaceFactor=\count303 + +(/usr/share/texlive/texmf-dist/tex/latex/hyperref/puenc.def +File: puenc.def 2024-01-20 v7.01h Hyperref: PDF Unicode definition (HO) +Now handling font encoding PU ... +... no UTF-8 mapping file for font encoding PU +) +Package hyperref Info: Option `bookmarks' set `true' on input line 4062. 
+Package hyperref Info: Option `bookmarksopen' set `true' on input line 4062. +Package hyperref Info: Option `implicit' set `false' on input line 4062. +Package hyperref Info: Hyper figures OFF on input line 4179. +Package hyperref Info: Link nesting OFF on input line 4184. +Package hyperref Info: Hyper index ON on input line 4187. +Package hyperref Info: Plain pages OFF on input line 4194. +Package hyperref Info: Backreferencing OFF on input line 4199. +Package hyperref Info: Implicit mode OFF; no redefinition of LaTeX internals. +Package hyperref Info: Bookmarks ON on input line 4446. +\c@Hy@tempcnt=\count304 + +(/usr/share/texlive/texmf-dist/tex/latex/url/url.sty +\Urlmuskip=\muskip16 +Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc. +) +LaTeX Info: Redefining \url on input line 4784. +\XeTeXLinkMargin=\dimen268 + +(/usr/share/texlive/texmf-dist/tex/generic/bitset/bitset.sty +Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO) + +(/usr/share/texlive/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty +Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO +) +)) +\Fld@menulength=\count305 +\Field@Width=\dimen269 +\Fld@charsize=\dimen270 +Package hyperref Info: Hyper figures OFF on input line 6063. +Package hyperref Info: Link nesting OFF on input line 6068. +Package hyperref Info: Hyper index ON on input line 6071. +Package hyperref Info: backreferencing OFF on input line 6078. +Package hyperref Info: Link coloring OFF on input line 6083. +Package hyperref Info: Link coloring with OCG OFF on input line 6088. +Package hyperref Info: PDF/A mode OFF on input line 6093. +\Hy@abspage=\count306 + + +Package hyperref Message: Stopped early. + +) +Package hyperref Info: Driver (autodetected): hpdftex. 
+ (/usr/share/texlive/texmf-dist/tex/latex/hyperref/hpdftex.def +File: hpdftex.def 2024-01-20 v7.01h Hyperref driver for pdfTeX + +(/usr/share/texlive/texmf-dist/tex/latex/base/atveryend-ltx.sty +Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atveryend pac +kage +with kernel methods +) +\Fld@listcount=\count307 +\c@bookmark@seq@number=\count308 + +(/usr/share/texlive/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty +Package: rerunfilecheck 2022-07-10 v1.10 Rerun checks for auxiliary files (HO) + +(/usr/share/texlive/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty +Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO) +) +Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 2 +85. +)) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaserequires.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasecompatibility.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasefont.sty +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/amssymb.sty +Package: amssymb 2013/01/14 v3.01 AMS font symbols + +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/amsfonts.sty +Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support +\@emptytoks=\toks29 +\symAMSa=\mathgroup4 +\symAMSb=\mathgroup5 +LaTeX Font Info: Redeclaring math symbol \hbar on input line 98. +LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold' +(Font) U/euf/m/n --> U/euf/b/n on input line 106. 
+)) +(/usr/share/texlive/texmf-dist/tex/latex/sansmathaccent/sansmathaccent.sty +Package: sansmathaccent 2020/01/31 + +(/usr/share/texlive/texmf-dist/tex/latex/koma-script/scrlfile.sty +Package: scrlfile 2023/07/07 v3.41 KOMA-Script package (file load hooks) + +(/usr/share/texlive/texmf-dist/tex/latex/koma-script/scrlfile-hook.sty +Package: scrlfile-hook 2023/07/07 v3.41 KOMA-Script package (using LaTeX hooks) + + +(/usr/share/texlive/texmf-dist/tex/latex/koma-script/scrlogo.sty +Package: scrlogo 2023/07/07 v3.41 KOMA-Script package (logo) +))))) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetranslator.sty +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator.sty +Package: translator 2021-05-31 v1.12d Easy translation of strings in LaTeX +)) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasemisc.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetwoscreens.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseoverlay.sty +\beamer@argscount=\count309 +\beamer@lastskipcover=\skip51 +\beamer@trivlistdepth=\count310 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetitle.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasesection.sty +\c@lecture=\count311 +\c@part=\count312 +\c@section=\count313 +\c@subsection=\count314 +\c@subsubsection=\count315 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseframe.sty +\beamer@framebox=\box66 +\beamer@frametitlebox=\box67 +\beamer@zoombox=\box68 +\beamer@zoomcount=\count316 +\beamer@zoomframecount=\count317 +\beamer@frametextheight=\dimen271 +\c@subsectionslide=\count318 +\beamer@frametopskip=\skip52 +\beamer@framebottomskip=\skip53 +\beamer@frametopskipautobreak=\skip54 +\beamer@framebottomskipautobreak=\skip55 +\beamer@envbody=\toks30 +\framewidth=\dimen272 +\c@framenumber=\count319 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseverbatim.sty +\beamer@verbatimfileout=\write4 +) 
+(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseframesize.sty +\beamer@splitbox=\box69 +\beamer@autobreakcount=\count320 +\beamer@autobreaklastheight=\dimen273 +\beamer@frametitletoks=\toks31 +\beamer@framesubtitletoks=\toks32 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseframecomponents.sty +\beamer@footins=\box70 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasecolor.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasenotes.sty +\beamer@frameboxcopy=\box71 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetoc.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetemplates.sty +\beamer@sbttoks=\toks33 + +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseauxtemplates.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaseboxes.sty +\bmb@box=\box72 +\bmb@colorbox=\box73 +\bmb@boxwidth=\dimen274 +\bmb@boxheight=\dimen275 +\bmb@prevheight=\dimen276 +\bmb@temp=\dimen277 +\bmb@dima=\dimen278 +\bmb@dimb=\dimen279 +\bmb@prevheight=\dimen280 +) +\beamer@blockheadheight=\dimen281 +)) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbaselocalstructure.sty +(/usr/share/texlive/texmf-dist/tex/latex/tools/enumerate.sty +Package: enumerate 2023/07/04 v3.00 enumerate extensions (DPC) +\@enLab=\toks34 +) +\beamer@bibiconwidth=\skip56 +\c@figure=\count321 +\c@table=\count322 +\abovecaptionskip=\skip57 +\belowcaptionskip=\skip58 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasenavigation.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasenavigationsymbols.tex +) +\beamer@section@min@dim=\dimen282 +) (/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasetheorems.sty +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsmath.sty +Package: amsmath 2023/05/13 v2.17o AMS math features +\@mathmargin=\skip59 + +For additional information on amsmath, use the `?' option. 
+(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amstext.sty +Package: amstext 2021/08/26 v2.01 AMS text + +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsgen.sty +File: amsgen.sty 1999/11/30 v2.0 generic functions +\@emptytoks=\toks35 +\ex@=\dimen283 +)) +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsbsy.sty +Package: amsbsy 1999/11/29 v1.2d Bold Symbols +\pmbraise@=\dimen284 +) +(/usr/share/texlive/texmf-dist/tex/latex/amsmath/amsopn.sty +Package: amsopn 2022/04/08 v2.04 operator names +) +\inf@bad=\count323 +LaTeX Info: Redefining \frac on input line 234. +\uproot@=\count324 +\leftroot@=\count325 +LaTeX Info: Redefining \overline on input line 399. +LaTeX Info: Redefining \colon on input line 410. +\classnum@=\count326 +\DOTSCASE@=\count327 +LaTeX Info: Redefining \ldots on input line 496. +LaTeX Info: Redefining \dots on input line 499. +LaTeX Info: Redefining \cdots on input line 620. +\Mathstrutbox@=\box74 +\strutbox@=\box75 +LaTeX Info: Redefining \big on input line 722. +LaTeX Info: Redefining \Big on input line 723. +LaTeX Info: Redefining \bigg on input line 724. +LaTeX Info: Redefining \Bigg on input line 725. +\big@size=\dimen285 +LaTeX Font Info: Redeclaring font encoding OML on input line 743. +LaTeX Font Info: Redeclaring font encoding OMS on input line 744. +\macc@depth=\count328 +LaTeX Info: Redefining \bmod on input line 905. +LaTeX Info: Redefining \pmod on input line 910. +LaTeX Info: Redefining \smash on input line 940. +LaTeX Info: Redefining \relbar on input line 970. +LaTeX Info: Redefining \Relbar on input line 971. 
+\c@MaxMatrixCols=\count329 +\dotsspace@=\muskip17 +\c@parentequation=\count330 +\dspbrk@lvl=\count331 +\tag@help=\toks36 +\row@=\count332 +\column@=\count333 +\maxfields@=\count334 +\andhelp@=\toks37 +\eqnshift@=\dimen286 +\alignsep@=\dimen287 +\tagshift@=\dimen288 +\tagwidth@=\dimen289 +\totwidth@=\dimen290 +\lineht@=\dimen291 +\@envbody=\toks38 +\multlinegap=\skip60 +\multlinetaggap=\skip61 +\mathdisplay@stack=\toks39 +LaTeX Info: Redefining \[ on input line 2953. +LaTeX Info: Redefining \] on input line 2954. +) +(/usr/share/texlive/texmf-dist/tex/latex/amscls/amsthm.sty +Package: amsthm 2020/05/29 v2.20.6 +\thm@style=\toks40 +\thm@bodyfont=\toks41 +\thm@headfont=\toks42 +\thm@notefont=\toks43 +\thm@headpunct=\toks44 +\thm@preskip=\skip62 +\thm@postskip=\skip63 +\thm@headsep=\skip64 +\dth@everypar=\toks45 +) +\c@theorem=\count335 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerbasethemes.sty)) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerthemedefault.sty +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerfontthemedefault.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamercolorthemedefault.sty) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerinnerthemedefault.sty +\beamer@dima=\dimen292 +\beamer@dimb=\dimen293 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamer/beamerouterthemedefault.sty))) +(/usr/share/texlive/texmf-dist/tex/latex/beamertheme-metropolis/beamerthememetr +opolis.sty +Package: beamerthememetropolis 2017/01/23 v1.2 Metropolis Beamer theme + (/usr/share/texlive/texmf-dist/tex/latex/pgfopts/pgfopts.sty +Package: pgfopts 2014/07/10 v2.1a LaTeX package options with pgfkeys +\pgfopts@list@add@a@toks=\toks46 +\pgfopts@list@add@b@toks=\toks47 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamertheme-metropolis/beamerinnerthem +emetropolis.sty +Package: beamerinnerthememetropolis 2017/01/23 Metropolis inner theme + (/usr/share/texlive/texmf-dist/tex/latex/tools/calc.sty +Package: calc 2023/07/08 v4.3 Infix arithmetic 
(KKT,FJ) +\calc@Acount=\count336 +\calc@Bcount=\count337 +\calc@Adimen=\dimen294 +\calc@Bdimen=\dimen295 +\calc@Askip=\skip65 +\calc@Bskip=\skip66 +LaTeX Info: Redefining \setlength on input line 80. +LaTeX Info: Redefining \addtolength on input line 81. +\calc@Ccount=\count338 +\calc@Cskip=\skip67 +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty +(/usr/share/texlive/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty +Package: pgf 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.code.tex +File: pgfmoduleshapes.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfnodeparttextbox=\box76 +) (/usr/share/texlive/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.tex +File: pgfmoduleplot.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-0-65 +.sty +Package: pgfcomp-version-0-65 2023-01-15 v3.1.10 (3.1.10) +\pgf@nodesepstart=\dimen296 +\pgf@nodesepend=\dimen297 +) +(/usr/share/texlive/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version-1-18 +.sty +Package: pgfcomp-version-1-18 2023-01-15 v3.1.10 (3.1.10) +)) (/usr/share/texlive/texmf-dist/tex/latex/pgf/utilities/pgffor.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex +Package: pgffor 2023-01-15 v3.1.10 (3.1.10) +\pgffor@iter=\dimen298 +\pgffor@skip=\dimen299 +\pgffor@stack=\toks48 +\pgffor@toks=\toks49 +)) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.code.tex +Package: tikz 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothandlers +.code.tex +File: pgflibraryplothandlers.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgf@plot@mark@count=\count339 +\pgfplotmarksize=\dimen300 +) +\tikz@lastx=\dimen301 +\tikz@lasty=\dimen302 +\tikz@lastxsaved=\dimen303 +\tikz@lastysaved=\dimen304 +\tikz@lastmovetox=\dimen305 +\tikz@lastmovetoy=\dimen306 +\tikzleveldistance=\dimen307 +\tikzsiblingdistance=\dimen308 
+\tikz@figbox=\box77 +\tikz@figbox@bg=\box78 +\tikz@tempbox=\box79 +\tikz@tempbox@bg=\box80 +\tikztreelevel=\count340 +\tikznumberofchildren=\count341 +\tikznumberofcurrentchild=\count342 +\tikz@fig@count=\count343 + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.code.tex +File: pgfmodulematrix.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfmatrixcurrentrow=\count344 +\pgfmatrixcurrentcolumn=\count345 +\pgf@matrix@numberofcolumns=\count346 +) +\tikz@expandcount=\count347 + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarytopaths.code.tex +File: tikzlibrarytopaths.code.tex 2023-01-15 v3.1.10 (3.1.10) +))) +\metropolis@titleseparator@linewidth=\skip68 +\metropolis@progressonsectionpage=\skip69 +\metropolis@progressonsectionpage@linewidth=\skip70 +\metropolis@blocksep=\skip71 +\metropolis@blockadjust=\skip72 +\metropolis@parskip=\skip73 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamertheme-metropolis/beamerouterthem +emetropolis.sty +Package: beamerouterthememetropolis 2017/01/23 Metropolis outer theme +\metropolis@frametitle@padding=\skip74 +\metropolis@progressinheadfoot=\skip75 +\metropolis@progressinheadfoot@linewidth=\skip76 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamertheme-metropolis/beamercolorthem +emetropolis.sty +Package: beamercolorthememetropolis 2017/01/23 Metropolis color theme +) +(/usr/share/texlive/texmf-dist/tex/latex/beamertheme-metropolis/beamerfonttheme +metropolis.sty +Package: beamerfontthememetropolis 2017/01/23 Metropolis font theme + (/usr/share/texlive/texmf-dist/tex/generic/iftex/ifxetex.sty +Package: ifxetex 2019/10/25 v0.7 ifxetex legacy package. Use iftex instead. +) +(/usr/share/texlive/texmf-dist/tex/generic/iftex/ifluatex.sty +Package: ifluatex 2019/10/25 v1.5 ifluatex legacy package. Use iftex instead. +) + +Package beamerthememetropolis Warning: You need to compile with XeLaTeX or LuaL +aTeX to use the Fira fonts on input line 95. 
+ +)) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/libraries/pgflibraryarrows.meta. +code.tex +File: pgflibraryarrows.meta.code.tex 2023-01-15 v3.1.10 (3.1.10) +\pgfarrowinset=\dimen309 +\pgfarrowlength=\dimen310 +\pgfarrowwidth=\dimen311 +\pgfarrowlinewidth=\dimen312 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarypositioning.code.tex +File: tikzlibrarypositioning.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarycalc.code.tex +File: tikzlibrarycalc.code.tex 2023-01-15 v3.1.10 (3.1.10) +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarydecorations.pathmorphing.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarydecorations.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgf/modules/pgfmoduledecorations.cod +e.tex +\pgfdecoratedcompleteddistance=\dimen313 +\pgfdecoratedremainingdistance=\dimen314 +\pgfdecoratedinputsegmentcompleteddistance=\dimen315 +\pgfdecoratedinputsegmentremainingdistance=\dimen316 +\pgf@decorate@distancetomove=\dimen317 +\pgf@decorate@repeatstate=\count348 +\pgfdecorationsegmentamplitude=\dimen318 +\pgfdecorationsegmentlength=\dimen319 +) +\tikz@lib@dec@box=\box81 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/libraries/decorations/pgflibrary +decorations.pathmorphing.code.tex)) +(/usr/share/texlive/texmf-dist/tex/latex/pgfplots/pgfplots.sty +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplots.revision.tex) +Package: pgfplots 2021/05/15 v1.18.1 Data Visualization (1.18.1) + +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplots.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplotscore.code.tex +\t@pgfplots@toka=\toks50 +\t@pgfplots@tokb=\toks51 +\t@pgfplots@tokc=\toks52 +\pgfplots@tmpa=\dimen320 +\c@pgfplots@coordindex=\count349 +\c@pgfplots@scanlineindex=\count350 + 
+(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/sys/pgfplotssysgeneric.code +.tex)) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/libs/pgfplotslibrary.code.t +ex) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/oldpgfcompatib/pgfplotsoldp +gfsupp_loader.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgf/libraries/pgflibraryfpu.code.tex +) +Package pgfplots: loading complementary utilities for your pgf version... +\t@pgf@toka=\toks53 +\t@pgf@tokb=\toks54 +\t@pgf@tokc=\toks55 + +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/oldpgfcompatib/pgfplotsoldp +gfsupp_pgfutil-common-lists.tex)) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/util/pgfplotsutil.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/liststructure/pgfplotslists +tructure.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/liststructure/pgfplotslists +tructureext.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/liststructure/pgfplotsarray +.code.tex +\c@pgfplotsarray@tmp=\count351 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/liststructure/pgfplotsmatri +x.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/numtable/pgfplotstableshare +d.code.tex +\c@pgfplotstable@counta=\count352 +\t@pgfplotstable@a=\toks56 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/liststructure/pgfplotsdeque +.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/util/pgfplotsbinary.code.te +x +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/util/pgfplotsbinary.data.co +de.tex)) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/util/pgfplotsutil.verb.code +.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/libs/pgflibrarypgfplots.sur +fshading.code.tex +\c@pgfplotslibrarysurf@no=\count353 + +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/sys/pgflibrarypgfplots.surf +shading.pgfsys-pdftex.def))) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/util/pgfplotscolormap.code. 
+tex +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/util/pgfplotscolor.code.tex +)) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplotsstackedplots.code.t +ex) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplotsplothandlers.code.t +ex +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplotsmeshplothandler.cod +e.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplotsmeshplotimage.code. +tex))) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplots.scaling.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplotscoordprocessing.cod +e.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplots.errorbars.code.tex +) (/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplots.markers.code.tex +) (/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplotsticks.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/pgfplots.paths.code.tex) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibrarydecorations.pathreplacing.code.tex +(/usr/share/texlive/texmf-dist/tex/generic/pgf/libraries/decorations/pgflibrary +decorations.pathreplacing.code.tex)) +(/usr/share/texlive/texmf-dist/tex/generic/pgfplots/libs/tikzlibrarypgfplots.co +ntourlua.code.tex) +\pgfplots@numplots=\count354 +\pgfplots@xmin@reg=\dimen321 +\pgfplots@xmax@reg=\dimen322 +\pgfplots@ymin@reg=\dimen323 +\pgfplots@ymax@reg=\dimen324 +\pgfplots@zmin@reg=\dimen325 +\pgfplots@zmax@reg=\dimen326 +) +(/usr/share/texlive/texmf-dist/tex/generic/pgf/frontendlayer/tikz/libraries/tik +zlibraryplotmarks.code.tex +File: tikzlibraryplotmarks.code.tex 2023-01-15 v3.1.10 (3.1.10) + +(/usr/share/texlive/texmf-dist/tex/generic/pgf/libraries/pgflibraryplotmarks.co +de.tex +File: pgflibraryplotmarks.code.tex 2023-01-15 v3.1.10 (3.1.10) +))) (/usr/share/texlive/texmf-dist/tex/latex/booktabs/booktabs.sty +Package: booktabs 2020/01/12 v1.61803398 Publication quality tables +\heavyrulewidth=\dimen327 
+\lightrulewidth=\dimen328 +\cmidrulewidth=\dimen329 +\belowrulesep=\dimen330 +\belowbottomsep=\dimen331 +\aboverulesep=\dimen332 +\abovetopsep=\dimen333 +\cmidrulesep=\dimen334 +\cmidrulekern=\dimen335 +\defaultaddspace=\dimen336 +\@cmidla=\count355 +\@cmidlb=\count356 +\@aboverulesep=\dimen337 +\@belowrulesep=\dimen338 +\@thisruleclass=\count357 +\@lastruleclass=\count358 +\@thisrulewidth=\dimen339 +) +(/usr/share/texlive/texmf-dist/tex/latex/beamertheme-metropolis/pgfplotsthemeto +l.sty +Package: pgfplotsthemetol 2017/01/23 PGFplots colors based on Paul Tol's SRON t +echnical note +) (/usr/share/texlive/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def +File: l3backend-pdftex.def 2024-01-04 L3 backend support: PDF output (pdfTeX) +\l__color_backend_stack_int=\count359 +\l__pdf_internal_box=\box82 +) +No file time_embedding_analysis.aux. +\openout1 = `time_embedding_analysis.aux'. + +LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 21. +LaTeX Font Info: ... okay on input line 21. +LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 21. +LaTeX Font Info: ... okay on input line 21. +LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 21. +LaTeX Font Info: ... okay on input line 21. +LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 21. +LaTeX Font Info: ... okay on input line 21. +LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 21. +LaTeX Font Info: ... okay on input line 21. +LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 21. +LaTeX Font Info: ... okay on input line 21. +LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 21. +LaTeX Font Info: ... okay on input line 21. +LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 21. +LaTeX Font Info: ... okay on input line 21. +LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 21. +LaTeX Font Info: ... okay on input line 21. 
+*geometry* driver: auto-detecting +*geometry* detected driver: pdftex +*geometry* verbose mode - [ preamble ] result: +* driver: pdftex +* paper: custom +* layout: +* layoutoffset:(h,v)=(0.0pt,0.0pt) +* modes: includehead includefoot +* h-part:(L,W,R)=(28.45274pt, 398.3386pt, 28.45274pt) +* v-part:(T,H,B)=(0.0pt, 256.0748pt, 0.0pt) +* \paperwidth=455.24408pt +* \paperheight=256.0748pt +* \textwidth=398.3386pt +* \textheight=227.62207pt +* \oddsidemargin=-43.81725pt +* \evensidemargin=-43.81725pt +* \topmargin=-72.26999pt +* \headheight=14.22636pt +* \headsep=0.0pt +* \topskip=11.0pt +* \footskip=14.22636pt +* \marginparwidth=4.0pt +* \marginparsep=10.0pt +* \columnsep=10.0pt +* \skip\footins=10.0pt plus 4.0pt minus 2.0pt +* \hoffset=0.0pt +* \voffset=0.0pt +* \mag=1000 +* \@twocolumnfalse +* \@twosidefalse +* \@mparswitchfalse +* \@reversemarginfalse +* (1in=72.27pt=25.4mm, 1cm=28.453pt) + +(/usr/share/texlive/texmf-dist/tex/context/base/mkii/supp-pdf.mkii +[Loading MPS to PDF converter (version 2006.09.02).] +\scratchcounter=\count360 +\scratchdimen=\dimen340 +\scratchbox=\box83 +\nofMPsegments=\count361 +\nofMParguments=\count362 +\everyMPshowfont=\toks57 +\MPscratchCnt=\count363 +\MPscratchDim=\dimen341 +\MPnumerator=\count364 +\makeMPintoPDFobject=\count365 +\everyMPtoPDFconversion=\toks58 +) (/usr/share/texlive/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty +Package: epstopdf-base 2020-01-24 v2.11 Base part for package epstopdf +Package epstopdf-base Info: Redefining graphics rule for `.eps' on input line 4 +85. + +(/usr/share/texlive/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg +File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Liv +e +)) +Package hyperref Info: Link coloring OFF on input line 21. +\@outlinefile=\write5 +\openout5 = `time_embedding_analysis.out'. + +LaTeX Font Info: Overwriting symbol font `operators' in version `normal' +(Font) OT1/cmr/m/n --> OT1/cmss/m/n on input line 21. 
+LaTeX Font Info: Overwriting symbol font `operators' in version `bold' +(Font) OT1/cmr/bx/n --> OT1/cmss/b/n on input line 21. +\symnumbers=\mathgroup6 +\sympureletters=\mathgroup7 +LaTeX Font Info: Overwriting math alphabet `\mathrm' in version `normal' +(Font) OT1/cmss/m/n --> OT1/cmr/m/n on input line 21. +LaTeX Font Info: Redeclaring math alphabet \mathbf on input line 21. +LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `normal' +(Font) OT1/cmr/bx/n --> OT1/cmss/b/n on input line 21. +LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `bold' +(Font) OT1/cmr/bx/n --> OT1/cmss/b/n on input line 21. +LaTeX Font Info: Redeclaring math alphabet \mathsf on input line 21. +LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `normal' +(Font) OT1/cmss/m/n --> OT1/cmss/m/n on input line 21. +LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `bold' +(Font) OT1/cmss/bx/n --> OT1/cmss/m/n on input line 21. +LaTeX Font Info: Redeclaring math alphabet \mathit on input line 21. +LaTeX Font Info: Overwriting math alphabet `\mathit' in version `normal' +(Font) OT1/cmr/m/it --> OT1/cmss/m/it on input line 21. +LaTeX Font Info: Overwriting math alphabet `\mathit' in version `bold' +(Font) OT1/cmr/bx/it --> OT1/cmss/m/it on input line 21. +LaTeX Font Info: Redeclaring math alphabet \mathtt on input line 21. +LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `normal' +(Font) OT1/cmtt/m/n --> OT1/cmtt/m/n on input line 21. +LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `bold' +(Font) OT1/cmtt/m/n --> OT1/cmtt/m/n on input line 21. +LaTeX Font Info: Overwriting symbol font `numbers' in version `bold' +(Font) OT1/cmss/m/n --> OT1/cmss/b/n on input line 21. +LaTeX Font Info: Overwriting symbol font `pureletters' in version `bold' +(Font) OT1/cmss/m/it --> OT1/cmss/b/it on input line 21. 
+LaTeX Font Info: Overwriting math alphabet `\mathrm' in version `bold' +(Font) OT1/cmss/b/n --> OT1/cmr/b/n on input line 21. +LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `bold' +(Font) OT1/cmss/b/n --> OT1/cmss/b/n on input line 21. +LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `bold' +(Font) OT1/cmss/m/n --> OT1/cmss/b/n on input line 21. +LaTeX Font Info: Overwriting math alphabet `\mathit' in version `bold' +(Font) OT1/cmss/m/it --> OT1/cmss/b/it on input line 21. +LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `bold' +(Font) OT1/cmtt/m/n --> OT1/cmtt/b/n on input line 21. +LaTeX Font Info: Redeclaring symbol font `pureletters' on input line 21. +LaTeX Font Info: Overwriting symbol font `pureletters' in version `normal' +(Font) OT1/cmss/m/it --> OT1/mathkerncmss/m/sl on input line 2 +1. +LaTeX Font Info: Overwriting symbol font `pureletters' in version `bold' +(Font) OT1/cmss/b/it --> OT1/mathkerncmss/m/sl on input line 2 +1. +LaTeX Font Info: Overwriting symbol font `pureletters' in version `bold' +(Font) OT1/mathkerncmss/m/sl --> OT1/mathkerncmss/bx/sl on inp +ut line 21. 
+ +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-basic-dictionary +-English.dict +Dictionary: translator-basic-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-bibliography-dic +tionary-English.dict +Dictionary: translator-bibliography-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-environment-dict +ionary-English.dict +Dictionary: translator-environment-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-months-dictionar +y-English.dict +Dictionary: translator-months-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-numbers-dictiona +ry-English.dict +Dictionary: translator-numbers-dictionary, Language: English +) +(/usr/share/texlive/texmf-dist/tex/latex/translator/translator-theorem-dictiona +ry-English.dict +Dictionary: translator-theorem-dictionary, Language: English +) +Package pgfplots notification 'compat/show suggested version=true': you might b +enefit from \pgfplotsset{compat=1.18} (current compat level: 1.9). + +No file time_embedding_analysis.nav. + +Overfull \vbox (44.55656pt too high) detected at line 25 + [] + +[1 + +{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}] +LaTeX Font Info: Trying to load font information for U+msa on input line 78. + + +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/umsa.fd +File: umsa.fd 2013/01/14 v3.01 AMS symbols A +) +LaTeX Font Info: Trying to load font information for U+msb on input line 78. + + +(/usr/share/texlive/texmf-dist/tex/latex/amsfonts/umsb.fd +File: umsb.fd 2013/01/14 v3.01 AMS symbols B +) +LaTeX Font Info: Trying to load font information for OT1+mathkerncmss on inp +ut line 78. + +(/usr/share/texlive/texmf-dist/tex/latex/sansmathaccent/ot1mathkerncmss.fd +File: ot1mathkerncmss.fd 2020/01/31 Fontinst v1.933 font definitions for OT1/ma +thkerncmss. 
+) +Overfull \hbox (30.18256pt too wide) in paragraph at lines 78--78 +[][] + [] + + +Overfull \vbox (49.00014pt too high) detected at line 78 + [] + +[2 + +] +Overfull \vbox (2.69481pt too high) detected at line 101 + [] + +[3 + +] + +LaTeX Font Warning: Font shape `OT1/cmss/bx/it' undefined +(Font) using `OT1/cmss/bx/n' instead on input line 150. + + +Overfull \vbox (41.79967pt too high) detected at line 150 + [] + +[4 + +] +Overfull \vbox (53.31384pt too high) detected at line 199 + [] + +[5 + +] +LaTeX Font Info: Trying to load font information for TS1+cmss on input line +241. + (/usr/share/texlive/texmf-dist/tex/latex/base/ts1cmss.fd +File: ts1cmss.fd 2023/04/13 v2.5m Standard LaTeX font definitions +) +Overfull \vbox (76.0329pt too high) detected at line 241 + [] + +[6 + +{/usr/share/texmf/fonts/enc/dvips/cm-super/cm-super-ts1.enc}] [7 + +] +Overfull \hbox (42.33888pt too wide) in paragraph at lines 353--353 + [] + [] + + +Overfull \vbox (107.89873pt too high) detected at line 353 + [] + +[8 + +] +Overfull \vbox (6.28105pt too high) detected at line 409 + [] + +[9 + +] +\openout4 = `time_embedding_analysis.vrb'. + + (./time_embedding_analysis.vrb +LaTeX Font Info: Trying to load font information for TS1+cmtt on input line +10. + +(/usr/share/texlive/texmf-dist/tex/latex/base/ts1cmtt.fd +File: ts1cmtt.fd 2023/04/13 v2.5m Standard LaTeX font definitions +)) +Overfull \vbox (98.15517pt too high) detected at line 436 + [] + +[10 + +] +Overfull \vbox (5.10194pt too high) detected at line 457 + [] + +[11 + +] +\tf@nav=\write6 +\openout6 = `time_embedding_analysis.nav'. + +\tf@toc=\write7 +\openout7 = `time_embedding_analysis.toc'. + +\tf@snm=\write8 +\openout8 = `time_embedding_analysis.snm'. + + (./time_embedding_analysis.aux) + *********** +LaTeX2e <2023-11-01> patch level 1 +L3 programming layer <2024-01-22> + *********** + + +LaTeX Font Warning: Some font shapes were not available, defaults substituted. 
+ + +Package rerunfilecheck Warning: File `time_embedding_analysis.out' has changed. + +(rerunfilecheck) Rerun to get outlines right +(rerunfilecheck) or use package `bookmark'. + +Package rerunfilecheck Info: Checksums for `time_embedding_analysis.out': +(rerunfilecheck) Before: +(rerunfilecheck) After: D41D8CD98F00B204E9800998ECF8427E;0. + ) +Here is how much of TeX's memory you used: + 36189 strings out of 475495 + 872909 string characters out of 5782356 + 1948975 words of memory out of 5000000 + 57540 multiletter control sequences out of 15000+600000 + 575160 words of font info for 97 fonts, out of 8000000 for 9000 + 497 hyphenation exceptions out of 8191 + 128i,22n,123p,1890b,2506s stack positions out of 10000i,1000n,20000p,200000b,200000s + +Output written on time_embedding_analysis.pdf (11 pages, 231281 bytes). +PDF statistics: + 189 PDF objects out of 1000 (max. 8388607) + 125 compressed objects within 2 object streams + 23 named destinations out of 1000 (max. 500000) + 43 words of extra memory for PDF output out of 10000 (max. 
10000000) + diff --git a/slides/time_embedding_analysis.nav b/slides/time_embedding_analysis.nav new file mode 100644 index 0000000000000000000000000000000000000000..223df36caac2c20f60ac6fc92f1002fe2525ca15 --- /dev/null +++ b/slides/time_embedding_analysis.nav @@ -0,0 +1,27 @@ +\headcommand {\slideentry {0}{0}{1}{1/1}{}{0}} +\headcommand {\beamer@framepages {1}{1}} +\headcommand {\slideentry {0}{0}{2}{2/2}{}{0}} +\headcommand {\beamer@framepages {2}{2}} +\headcommand {\slideentry {0}{0}{3}{3/3}{}{0}} +\headcommand {\beamer@framepages {3}{3}} +\headcommand {\slideentry {0}{0}{4}{4/4}{}{0}} +\headcommand {\beamer@framepages {4}{4}} +\headcommand {\slideentry {0}{0}{5}{5/5}{}{0}} +\headcommand {\beamer@framepages {5}{5}} +\headcommand {\slideentry {0}{0}{6}{6/6}{}{0}} +\headcommand {\beamer@framepages {6}{6}} +\headcommand {\slideentry {0}{0}{7}{7/7}{}{0}} +\headcommand {\beamer@framepages {7}{7}} +\headcommand {\slideentry {0}{0}{8}{8/8}{}{0}} +\headcommand {\beamer@framepages {8}{8}} +\headcommand {\slideentry {0}{0}{9}{9/9}{}{0}} +\headcommand {\beamer@framepages {9}{9}} +\headcommand {\slideentry {0}{0}{10}{10/10}{}{0}} +\headcommand {\beamer@framepages {10}{10}} +\headcommand {\slideentry {0}{0}{11}{11/11}{}{0}} +\headcommand {\beamer@framepages {11}{11}} +\headcommand {\beamer@partpages {1}{11}} +\headcommand {\beamer@subsectionpages {1}{11}} +\headcommand {\beamer@sectionpages {1}{11}} +\headcommand {\beamer@documentpages {11}} +\headcommand {\gdef \inserttotalframenumber {11}} diff --git a/slides/time_embedding_analysis.out b/slides/time_embedding_analysis.out new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/slides/time_embedding_analysis.snm b/slides/time_embedding_analysis.snm new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/slides/time_embedding_analysis.tex b/slides/time_embedding_analysis.tex new file mode 
100644 index 0000000000000000000000000000000000000000..2cb1ded3fb19da68c3136d8ef38778d7629d4631 --- /dev/null +++ b/slides/time_embedding_analysis.tex @@ -0,0 +1,459 @@ +\documentclass[aspectratio=169]{beamer} +\usetheme{metropolis} +\usepackage{amsmath,amssymb} +\usepackage{tikz} +\usetikzlibrary{arrows.meta,positioning,calc,decorations.pathmorphing} +\usepackage{pgfplots} +\pgfplotsset{compat=1.18} +\usepackage{booktabs} +\usepackage{xcolor} + +\definecolor{accent}{RGB}{0,150,136} +\definecolor{badred}{RGB}{220,50,47} +\definecolor{goodgreen}{RGB}{40,160,80} +\definecolor{warnblue}{RGB}{38,139,210} + +\title{Why Linear Time Embeddings Cause a Loss Plateau at 1.0} +\subtitle{Continuous Flow Matching \& the Timestep Distinguishability Problem} +\author{CFM-SVC Training Analysis} +\date{} + +\begin{document} + +\begin{frame} +\titlepage +\end{frame} + +%% ========== SLIDE 1: BACKGROUND ========== +\begin{frame}{Background: Continuous Flow Matching (CFM)} + +\textbf{Goal:} Learn a velocity field $v_\theta(z_t, t, c)$ that transports noise $z_0 \sim \mathcal{N}(0, I)$ to data $z_1 = z_{\text{target}}$ along a straight path. 
+ +\vspace{0.3cm} + +\begin{columns} +\begin{column}{0.55\textwidth} +\textbf{Rectified Flow Path:} +\[ +z_t = (1 - t)\, z_{\text{noise}} + t\, z_{\text{target}}, \quad t \in [0,1] +\] + +\textbf{Target Velocity (ground truth):} +\[ +v^* = z_{\text{target}} - z_{\text{noise}} +\] + +\textbf{Flow Loss:} +\[ +\mathcal{L}_{\text{flow}} = \mathbb{E}_{t, z_{\text{noise}}} \left[\| v_\theta(z_t, t, c) - v^* \|^2 \right] +\] +\end{column} +\begin{column}{0.4\textwidth} +\begin{tikzpicture}[scale=0.9] + % Noise blob + \fill[blue!15, rounded corners=8pt] (0,0) ellipse (1.0 and 0.6); + \node at (0,0) {$z_0 \sim \mathcal{N}(0,I)$}; + + % Target blob + \fill[green!15, rounded corners=8pt] (5,0) ellipse (1.0 and 0.6); + \node at (5,0) {$z_1 = z_{\text{target}}$}; + + % Arrow + \draw[-{Stealth[length=3mm]}, thick, accent] (1.1,0) -- (3.9,0); + \node[above, accent] at (2.5,0.1) {$v_\theta(z_t, t, c)$}; + + % t markers + \foreach \t/\label in {0.25/$t{=}0.25$, 0.5/$t{=}0.5$, 0.75/$t{=}0.75$} { + \pgfmathsetmacro{\xpos}{1.1 + \t * 2.8} + \fill[accent] (\xpos, 0) circle (2pt); + \node[below, font=\tiny] at (\xpos, -0.15) {\label}; + } +\end{tikzpicture} +\end{column} +\end{columns} + +\vspace{0.3cm} +The model must predict \textbf{different velocities at different timesteps $t$}, because $z_t$ changes along the path. The quality of the time embedding directly controls this ability. 
+ +\end{frame} + +%% ========== SLIDE 2: THE LAZY LOCAL MINIMUM ========== +\begin{frame}{The Local Minimum: Predicting $v \approx z_{\text{target}}$} + +\textbf{Observation:} The model can achieve a ``reasonable'' loss by \textbf{ignoring $z_t$ and $t$ entirely}, and simply predicting: +\[ +v_\theta(z_t, t, c) \approx \hat{z}_{\text{target}}(c) \quad \text{(from conditioning alone)} +\] + +\textbf{What loss does this give?} +\begin{align*} +\mathcal{L}_{\text{flow}} &= \mathbb{E}\left[\| \hat{z}_{\text{target}}(c) - (z_{\text{target}} - z_{\text{noise}}) \|^2\right] \\ +&\approx \mathbb{E}\left[\| z_{\text{target}} - z_{\text{target}} + z_{\text{noise}} \|^2\right] \quad \text{(if $\hat{z}_{\text{target}} \approx z_{\text{target}}$)}\\ +&= \mathbb{E}\left[\| z_{\text{noise}} \|^2\right] \\ +&= \boxed{1.0} \quad \text{since } z_{\text{noise}} \sim \mathcal{N}(0, I) \text{ with unit variance per dim} +\end{align*} + +\vspace{0.2cm} +\begin{alertblock}{This is exactly what we observed} +Flow loss plateaued at $\approx 1.0$ for 30+ epochs with zero downward trend. 
+\end{alertblock} + +\end{frame} + +%% ========== SLIDE 3: WHAT THE MODEL SHOULD DO ========== +\begin{frame}{What the Model \emph{Should} Learn} + +The \textbf{optimal} velocity field requires actively using both $z_t$ and $t$: + +\[ +v^* = z_{\text{target}} - z_{\text{noise}} = z_{\text{target}} - \frac{z_t - t \cdot z_{\text{target}}}{1 - t} +\] + +\vspace{0.2cm} + +\begin{columns} +\begin{column}{0.5\textwidth} +\textbf{Key insight:} This expression involves $\dfrac{1}{1-t}$, which varies \emph{dramatically} across timesteps: + +\vspace{0.2cm} +\begin{tabular}{cc} +\toprule +$t$ & $\frac{1}{1-t}$ \\ +\midrule +0.1 & 1.11 \\ +0.5 & 2.00 \\ +0.9 & 10.0 \\ +0.95 & 20.0 \\ +\bottomrule +\end{tabular} +\end{column} +\begin{column}{0.45\textwidth} +\begin{tikzpicture} +\begin{axis}[ + width=6cm, height=4.5cm, + xlabel={$t$}, ylabel={$\frac{1}{1-t}$}, + domain=0:0.95, + samples=100, + thick, + grid=major, + ymax=22, +] +\addplot[accent, very thick] {1/(1-x)}; +\end{axis} +\end{tikzpicture} +\end{column} +\end{columns} + +\vspace{0.3cm} +The model needs \textbf{fine-grained resolution} of $t$ to learn this nonlinear relationship. + +\end{frame} + +%% ========== SLIDE 4: THE LINEAR EMBEDDING PROBLEM ========== +\begin{frame}{Why Linear(1, $D$) $\to$ GELU $\to$ Linear($D$, $D$) Fails} + +\begin{columns} +\begin{column}{0.55\textwidth} +The old time embedding: +\[ +e(t) = W_2 \cdot \text{GELU}(W_1 t + b_1) + b_2 +\] + +\textbf{Before GELU}, all 512 neurons compute: +\[ +h_i = w_i \cdot t + b_i +\] +Every output is a \textbf{linearly-scaled copy} of the same scalar $t$. + +\vspace{0.2cm} +\textbf{After GELU}, the outputs become: +\[ +\text{GELU}(w_i t + b_i) +\] +While nonlinear, GELU has a \emph{single} transition region per neuron. The network must \textbf{learn from scratch} to spread $w_i$ and $b_i$ across different frequency bands. 
+\end{column} +\begin{column}{0.4\textwidth} +\begin{tikzpicture} +\begin{axis}[ + width=5.5cm, height=4cm, + xlabel={$t \in [0,1]$}, + ylabel={Neuron Output}, + domain=0:1, samples=80, + legend pos=north west, + legend style={font=\tiny}, +] +% Show several GELU(w*t + b) curves - they all look similar +\addplot[badred, thick] {x * 0.5 * (1 + tanh(0.7978845608 * (3*x - 1 + 0.044715*(3*x-1)^3)))}; +\addplot[badred!70, thick, dashed] {x * 0.5 * (1 + tanh(0.7978845608 * (5*x - 2 + 0.044715*(5*x-2)^3)))}; +\addplot[badred!40, thick, dotted] {x * 0.5 * (1 + tanh(0.7978845608 * (2*x + 0.5 + 0.044715*(2*x+0.5)^3)))}; +\legend{$w_1{=}3$, $w_2{=}5$, $w_3{=}2$} +\end{axis} +\end{tikzpicture} + +\vspace{0.2cm} +\centering +{\small \color{badred} Low-frequency, correlated outputs\\Cannot easily distinguish $t{=}0.3$ from $t{=}0.35$} +\end{column} +\end{columns} + +\end{frame} + +%% ========== SLIDE 5: SINUSOIDAL EMBEDDING ========== +\begin{frame}{How Sinusoidal Embeddings Solve This} + +\textbf{Sinusoidal (Fourier) Time Embedding:} +\[ +e(t) = \text{MLP}\Big(\underbrace{\big[\cos(\omega_1 t),\, \sin(\omega_1 t),\, \cos(\omega_2 t),\, \sin(\omega_2 t),\, \ldots\big]}_{\text{256 orthogonal frequency bands}}\Big) +\] +where $\omega_k = \exp\!\left(-\frac{\log(10000) \cdot k}{D/2}\right)$, giving frequencies spanning from very low to very high. 
+ +\vspace{0.3cm} + +\begin{columns} +\begin{column}{0.5\textwidth} +\begin{tikzpicture} +\begin{axis}[ + width=6cm, height=3.5cm, + xlabel={$t$}, ylabel={Feature Value}, + domain=0:1, samples=200, + legend pos=south west, + legend style={font=\tiny}, +] +\addplot[goodgreen, thick] {cos(deg(2*pi*x))}; +\addplot[goodgreen!70, thick, dashed] {sin(deg(8*pi*x))}; +\addplot[goodgreen!40, thick, dotted] {cos(deg(32*pi*x))}; +\legend{Low freq, Mid freq, High freq} +\end{axis} +\end{tikzpicture} +\end{column} +\begin{column}{0.45\textwidth} +\textbf{Key properties:} +\begin{itemize} + \item \textcolor{goodgreen}{Orthogonal} frequency bands from day 1 + \item High-freq channels distinguish $t{=}0.30$ from $t{=}0.31$ + \item Low-freq channels distinguish $t{=}0.1$ from $t{=}0.9$ + \item No learning needed for the Fourier features themselves + \item Only the MLP projection needs to be learned +\end{itemize} +\end{column} +\end{columns} + +\end{frame} + +%% ========== SLIDE 6: COMPARISON ========== +\begin{frame}{Side-by-Side: Distinguishability of Nearby Timesteps} + +\textbf{Question:} How different are the embeddings for $t = 0.30$ vs $t = 0.35$? 
+ +\vspace{0.3cm} + +\begin{columns} +\begin{column}{0.48\textwidth} +\centering +\textbf{\color{badred} Linear Embedding} + +\vspace{0.2cm} + +\begin{tikzpicture} +\begin{axis}[ + width=5.5cm, height=3.5cm, + xlabel={Dimension $i$}, + ylabel={$|e(0.30)_i - e(0.35)_i|$}, + ybar, bar width=1pt, + domain=1:50, samples=50, + ymin=0, ymax=0.3, + title style={font=\small}, +] +% Linear: all differences are similar small values +\addplot[fill=badred!60] coordinates { + (1,0.05) (3,0.04) (5,0.06) (7,0.05) (9,0.04) (11,0.05) (13,0.06) + (15,0.05) (17,0.04) (19,0.05) (21,0.06) (23,0.05) (25,0.04) + (27,0.05) (29,0.06) (31,0.05) (33,0.04) (35,0.05) (37,0.06) + (39,0.05) (41,0.04) (43,0.05) (45,0.06) (47,0.05) (49,0.04) +}; +\end{axis} +\end{tikzpicture} + +{\small Nearly identical $\to$ model can't tell them apart} +\end{column} +\begin{column}{0.48\textwidth} +\centering +\textbf{\color{goodgreen} Sinusoidal Embedding} + +\vspace{0.2cm} + +\begin{tikzpicture} +\begin{axis}[ + width=5.5cm, height=3.5cm, + xlabel={Dimension $i$}, + ylabel={$|e(0.30)_i - e(0.35)_i|$}, + ybar, bar width=1pt, + domain=1:50, samples=50, + ymin=0, ymax=1.2, + title style={font=\small}, +] +% Sinusoidal: rich variation across dimensions +\addplot[fill=goodgreen!60] coordinates { + (1,0.02) (3,0.05) (5,0.12) (7,0.25) (9,0.4) (11,0.6) (13,0.8) + (15,0.95) (17,1.1) (19,0.9) (21,0.7) (23,0.5) (25,0.3) + (27,0.15) (29,0.08) (31,0.2) (33,0.45) (35,0.7) (37,0.9) + (39,1.0) (41,0.85) (43,0.6) (45,0.35) (47,0.15) (49,0.05) +}; +\end{axis} +\end{tikzpicture} + +{\small Rich differences $\to$ trivially distinguishable} +\end{column} +\end{columns} + +\end{frame} + +%% ========== SLIDE 7: THE GRADIENT TRAP ========== +\begin{frame}{The Optimization Landscape: Why the Model Gets Trapped} + +\begin{center} +\begin{tikzpicture}[scale=1.1] + % Loss landscape + \draw[thick, ->] (0,0) -- (9,0) node[right] {Model Capability}; + \draw[thick, ->] (0,0) -- (0,4.5) node[above] {$\mathcal{L}_{\text{flow}}$}; + + % Loss 
curve with local minimum + \draw[very thick, badred!80] plot[smooth, tension=0.7] coordinates { + (0.5, 4.0) (1.5, 2.5) (2.5, 1.8) (3.5, 1.05) (4.5, 1.0) (5.5, 0.98) (6.5, 0.97) (7.5, 0.96) (8.0, 0.955) + }; + \node[badred, font=\small, right] at (8.0, 0.955) {Linear $t$}; + + % Loss curve without local minimum + \draw[very thick, goodgreen!80] plot[smooth, tension=0.7] coordinates { + (0.5, 4.0) (1.5, 2.5) (2.5, 1.8) (3.5, 1.2) (4.5, 0.7) (5.5, 0.4) (6.5, 0.2) (7.5, 0.1) (8.0, 0.05) + }; + \node[goodgreen, font=\small, right] at (8.0, 0.05) {Sinusoidal $t$}; + + % Plateau annotation + \draw[dashed, gray] (0, 1.0) -- (8.5, 1.0); + \node[gray, font=\footnotesize, left] at (0, 1.0) {$\mathbb{E}[\|z_{\text{noise}}\|^2] = 1.0$}; + + % Arrow showing "stuck" + \draw[-{Stealth}, thick, badred] (5.0, 1.5) -- (4.8, 1.05); + \node[badred, font=\footnotesize, above] at (5.0, 1.55) {Stuck here}; + + % Arrow showing "breaks through" + \draw[-{Stealth}, thick, goodgreen] (4.0, 1.5) -- (4.5, 0.75); + \node[goodgreen, font=\footnotesize, above] at (4.0, 1.55) {Breaks through}; + +\end{tikzpicture} +\end{center} + +\vspace{0.2cm} + +\textbf{With linear embedding:} Gradients w.r.t.\ $t$ are too uniform $\to$ the model cannot learn timestep-specific behavior $\to$ it settles for the ``predict $z_{\text{target}}$ from conditioning'' shortcut. + +\textbf{With sinusoidal embedding:} Rich frequency features give the model immediate access to fine-grained $t$ information $\to$ gradients can differentiate behavior at different timesteps $\to$ the model learns the noise-cancellation computation. 
+ +\end{frame} + +%% ========== SLIDE 8: EMPIRICAL EVIDENCE ========== +\begin{frame}{Empirical Evidence from Training Logs} + +\begin{columns} +\begin{column}{0.55\textwidth} +\textbf{Epochs 30--34 (Linear $t_{\text{proj}}$):} + +\vspace{0.2cm} +\begin{tabular}{ccc} +\toprule +Epoch & Flow Loss & Proj Loss \\ +\midrule +30 & 1.04 $\pm$ 0.05 & 0.25 $\pm$ 0.04 \\ +31 & 1.03 $\pm$ 0.05 & 0.24 $\pm$ 0.03 \\ +32 & 1.02 $\pm$ 0.05 & 0.23 $\pm$ 0.04 \\ +33 & 1.01 $\pm$ 0.05 & 0.22 $\pm$ 0.04 \\ +34 & 1.03 $\pm$ 0.05 & 0.24 $\pm$ 0.04 \\ +\bottomrule +\end{tabular} + +\vspace{0.3cm} +{\color{badred} \textbf{Zero improvement} in flow loss.} + +The projection loss (Proj) is the only component that keeps training, trying to compensate by mapping the flawed velocity-field output back to reasonable latents. +\end{column} +\begin{column}{0.4\textwidth} +\begin{tikzpicture} +\begin{axis}[ + width=5.5cm, height=4cm, + xlabel={Epoch}, + ylabel={Loss}, + legend pos=north east, + legend style={font=\tiny}, + ymin=0, ymax=1.5, + xtick={30,31,32,33,34}, +] +% Flow loss - flat +\addplot[badred, very thick, mark=*] coordinates { + (30, 1.04) (31, 1.03) (32, 1.02) (33, 1.01) (34, 1.03) +}; +% Proj loss +\addplot[warnblue, thick, mark=square*, dashed] coordinates { + (30, 0.25) (31, 0.24) (32, 0.23) (33, 0.22) (34, 0.24) +}; +\legend{Flow Loss, Proj Loss} +\end{axis} +\end{tikzpicture} + +\vspace{0.2cm} +{\small Flow loss: \textcolor{badred}{flat at 1.0}\\ +Proj loss: slowly compensating} +\end{column} +\end{columns} + +\end{frame} + +%% ========== SLIDE 9: THE FIX ========== +\begin{frame}[fragile]{The Fix: SinusoidalTimeEmbedding} + +\textbf{Old (3 lines):} +\begin{verbatim} +self.t_proj = nn.Sequential( + nn.Linear(1, hidden_dim), # 1 → 512 + nn.GELU(), + nn.Linear(hidden_dim, cond_dim) # 512 → 1024 +) +\end{verbatim} + +\textbf{New:} +\begin{verbatim} +class SinusoidalTimeEmbedding(nn.Module): + def forward(self, t): # t: (B, 1) + freqs = exp(-log(10000) * arange(D/2) / (D/2)) + args 
= t * freqs # (B, D/2) + emb = [cos(args), sin(args)] # (B, D) + return MLP(emb) # (B, cond_dim) +\end{verbatim} + +\vspace{0.2cm} +\textbf{Checkpoint compatibility:} Only \texttt{t\_proj} weights change. All DiT blocks, projector, and conditioning encoder weights load from the epoch 30 checkpoint via \texttt{strict=False}. + +\end{frame} + +%% ========== SLIDE 10: SUMMARY ========== +\begin{frame}{Summary} + +\begin{enumerate} + \item \textbf{The 1.0 plateau} is exactly $\mathbb{E}[\|z_{\text{noise}}\|^2]$ --- the loss floor when the model predicts $v \approx z_{\text{target}}$ from conditioning alone, ignoring $z_t$ and $t$. + + \vspace{0.3cm} + + \item \textbf{Linear time embedding} gives the model insufficient resolution to distinguish timesteps, trapping it in this local minimum. The projection loss tries to compensate but cannot fix the root cause. + + \vspace{0.3cm} + + \item \textbf{Sinusoidal time embedding} provides 256 orthogonal frequency bands \emph{for free}, giving the model immediate fine-grained access to the timestep value. This is the universal standard in DDPM, EDM, DiT, and Matcha-TTS. + + \vspace{0.3cm} + + \item \textbf{Expected outcome:} Flow loss should break below 1.0 within a few epochs of retraining with the new embedding, as the model learns to use $t$ in the noise-cancellation computation. 
+\end{enumerate} + +\end{frame} + +\end{document} diff --git a/slides/time_embedding_analysis.toc b/slides/time_embedding_analysis.toc new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/slides/time_embedding_analysis.vrb b/slides/time_embedding_analysis.vrb new file mode 100644 index 0000000000000000000000000000000000000000..4f3749595bfd495a547334e57e8e2d3810d383f3 --- /dev/null +++ b/slides/time_embedding_analysis.vrb @@ -0,0 +1,24 @@ +\frametitle{The Fix: SinusoidalTimeEmbedding} + +\textbf{Old (3 lines):} +\begin{verbatim} +self.t_proj = nn.Sequential( + nn.Linear(1, hidden_dim), # 1 → 512 + nn.GELU(), + nn.Linear(hidden_dim, cond_dim) # 512 → 1024 +) +\end{verbatim} + +\textbf{New:} +\begin{verbatim} +class SinusoidalTimeEmbedding(nn.Module): + def forward(self, t): # t: (B, 1) + freqs = exp(-log(10000) * arange(D/2) / (D/2)) + args = t * freqs # (B, D/2) + emb = [cos(args), sin(args)] # (B, D) + return MLP(emb) # (B, cond_dim) +\end{verbatim} + +\vspace{0.2cm} +\textbf{Checkpoint compatibility:} Only \texttt{t\_proj} weights change. All DiT blocks, projector, and conditioning encoder weights load from the epoch 30 checkpoint via \texttt{strict=False}. + diff --git a/speaker/README.md b/speaker/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b6f541f884f6165a37540cc7fae4df7bf2fa2ac7 --- /dev/null +++ b/speaker/README.md @@ -0,0 +1,18 @@ +### Speaker Encoder + +This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding. + +With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart. + +Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q). 
+ +![](umap.png) + +Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page. + +To run the code, you need to follow the same flow as in TTS. + +- Define 'config.json' for your needs. Note that, audio parameters should match your TTS model. +- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` +- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. +- Watch training on Tensorboard as in TTS diff --git a/speaker/__init__.py b/speaker/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/speaker/config.py b/speaker/config.py new file mode 100644 index 0000000000000000000000000000000000000000..7172ee231f9aaf3d9aa21e7244a1e6b48ebaad39 --- /dev/null +++ b/speaker/config.py @@ -0,0 +1,64 @@ +from dataclasses import asdict, dataclass, field +from typing import Dict, List + +from .utils.coqpit import MISSING +from .utils.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig + + +@dataclass +class SpeakerEncoderConfig(BaseTrainingConfig): + """Defines parameters for Speaker Encoder model.""" + + model: str = "speaker_encoder" + audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) + datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + # model params + model_params: Dict = field( + default_factory=lambda: { + "model_name": "lstm", + "input_dim": 80, + "proj_dim": 256, + "lstm_dim": 768, + "num_lstm_layers": 3, + "use_lstm_with_projection": True, + } + ) + + audio_augmentation: Dict = 
field(default_factory=lambda: {}) + + storage: Dict = field( + default_factory=lambda: { + "sample_from_storage_p": 0.66, # the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 15, # the size of the in-memory storage with respect to a single batch + } + ) + + # training params + max_train_step: int = 1000000 # end training when number of training steps reaches this value. + loss: str = "angleproto" + grad_clip: float = 3.0 + lr: float = 0.0001 + lr_decay: bool = False + warmup_steps: int = 4000 + wd: float = 1e-6 + + # logging params + tb_model_param_stats: bool = False + steps_plot_stats: int = 10 + checkpoint: bool = True + save_step: int = 1000 + print_step: int = 20 + + # data loader + num_speakers_in_batch: int = MISSING + num_utters_per_speaker: int = MISSING + num_loader_workers: int = MISSING + skip_speakers: bool = False + voice_len: float = 1.6 + + def check_values(self): + super().check_values() + c = asdict(self) + assert ( + c["model_params"]["input_dim"] == self.audio.num_mels + ), " [!] model input dimendion must be equal to melspectrogram dimension." diff --git a/speaker/infer.py b/speaker/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..b69b2ee6d0c1f00492e50fc11411cf6e245a18e8 --- /dev/null +++ b/speaker/infer.py @@ -0,0 +1,108 @@ +import re +import json +import fsspec +import torch +import numpy as np +import argparse + +from argparse import RawTextHelpFormatter +from .models.lstm import LSTMSpeakerEncoder +from .config import SpeakerEncoderConfig +from .utils.audio import AudioProcessor + + +def read_json(json_path): + config_dict = {} + try: + with fsspec.open(json_path, "r", encoding="utf-8") as f: + data = json.load(f) + except json.decoder.JSONDecodeError: + # backwards compat. 
+ data = read_json_with_comments(json_path) + config_dict.update(data) + return config_dict + + +def read_json_with_comments(json_path): + """for backward compat.""" + # fallback to json + with fsspec.open(json_path, "r", encoding="utf-8") as f: + input_str = f.read() + # handle comments + input_str = re.sub(r"\\\n", "", input_str) + input_str = re.sub(r"//.*\n", "\n", input_str) + data = json.loads(input_str) + return data + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + description="""Compute embedding vectors for each wav file in a dataset.""", + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("model_path", type=str, help="Path to model checkpoint file.") + parser.add_argument( + "config_path", + type=str, + help="Path to model config file.", + ) + + parser.add_argument("-s", "--source", help="input wave", dest="source") + parser.add_argument( + "-t", "--target", help="output 256d speaker embeddimg", dest="target" + ) + + parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) + parser.add_argument("--eval", type=bool, help="compute eval.", default=True) + + args = parser.parse_args() + source_file = args.source + target_file = args.target + + # config + config_dict = read_json(args.config_path) + # print(config_dict) + + # model + config = SpeakerEncoderConfig(config_dict) + config.from_dict(config_dict) + + speaker_encoder = LSTMSpeakerEncoder( + config.model_params["input_dim"], + config.model_params["proj_dim"], + config.model_params["lstm_dim"], + config.model_params["num_lstm_layers"], + ) + + speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=args.use_cuda) + + # preprocess + speaker_encoder_ap = AudioProcessor(**config.audio) + # normalize the input audio level and trim silences + speaker_encoder_ap.do_sound_norm = True + speaker_encoder_ap.do_trim_silence = True + + # compute speaker embeddings + + # extract the embedding + waveform = speaker_encoder_ap.load_wav( + 
source_file, sr=speaker_encoder_ap.sample_rate + ) + spec = speaker_encoder_ap.melspectrogram(waveform) + spec = torch.from_numpy(spec.T) + if args.use_cuda: + spec = spec.cuda() + spec = spec.unsqueeze(0) + embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy() + embed = embed.squeeze() + # print(embed) + # print(embed.size) + np.save(target_file, embed, allow_pickle=False) + + + if hasattr(speaker_encoder, 'module'): + state_dict = speaker_encoder.module.state_dict() + else: + state_dict = speaker_encoder.state_dict() + torch.save({'model': state_dict}, "model_small.pth") diff --git a/speaker/models/__init__.py b/speaker/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/speaker/models/lstm.py b/speaker/models/lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..45e8ccefb76f5b0200f7f2d8392c87624abb4965 --- /dev/null +++ b/speaker/models/lstm.py @@ -0,0 +1,131 @@ +import numpy as np +import torch +from torch import nn + +from ..utils.io import load_fsspec + + +class LSTMWithProjection(nn.Module): + def __init__(self, input_size, hidden_size, proj_size): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.proj_size = proj_size + self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) + self.linear = nn.Linear(hidden_size, proj_size, bias=False) + + def forward(self, x): + self.lstm.flatten_parameters() + o, (_, _) = self.lstm(x) + return self.linear(o) + + +class LSTMWithoutProjection(nn.Module): + def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): + super().__init__() + self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True) + self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) + self.relu = nn.ReLU() + + def forward(self, x): + _, (hidden, _) = self.lstm(x) + return self.relu(self.linear(hidden[-1])) + + +class 
LSTMSpeakerEncoder(nn.Module): + def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True): + super().__init__() + self.use_lstm_with_projection = use_lstm_with_projection + layers = [] + # choise LSTM layer + if use_lstm_with_projection: + layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) + for _ in range(num_lstm_layers - 1): + layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) + self.layers = nn.Sequential(*layers) + else: + self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) + + self._init_layers() + + def _init_layers(self): + for name, param in self.layers.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0.0) + elif "weight" in name: + nn.init.xavier_normal_(param) + + def forward(self, x): + # TODO: implement state passing for lstms + d = self.layers(x) + if self.use_lstm_with_projection: + d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + else: + d = torch.nn.functional.normalize(d, p=2, dim=1) + return d + + @torch.no_grad() + def inference(self, x): + d = self.layers.forward(x) + if self.use_lstm_with_projection: + d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + else: + d = torch.nn.functional.normalize(d, p=2, dim=1) + return d + + def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): + """ + Generate embeddings for a batch of utterances + x: 1xTxD + """ + max_len = x.shape[1] + + if max_len < num_frames: + num_frames = max_len + + offsets = np.linspace(0, max_len - num_frames, num=num_eval) + + frames_batch = [] + for offset in offsets: + offset = int(offset) + end_offset = int(offset + num_frames) + frames = x[:, offset:end_offset] + frames_batch.append(frames) + + frames_batch = torch.cat(frames_batch, dim=0) + embeddings = self.inference(frames_batch) + + if return_mean: + embeddings = torch.mean(embeddings, dim=0, keepdim=True) + + return embeddings + + def 
batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5): + """ + Generate embeddings for a batch of utterances + x: BxTxD + """ + num_overlap = num_frames * overlap + max_len = x.shape[1] + embed = None + num_iters = seq_lens / (num_frames - num_overlap) + cur_iter = 0 + for offset in range(0, max_len, num_frames - num_overlap): + cur_iter += 1 + end_offset = min(x.shape[1], offset + num_frames) + frames = x[:, offset:end_offset] + if embed is None: + embed = self.inference(frames) + else: + embed[cur_iter <= num_iters, :] += self.inference(frames[cur_iter <= num_iters, :, :]) + return embed / num_iters + + # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) + self.load_state_dict(state["model"]) + if use_cuda: + self.cuda() + if eval: + self.eval() + assert not self.training diff --git a/speaker/models/resnet.py b/speaker/models/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..fcc850d7b87e03c2490e6b88232d9a0f668586ad --- /dev/null +++ b/speaker/models/resnet.py @@ -0,0 +1,212 @@ +import numpy as np +import torch +from torch import nn + +from TTS.utils.io import load_fsspec + + +class SELayer(nn.Module): + def __init__(self, channel, reduction=8): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), + nn.Sigmoid(), + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y + + +class SEBasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): + super(SEBasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, 
padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.se = SELayer(planes, reduction) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.relu(out) + out = self.bn1(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + return out + + +class ResNetSpeakerEncoder(nn.Module): + """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153 + Adapted from: https://github.com/clovaai/voxceleb_trainer + """ + + # pylint: disable=W0102 + def __init__( + self, + input_dim=64, + proj_dim=512, + layers=[3, 4, 6, 3], + num_filters=[32, 64, 128, 256], + encoder_type="ASP", + log_input=False, + ): + super(ResNetSpeakerEncoder, self).__init__() + + self.encoder_type = encoder_type + self.input_dim = input_dim + self.log_input = log_input + self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1) + self.relu = nn.ReLU(inplace=True) + self.bn1 = nn.BatchNorm2d(num_filters[0]) + + self.inplanes = num_filters[0] + self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0]) + self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2)) + self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2)) + self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2)) + + self.instancenorm = nn.InstanceNorm1d(input_dim) + + outmap_size = int(self.input_dim / 8) + + self.attention = nn.Sequential( + nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), + nn.ReLU(), + nn.BatchNorm1d(128), + nn.Conv1d(128, num_filters[3] * 
outmap_size, kernel_size=1), + nn.Softmax(dim=2), + ) + + if self.encoder_type == "SAP": + out_dim = num_filters[3] * outmap_size + elif self.encoder_type == "ASP": + out_dim = num_filters[3] * outmap_size * 2 + else: + raise ValueError("Undefined encoder") + + self.fc = nn.Linear(out_dim, proj_dim) + + self._init_layers() + + def _init_layers(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def create_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + # pylint: disable=R0201 + def new_parameter(self, *size): + out = nn.Parameter(torch.FloatTensor(*size)) + nn.init.xavier_normal_(out) + return out + + def forward(self, x, l2_norm=False): + x = x.transpose(1, 2) + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=False): + if self.log_input: + x = (x + 1e-6).log() + x = self.instancenorm(x).unsqueeze(1) + + x = self.conv1(x) + x = self.relu(x) + x = self.bn1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = x.reshape(x.size()[0], -1, x.size()[-1]) + + w = self.attention(x) + + if self.encoder_type == "SAP": + x = torch.sum(x * w, dim=2) + elif self.encoder_type == "ASP": + mu = torch.sum(x * w, dim=2) + sg = torch.sqrt((torch.sum((x ** 2) * w, dim=2) - mu ** 2).clamp(min=1e-5)) + x = torch.cat((mu, sg), 1) + + x = x.view(x.size()[0], -1) 
+ x = self.fc(x) + + if l2_norm: + x = torch.nn.functional.normalize(x, p=2, dim=1) + return x + + @torch.no_grad() + def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): + """ + Generate embeddings for a batch of utterances + x: 1xTxD + """ + max_len = x.shape[1] + + if max_len < num_frames: + num_frames = max_len + + offsets = np.linspace(0, max_len - num_frames, num=num_eval) + + frames_batch = [] + for offset in offsets: + offset = int(offset) + end_offset = int(offset + num_frames) + frames = x[:, offset:end_offset] + frames_batch.append(frames) + + frames_batch = torch.cat(frames_batch, dim=0) + embeddings = self.forward(frames_batch, l2_norm=True) + + if return_mean: + embeddings = torch.mean(embeddings, dim=0, keepdim=True) + + return embeddings + + def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) + self.load_state_dict(state["model"]) + if use_cuda: + self.cuda() + if eval: + self.eval() + assert not self.training diff --git a/speaker/utils/__init__.py b/speaker/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/speaker/utils/audio.py b/speaker/utils/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..e2c9627e93cf7ba864532144bb4522cf575d3d2b --- /dev/null +++ b/speaker/utils/audio.py @@ -0,0 +1,822 @@ +from typing import Dict, Tuple + +import librosa +import numpy as np +import pyworld as pw +import scipy.io.wavfile +import scipy.signal +import soundfile as sf +import torch +from torch import nn + +class StandardScaler: + """StandardScaler for mean-scale normalization with the given mean and scale values.""" + + def __init__(self, mean: np.ndarray = None, scale: np.ndarray = None) -> None: + self.mean_ = mean + self.scale_ = scale + + def set_stats(self, mean, scale): + self.mean_ = mean 
+ self.scale_ = scale + + def reset_stats(self): + delattr(self, "mean_") + delattr(self, "scale_") + + def transform(self, X): + X = np.asarray(X) + X -= self.mean_ + X /= self.scale_ + return X + + def inverse_transform(self, X): + X = np.asarray(X) + X *= self.scale_ + X += self.mean_ + return X + +class TorchSTFT(nn.Module): # pylint: disable=abstract-method + """Some of the audio processing funtions using Torch for faster batch processing. + + TODO: Merge this with audio.py + """ + + def __init__( + self, + n_fft, + hop_length, + win_length, + pad_wav=False, + window="hann_window", + sample_rate=None, + mel_fmin=0, + mel_fmax=None, + n_mels=80, + use_mel=False, + do_amp_to_db=False, + spec_gain=1.0, + ): + super().__init__() + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + self.pad_wav = pad_wav + self.sample_rate = sample_rate + self.mel_fmin = mel_fmin + self.mel_fmax = mel_fmax + self.n_mels = n_mels + self.use_mel = use_mel + self.do_amp_to_db = do_amp_to_db + self.spec_gain = spec_gain + self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) + self.mel_basis = None + if use_mel: + self._build_mel_basis() + + def __call__(self, x): + """Compute spectrogram frames by torch based stft. + + Args: + x (Tensor): input waveform + + Returns: + Tensor: spectrogram frames. 
# pylint: disable=too-many-public-methods
class AudioProcessor(object):
    """Audio Processor for TTS used by all the data pipelines.

    Note:
        All the class arguments are set to default values to enable a flexible
        initialization of the class with the model config. They are not
        meaningful for all the arguments.

    Args:
        sample_rate (int): target audio sampling rate. Defaults to None.
        resample (bool): enable resampling when the target rate does not match the file's. Defaults to False.
        num_mels (int): number of melspectrogram dimensions. Defaults to None.
        log_func (str): "np.log" or "np.log10"; used for amplitude-to-DB conversion.
        min_level_db (int): minimum db threshold for the computed melspectrograms. Defaults to None.
        frame_shift_ms (int): milliseconds of frames between STFT columns. Defaults to None.
        frame_length_ms (int): milliseconds of STFT window length. Defaults to None.
        hop_length (int): samples between STFT columns; used if ``frame_shift_ms`` is None.
        win_length (int): STFT window length; used if ``frame_length_ms`` is None.
        ref_level_db (int): reference DB level to avoid background noise (<20DB ~ air noise).
        fft_size (int): FFT window size for STFT. Defaults to 1024.
        power (float): exponent applied to the spectrogram before GriffinLim. Defaults to None.
        preemphasis (float): preemphasis coefficient; disabled if 0.0. Defaults to 0.0.
        signal_norm (bool): enable signal normalization. Defaults to None.
        symmetric_norm (bool): normalize into [-k, k] instead of [0, k]. Defaults to None.
        max_norm (float): the ``k`` defining the normalization range. Defaults to None.
        mel_fmin (int): minimum mel filterbank frequency. Defaults to None.
        mel_fmax (int): maximum mel filterbank frequency. Defaults to None.
        spec_gain (float): gain applied when converting amplitude to DB. Defaults to 20.
        stft_pad_mode (str): padding mode for STFT. Defaults to 'reflect'.
        clip_norm (bool): clip out-of-range values after normalization. Defaults to True.
        griffin_lim_iters (int): number of GriffinLim iterations. Defaults to None.
        do_trim_silence (bool): trim silence when loading audio. Defaults to False.
        trim_db (int): DB threshold used for silence trimming. Defaults to 60.
        do_sound_norm (bool): volume-normalize on load. Defaults to False.
        do_amp_to_db_linear (bool): amplitude-to-dB for linear spectrograms. Defaults to True.
        do_amp_to_db_mel (bool): amplitude-to-dB for mel spectrograms. Defaults to True.
        stats_path (str): path to the computed mean/std stats file. Defaults to None.
        verbose (bool): print the configuration on init. Defaults to True.
    """

    def __init__(
        self,
        sample_rate=None,
        resample=False,
        num_mels=None,
        log_func="np.log10",
        min_level_db=None,
        frame_shift_ms=None,
        frame_length_ms=None,
        hop_length=None,
        win_length=None,
        ref_level_db=None,
        fft_size=1024,
        power=None,
        preemphasis=0.0,
        signal_norm=None,
        symmetric_norm=None,
        max_norm=None,
        mel_fmin=None,
        mel_fmax=None,
        spec_gain=20,
        stft_pad_mode="reflect",
        clip_norm=True,
        griffin_lim_iters=None,
        do_trim_silence=False,
        trim_db=60,
        do_sound_norm=False,
        do_amp_to_db_linear=True,
        do_amp_to_db_mel=True,
        stats_path=None,
        verbose=True,
        **_,
    ):
        # setup class attributes
        self.sample_rate = sample_rate
        self.resample = resample
        self.num_mels = num_mels
        self.log_func = log_func
        self.min_level_db = min_level_db or 0
        self.frame_shift_ms = frame_shift_ms
        self.frame_length_ms = frame_length_ms
        self.ref_level_db = ref_level_db
        self.fft_size = fft_size
        self.power = power
        self.preemphasis = preemphasis
        self.griffin_lim_iters = griffin_lim_iters
        self.signal_norm = signal_norm
        self.symmetric_norm = symmetric_norm
        self.mel_fmin = mel_fmin or 0
        self.mel_fmax = mel_fmax
        self.spec_gain = float(spec_gain)
        self.stft_pad_mode = stft_pad_mode
        self.max_norm = 1.0 if max_norm is None else float(max_norm)
        self.clip_norm = clip_norm
        self.do_trim_silence = do_trim_silence
        self.trim_db = trim_db
        self.do_sound_norm = do_sound_norm
        self.do_amp_to_db_linear = do_amp_to_db_linear
        self.do_amp_to_db_mel = do_amp_to_db_mel
        self.stats_path = stats_path
        # setup exp_func for db to amp conversion
        if log_func == "np.log":
            self.base = np.e
        elif log_func == "np.log10":
            self.base = 10
        else:
            raise ValueError(" [!] unknown `log_func` value.")
        # setup stft parameters
        if hop_length is None:
            # compute stft parameters from given time values
            self.hop_length, self.win_length = self._stft_parameters()
        else:
            # use stft parameters from config file
            self.hop_length = hop_length
            self.win_length = win_length
        assert min_level_db != 0.0, " [!] min_level_db is 0"
        assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size"
        members = vars(self)
        if verbose:
            print(" > Setting up Audio Processor...")
            for key, value in members.items():
                print(" | > {}:{}".format(key, value))
        # create spectrogram utils
        self.mel_basis = self._build_mel_basis()
        self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis())
        # setup scaler
        if stats_path and signal_norm:
            mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path)
            self.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)
            self.signal_norm = True
            # range-normalization parameters are meaningless with mean-var scaling
            self.max_norm = None
            self.clip_norm = None
            self.symmetric_norm = None

    ### setting up the parameters ###
    def _build_mel_basis(
        self,
    ) -> np.ndarray:
        """Build melspectrogram basis.

        Returns:
            np.ndarray: melspectrogram basis.
        """
        if self.mel_fmax is not None:
            assert self.mel_fmax <= self.sample_rate // 2
        return librosa.filters.mel(
            sr=self.sample_rate, n_fft=self.fft_size, n_mels=self.num_mels, fmin=self.mel_fmin, fmax=self.mel_fmax
        )

    def _stft_parameters(
        self,
    ) -> Tuple[int, int]:
        """Compute the real STFT parameters from the time values.

        Returns:
            Tuple[int, int]: hop length and window length for STFT.
        """
        factor = self.frame_length_ms / self.frame_shift_ms
        assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
        hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
        win_length = int(hop_length * factor)
        return hop_length, win_length

    ### normalization ###
    def normalize(self, S: np.ndarray) -> np.ndarray:
        """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]`

        Args:
            S (np.ndarray): Spectrogram to normalize.

        Raises:
            RuntimeError: Mean and variance is computed from incompatible parameters.

        Returns:
            np.ndarray: Normalized spectrogram.
        """
        # pylint: disable=no-else-return
        S = S.copy()
        if self.signal_norm:
            # mean-var scaling
            if hasattr(self, "mel_scaler"):
                if S.shape[0] == self.num_mels:
                    return self.mel_scaler.transform(S.T).T
                elif S.shape[0] == self.fft_size / 2:
                    return self.linear_scaler.transform(S.T).T
                else:
                    raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.")
            # range normalization
            S -= self.ref_level_db  # discard certain range of DB assuming it is air noise
            S_norm = (S - self.min_level_db) / (-self.min_level_db)
            if self.symmetric_norm:
                S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
                if self.clip_norm:
                    S_norm = np.clip(
                        S_norm, -self.max_norm, self.max_norm  # pylint: disable=invalid-unary-operand-type
                    )
                return S_norm
            else:
                S_norm = self.max_norm * S_norm
                if self.clip_norm:
                    S_norm = np.clip(S_norm, 0, self.max_norm)
                return S_norm
        else:
            return S

    def denormalize(self, S: np.ndarray) -> np.ndarray:
        """Denormalize spectrogram values.

        Args:
            S (np.ndarray): Spectrogram to denormalize.

        Raises:
            RuntimeError: Mean and variance are incompatible.

        Returns:
            np.ndarray: Denormalized spectrogram.
        """
        # pylint: disable=no-else-return
        S_denorm = S.copy()
        if self.signal_norm:
            # mean-var scaling
            if hasattr(self, "mel_scaler"):
                if S_denorm.shape[0] == self.num_mels:
                    return self.mel_scaler.inverse_transform(S_denorm.T).T
                elif S_denorm.shape[0] == self.fft_size / 2:
                    return self.linear_scaler.inverse_transform(S_denorm.T).T
                else:
                    raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.")
            if self.symmetric_norm:
                if self.clip_norm:
                    S_denorm = np.clip(
                        S_denorm, -self.max_norm, self.max_norm  # pylint: disable=invalid-unary-operand-type
                    )
                S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
                return S_denorm + self.ref_level_db
            else:
                if self.clip_norm:
                    S_denorm = np.clip(S_denorm, 0, self.max_norm)
                S_denorm = (S_denorm * -self.min_level_db / self.max_norm) + self.min_level_db
                return S_denorm + self.ref_level_db
        else:
            return S_denorm

    ### Mean-STD scaling ###
    def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np.array, Dict]:
        """Loading mean and variance statistics from a `npy` file.

        Args:
            stats_path (str): Path to the `npy` file containing

        Returns:
            Tuple[np.array, np.array, np.array, np.array, Dict]: loaded statistics and the config used to
                compute them.
        """
        stats = np.load(stats_path, allow_pickle=True).item()  # pylint: disable=unexpected-keyword-arg
        mel_mean = stats["mel_mean"]
        mel_std = stats["mel_std"]
        linear_mean = stats["linear_mean"]
        linear_std = stats["linear_std"]
        stats_config = stats["audio_config"]
        # check all audio parameters used for computing stats
        skip_parameters = ["griffin_lim_iters", "stats_path", "do_trim_silence", "ref_level_db", "power"]
        for key in stats_config.keys():
            if key in skip_parameters:
                continue
            if key not in ["sample_rate", "trim_db"]:
                assert (
                    stats_config[key] == self.__dict__[key]
                ), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}"
        return mel_mean, mel_std, linear_mean, linear_std, stats_config

    # pylint: disable=attribute-defined-outside-init
    def setup_scaler(
        self, mel_mean: np.ndarray, mel_std: np.ndarray, linear_mean: np.ndarray, linear_std: np.ndarray
    ) -> None:
        """Initialize scaler objects used in mean-std normalization.

        Args:
            mel_mean (np.ndarray): Mean for melspectrograms.
            mel_std (np.ndarray): STD for melspectrograms.
            linear_mean (np.ndarray): Mean for full scale spectrograms.
            linear_std (np.ndarray): STD for full scale spectrograms.
        """
        self.mel_scaler = StandardScaler()
        self.mel_scaler.set_stats(mel_mean, mel_std)
        self.linear_scaler = StandardScaler()
        self.linear_scaler.set_stats(linear_mean, linear_std)

    ### DB and AMP conversion ###
    # pylint: disable=no-self-use
    def _amp_to_db(self, x: np.ndarray) -> np.ndarray:
        """Convert amplitude values to decibels.

        Args:
            x (np.ndarray): Amplitude spectrogram.

        Returns:
            np.ndarray: Decibels spectrogram.
        """
        return self.spec_gain * _log(np.maximum(1e-5, x), self.base)

    # pylint: disable=no-self-use
    def _db_to_amp(self, x: np.ndarray) -> np.ndarray:
        """Convert decibels spectrogram to amplitude spectrogram.

        Args:
            x (np.ndarray): Decibels spectrogram.

        Returns:
            np.ndarray: Amplitude spectrogram.
        """
        return _exp(x / self.spec_gain, self.base)

    ### Preemphasis ###
    def apply_preemphasis(self, x: np.ndarray) -> np.ndarray:
        """Apply pre-emphasis to the audio signal. Useful to reduce the correlation between neighbouring signal values.

        Args:
            x (np.ndarray): Audio signal.

        Raises:
            RuntimeError: Preemphasis coeff is set to 0.

        Returns:
            np.ndarray: Decorrelated audio signal.
        """
        if self.preemphasis == 0:
            raise RuntimeError(" [!] Preemphasis is set 0.0.")
        return scipy.signal.lfilter([1, -self.preemphasis], [1], x)

    def apply_inv_preemphasis(self, x: np.ndarray) -> np.ndarray:
        """Reverse pre-emphasis."""
        if self.preemphasis == 0:
            raise RuntimeError(" [!] Preemphasis is set 0.0.")
        return scipy.signal.lfilter([1], [1, -self.preemphasis], x)

    ### SPECTROGRAMs ###
    def _linear_to_mel(self, spectrogram: np.ndarray) -> np.ndarray:
        """Project a full scale spectrogram to a melspectrogram.

        Args:
            spectrogram (np.ndarray): Full scale spectrogram.

        Returns:
            np.ndarray: Melspectrogram
        """
        return np.dot(self.mel_basis, spectrogram)

    def _mel_to_linear(self, mel_spec: np.ndarray) -> np.ndarray:
        """Convert a melspectrogram to full scale spectrogram."""
        return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec))

    def spectrogram(self, y: np.ndarray) -> np.ndarray:
        """Compute a spectrogram from a waveform.

        Args:
            y (np.ndarray): Waveform.

        Returns:
            np.ndarray: Spectrogram.
        """
        if self.preemphasis != 0:
            D = self._stft(self.apply_preemphasis(y))
        else:
            D = self._stft(y)
        if self.do_amp_to_db_linear:
            S = self._amp_to_db(np.abs(D))
        else:
            S = np.abs(D)
        return self.normalize(S).astype(np.float32)

    def melspectrogram(self, y: np.ndarray) -> np.ndarray:
        """Compute a melspectrogram from a waveform."""
        if self.preemphasis != 0:
            D = self._stft(self.apply_preemphasis(y))
        else:
            D = self._stft(y)
        if self.do_amp_to_db_mel:
            S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
        else:
            S = self._linear_to_mel(np.abs(D))
        return self.normalize(S).astype(np.float32)

    def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray:
        """Convert a spectrogram to a waveform using Griffi-Lim vocoder."""
        S = self.denormalize(spectrogram)
        S = self._db_to_amp(S)
        # Reconstruct phase
        if self.preemphasis != 0:
            return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
        return self._griffin_lim(S ** self.power)

    def inv_melspectrogram(self, mel_spectrogram: np.ndarray) -> np.ndarray:
        """Convert a melspectrogram to a waveform using Griffi-Lim vocoder."""
        D = self.denormalize(mel_spectrogram)
        S = self._db_to_amp(D)
        S = self._mel_to_linear(S)  # Convert back to linear
        if self.preemphasis != 0:
            return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
        return self._griffin_lim(S ** self.power)

    def out_linear_to_mel(self, linear_spec: np.ndarray) -> np.ndarray:
        """Convert a full scale linear spectrogram output of a network to a melspectrogram.

        Args:
            linear_spec (np.ndarray): Normalized full scale linear spectrogram.

        Returns:
            np.ndarray: Normalized melspectrogram.
        """
        S = self.denormalize(linear_spec)
        S = self._db_to_amp(S)
        S = self._linear_to_mel(np.abs(S))
        S = self._amp_to_db(S)
        mel = self.normalize(S)
        return mel

    ### STFT and ISTFT ###
    def _stft(self, y: np.ndarray) -> np.ndarray:
        """Librosa STFT wrapper.

        Args:
            y (np.ndarray): Audio signal.

        Returns:
            np.ndarray: Complex number array.
        """
        return librosa.stft(
            y=y,
            n_fft=self.fft_size,
            hop_length=self.hop_length,
            win_length=self.win_length,
            pad_mode=self.stft_pad_mode,
            window="hann",
            center=True,
        )

    def _istft(self, y: np.ndarray) -> np.ndarray:
        """Librosa iSTFT wrapper."""
        return librosa.istft(y, hop_length=self.hop_length, win_length=self.win_length)

    def _griffin_lim(self, S):
        """Griffin-Lim phase reconstruction from a magnitude spectrogram."""
        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
        # FIX: `np.complex` was deprecated in NumPy 1.20 and removed in 1.24;
        # np.complex128 is the explicit equivalent.
        S_complex = np.abs(S).astype(np.complex128)
        y = self._istft(S_complex * angles)
        if not np.isfinite(y).all():
            print(" [!] Waveform is not finite everywhere. Skipping the GL.")
            return np.array([0.0])
        for _ in range(self.griffin_lim_iters):
            angles = np.exp(1j * np.angle(self._stft(y)))
            y = self._istft(S_complex * angles)
        return y

    def compute_stft_paddings(self, x, pad_sides=1):
        """Compute paddings used by Librosa's STFT. Compute right padding (final frame) or both sides padding
        (first and final frames)"""
        assert pad_sides in (1, 2)
        pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0]
        if pad_sides == 1:
            return 0, pad
        return pad // 2, pad // 2 + pad % 2

    def compute_f0(self, x: np.ndarray) -> np.ndarray:
        """Compute pitch (f0) of a waveform using WORLD (dio + stonemask).

        Uses the same frame parameters as the melspectrogram so the f0 track
        aligns with spectrogram frames.

        Args:
            x (np.ndarray): Waveform.

        Returns:
            np.ndarray: Pitch.

        Examples:
            >>> WAV_FILE = filename = librosa.util.example_audio_file()
            >>> from TTS.config import BaseAudioConfig
            >>> from TTS.utils.audio import AudioProcessor
            >>> conf = BaseAudioConfig(mel_fmax=8000)
            >>> ap = AudioProcessor(**conf)
            >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
            >>> pitch = ap.compute_f0(wav)
        """
        f0, t = pw.dio(
            x.astype(np.double),
            fs=self.sample_rate,
            f0_ceil=self.mel_fmax,
            frame_period=1000 * self.hop_length / self.sample_rate,
        )
        f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
        return f0

    ### Audio Processing ###
    def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int:
        """Find the last point without silence at the end of a audio signal.

        Args:
            wav (np.ndarray): Audio signal.
            threshold_db (int, optional): Silence threshold in decibels. Defaults to -40.
            min_silence_sec (float, optional): Ignore silences that are shorter then this in secs. Defaults to 0.8.

        Returns:
            int: Last point without silence.
        """
        window_length = int(self.sample_rate * min_silence_sec)
        hop_length = int(window_length / 4)
        threshold = self._db_to_amp(threshold_db)
        for x in range(hop_length, len(wav) - window_length, hop_length):
            if np.max(wav[x : x + window_length]) < threshold:
                return x + hop_length
        return len(wav)

    def trim_silence(self, wav):
        """Trim silent parts with a threshold and 0.01 sec margin"""
        margin = int(self.sample_rate * 0.01)
        wav = wav[margin:-margin]
        return librosa.effects.trim(wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[
            0
        ]

    @staticmethod
    def sound_norm(x: np.ndarray) -> np.ndarray:
        """Normalize the volume of an audio signal.

        Args:
            x (np.ndarray): Raw waveform.

        Returns:
            np.ndarray: Volume normalized waveform.
        """
        return x / abs(x).max() * 0.95

    ### save and load ###
    def load_wav(self, filename: str, sr: int = None) -> np.ndarray:
        """Read a wav file using Librosa and optionally resample, silence trim, volume normalize.

        Args:
            filename (str): Path to the wav file.
            sr (int, optional): Sampling rate for resampling. Defaults to None.

        Returns:
            np.ndarray: Loaded waveform.
        """
        if self.resample:
            x, sr = librosa.load(filename, sr=self.sample_rate)
        elif sr is None:
            x, sr = sf.read(filename)
            assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr)
        else:
            x, sr = librosa.load(filename, sr=sr)
        if self.do_trim_silence:
            try:
                x = self.trim_silence(x)
            except ValueError:
                # FIX: the f-string had no placeholder; report which file failed.
                print(f" [!] File cannot be trimmed for silence - {filename}")
        if self.do_sound_norm:
            x = self.sound_norm(x)
        return x

    def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None:
        """Save a waveform to a file using Scipy.

        Args:
            wav (np.ndarray): Waveform to save.
            path (str): Path to a output file.
            sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
        """
        wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
        scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))

    @staticmethod
    def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray:
        """Mu-law companding followed by quantization to ``2**qc`` levels."""
        mu = 2 ** qc - 1
        signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1.0 + mu)
        # Quantize signal to the specified number of levels.
        signal = (signal + 1) / 2 * mu + 0.5
        return np.floor(
            signal,
        )

    @staticmethod
    def mulaw_decode(wav, qc):
        """Recovers waveform from quantized values."""
        mu = 2 ** qc - 1
        x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
        return x

    @staticmethod
    def encode_16bits(x):
        """Scale a [-1, 1] float waveform to clipped int16 samples."""
        return np.clip(x * 2 ** 15, -(2 ** 15), 2 ** 15 - 1).astype(np.int16)

    @staticmethod
    def quantize(x: np.ndarray, bits: int) -> np.ndarray:
        """Quantize a waveform to a given number of bits.

        Args:
            x (np.ndarray): Waveform to quantize. Must be normalized into the range `[-1, 1]`.
            bits (int): Number of quantization bits.

        Returns:
            np.ndarray: Quantized waveform.
        """
        return (x + 1.0) * (2 ** bits - 1) / 2

    @staticmethod
    def dequantize(x, bits):
        """Dequantize a waveform from the given number of bits."""
        return 2 * x / (2 ** bits - 1) - 1


def _log(x, base):
    """Logarithm in the given base (10 or natural)."""
    if base == 10:
        return np.log10(x)
    return np.log(x)


def _exp(x, base):
    """Inverse of :func:`_log`."""
    if base == 10:
        return np.power(10, x)
    return np.exp(x)


# --------------------------------------------------------------------------- #
# speaker/utils/coqpit.py — the diff starts a new file at this point.
# --------------------------------------------------------------------------- #
import argparse
import functools
import json
import operator
import os
from collections.abc import MutableMapping
from dataclasses import MISSING as _MISSING
from dataclasses import Field, asdict, dataclass, fields, is_dataclass, replace
from pathlib import Path
from pprint import pprint
from typing import Any, Dict, Generic, List, Optional, Type, TypeVar, Union, get_type_hints

T = TypeVar("T")
MISSING: Any = "???"


class _NoDefault(Generic[T]):
    """Sentinel type marking 'no default provided' for coqpit fields."""

    pass


NoDefaultVar = Union[_NoDefault[T], T]
no_default: NoDefaultVar = _NoDefault()


def is_primitive_type(arg_type: Any) -> bool:
    """Check if the input type is one of `int, float, str, bool`.

    Args:
        arg_type (typing.Any): input type to check.

    Returns:
        bool: True if input type is one of `int, float, str, bool`.
    """
    try:
        return isinstance(arg_type(), (int, float, str, bool))
    except (AttributeError, TypeError):
        return False


def is_list(arg_type: Any) -> bool:
    """Check if the input type is `list`

    Args:
        arg_type (typing.Any): input type.

    Returns:
        bool: True if input type is `list`
    """
    try:
        return arg_type is list or arg_type is List or arg_type.__origin__ is list or arg_type.__origin__ is List
    except AttributeError:
        return False


def is_dict(arg_type: Any) -> bool:
    """Check if the input type is `dict`

    Args:
        arg_type (typing.Any): input type.

    Returns:
        bool: True if input type is `dict`
    """
    try:
        return arg_type is dict or arg_type is Dict or arg_type.__origin__ is dict
    except AttributeError:
        return False
+ + Returns: + bool: True if input type is `dict` + """ + try: + return arg_type is dict or arg_type is Dict or arg_type.__origin__ is dict + except AttributeError: + return False + + +def is_union(arg_type: Any) -> bool: + """Check if the input type is `Union`. + + Args: + arg_type (typing.Any): input type. + + Returns: + bool: True if input type is `Union` + """ + try: + return safe_issubclass(arg_type.__origin__, Union) + except AttributeError: + return False + + +def safe_issubclass(cls, classinfo) -> bool: + """Check if the input type is a subclass of the given class. + + Args: + cls (type): input type. + classinfo (type): parent class. + + Returns: + bool: True if the input type is a subclass of the given class + """ + try: + r = issubclass(cls, classinfo) + except Exception: # pylint: disable=broad-except + return cls is classinfo + else: + return r + + +def _coqpit_json_default(obj: Any) -> Any: + if isinstance(obj, Path): + return str(obj) + raise TypeError(f"Can't encode object of type {type(obj).__name__}") + + +def _default_value(x: Field): + """Return the default value of the input Field. + + Args: + x (Field): input Field. + + Returns: + object: default value of the input Field. + """ + if x.default not in (MISSING, _MISSING): + return x.default + if x.default_factory not in (MISSING, _MISSING): + return x.default_factory() + return x.default + + +def _is_optional_field(field) -> bool: + """Check if the input field is optional. + + Args: + field (Field): input Field to check. + + Returns: + bool: True if the input field is optional. + """ + # return isinstance(field.type, _GenericAlias) and type(None) in getattr(field.type, "__args__") + return type(None) in getattr(field.type, "__args__") + + +def my_get_type_hints( + cls, +): + """Custom `get_type_hints` dealing with https://github.com/python/typing/issues/737 + + Returns: + [dataclass]: dataclass to get the type hints of its fields. 
+ """ + r_dict = {} + for base in cls.__class__.__bases__: + if base == object: + break + r_dict.update(my_get_type_hints(base)) + r_dict.update(get_type_hints(cls)) + return r_dict + + +def _serialize(x): + """Pick the right serialization for the datatype of the given input. + + Args: + x (object): input object. + + Returns: + object: serialized object. + """ + if isinstance(x, Path): + return str(x) + if isinstance(x, dict): + return {k: _serialize(v) for k, v in x.items()} + if isinstance(x, list): + return [_serialize(xi) for xi in x] + if isinstance(x, Serializable) or issubclass(type(x), Serializable): + return x.serialize() + if isinstance(x, type) and issubclass(x, Serializable): + return x.serialize(x) + return x + + +def _deserialize_dict(x: Dict) -> Dict: + """Deserialize dict. + + Args: + x (Dict): value to deserialized. + + Returns: + Dict: deserialized dictionary. + """ + out_dict = {} + for k, v in x.items(): + if v is None: # if {'key':None} + out_dict[k] = None + else: + out_dict[k] = _deserialize(v, type(v)) + return out_dict + + +def _deserialize_list(x: List, field_type: Type) -> List: + """Deserialize values for List typed fields. + + Args: + x (List): value to be deserialized + field_type (Type): field type. + + Raises: + ValueError: Coqpit does not support multi type-hinted lists. + + Returns: + [List]: deserialized list. + """ + field_args = None + if hasattr(field_type, "__args__") and field_type.__args__: + field_args = field_type.__args__ + elif hasattr(field_type, "__parameters__") and field_type.__parameters__: + # bandaid for python 3.6 + field_args = field_type.__parameters__ + if field_args: + if len(field_args) > 1: + raise ValueError(" [!] Coqpit does not support multi-type hinted 'List'") + field_arg = field_args[0] + # if field type is TypeVar set the current type by the value's type. 
+ if isinstance(field_arg, TypeVar): + field_arg = type(x) + return [_deserialize(xi, field_arg) for xi in x] + return x + + +def _deserialize_union(x: Any, field_type: Type) -> Any: + """Deserialize values for Union typed fields + + Args: + x (Any): value to be deserialized. + field_type (Type): field type. + + Returns: + [Any]: desrialized value. + """ + for arg in field_type.__args__: + # stop after first matching type in Union + try: + x = _deserialize(x, arg) + break + except ValueError: + pass + return x + + +def _deserialize_primitive_types(x: Union[int, float, str, bool], field_type: Type) -> Union[int, float, str, bool]: + """Deserialize python primitive types (float, int, str, bool). + It handles `inf` values exclusively and keeps them float against int fields since int does not support inf values. + + Args: + x (Union[int, float, str, bool]): value to be deserialized. + field_type (Type): field type. + + Returns: + Union[int, float, str, bool]: deserialized value. + """ + + if isinstance(x, (str, bool)): + return x + if isinstance(x, (int, float)): + if x == float("inf") or x == float("-inf"): + # if value type is inf return regardless. + return x + x = field_type(x) + return x + # TODO: Raise an error when x does not match the types. + return None + + +def _deserialize(x: Any, field_type: Any) -> Any: + """Pick the right desrialization for the given object and the corresponding field type. + + Args: + x (object): object to be deserialized. + field_type (type): expected type after deserialization. 
+ + Returns: + object: deserialized object + + """ + # pylint: disable=too-many-return-statements + if is_dict(field_type): + return _deserialize_dict(x) + if is_list(field_type): + return _deserialize_list(x, field_type) + if is_union(field_type): + return _deserialize_union(x, field_type) + if issubclass(field_type, Serializable): + return field_type.deserialize_immutable(x) + if is_primitive_type(field_type): + return _deserialize_primitive_types(x, field_type) + raise ValueError(f" [!] '{type(x)}' value type of '{x}' does not match '{field_type}' field type.") + + +# Recursive setattr (supports dotted attr names) +def rsetattr(obj, attr, val): + def _setitem(obj, attr, val): + return operator.setitem(obj, int(attr), val) + + pre, _, post = attr.rpartition(".") + setfunc = _setitem if post.isnumeric() else setattr + + return setfunc(rgetattr(obj, pre) if pre else obj, post, val) + + +# Recursive getattr (supports dotted attr names) +def rgetattr(obj, attr, *args): + def _getitem(obj, attr): + return operator.getitem(obj, int(attr), *args) + + def _getattr(obj, attr): + getfunc = _getitem if attr.isnumeric() else getattr + return getfunc(obj, attr, *args) + + return functools.reduce(_getattr, [obj] + attr.split(".")) + + +# Recursive setitem (supports dotted attr names) +def rsetitem(obj, attr, val): + pre, _, post = attr.rpartition(".") + return operator.setitem(rgetitem(obj, pre) if pre else obj, post, val) + + +# Recursive getitem (supports dotted attr names) +def rgetitem(obj, attr, *args): + def _getitem(obj, attr): + return operator.getitem(obj, int(attr) if attr.isnumeric() else attr, *args) + + return functools.reduce(_getitem, [obj] + attr.split(".")) + + +@dataclass +class Serializable: + """Gives serialization ability to any inheriting dataclass.""" + + def __post_init__(self): + self._validate_contracts() + for key, value in self.__dict__.items(): + if value is no_default: + raise TypeError(f"__init__ missing 1 required argument: '{key}'") + + def 
_validate_contracts(self): + dataclass_fields = fields(self) + + for field in dataclass_fields: + + value = getattr(self, field.name) + + if value is None: + if not _is_optional_field(field): + raise TypeError(f"{field.name} is not optional") + + contract = field.metadata.get("contract", None) + + if contract is not None: + if value is not None and not contract(value): + raise ValueError(f"break the contract for {field.name}, {self.__class__.__name__}") + + def validate(self): + """validate if object can serialize / deserialize correctly.""" + self._validate_contracts() + if self != self.__class__.deserialize( # pylint: disable=no-value-for-parameter + json.loads(json.dumps(self.serialize())) + ): + raise ValueError("could not be deserialized with same value") + + def to_dict(self) -> dict: + """Transform serializable object to dict.""" + cls_fields = fields(self) + o = {} + for cls_field in cls_fields: + o[cls_field.name] = getattr(self, cls_field.name) + return o + + def serialize(self) -> dict: + """Serialize object to be json serializable representation.""" + if not is_dataclass(self): + raise TypeError("need to be decorated as dataclass") + + dataclass_fields = fields(self) + + o = {} + + for field in dataclass_fields: + value = getattr(self, field.name) + value = _serialize(value) + o[field.name] = value + return o + + def deserialize(self, data: dict) -> "Serializable": + """Parse input dictionary and desrialize its fields to a dataclass. + + Returns: + self: deserialized `self`. + """ + if not isinstance(data, dict): + raise ValueError() + data = data.copy() + init_kwargs = {} + for field in fields(self): + # if field.name == 'dataset_config': + if field.name not in data: + if field.name in vars(self): + init_kwargs[field.name] = vars(self)[field.name] + continue + raise ValueError(f' [!] 
Missing required field "{field.name}"') + value = data.get(field.name, _default_value(field)) + if value is None: + init_kwargs[field.name] = value + continue + if value == MISSING: + raise ValueError(f"deserialized with unknown value for {field.name} in {self.__name__}") + value = _deserialize(value, field.type) + init_kwargs[field.name] = value + for k, v in init_kwargs.items(): + setattr(self, k, v) + return self + + @classmethod + def deserialize_immutable(cls, data: dict) -> "Serializable": + """Parse input dictionary and desrialize its fields to a dataclass. + + Returns: + Newly created deserialized object. + """ + if not isinstance(data, dict): + raise ValueError() + data = data.copy() + init_kwargs = {} + for field in fields(cls): + # if field.name == 'dataset_config': + if field.name not in data: + if field.name in vars(cls): + init_kwargs[field.name] = vars(cls)[field.name] + continue + # if not in cls and the default value is not Missing use it + default_value = _default_value(field) + if default_value not in (MISSING, _MISSING): + init_kwargs[field.name] = default_value + continue + raise ValueError(f' [!] 
def _get_help(field):
    """Return the help text stored in the field's ``metadata``, or ``""`` if unset."""
    try:
        field_help = field.metadata["help"]
    except KeyError:
        field_help = ""
    return field_help


def _init_argparse(
    parser,
    field_name,
    field_type,
    field_default,
    field_default_factory,
    field_help,
    arg_prefix="",
    help_prefix="",
    relaxed_parser=False,
):
    """Recursively register one dataclass field as an argparse argument.

    Dispatches on the field's type hint (dict, hinted list, union, nested
    Serializable, bool, primitive).  Nested structures are flattened into
    dotted argument names (``--prefix.field`` / ``--prefix.list.idx``).

    Args:
        parser: argparse.ArgumentParser being populated.
        field_name: dataclass field name.
        field_type: the field's type hint.
        field_default: the field's default value (or current value), if any.
        field_default_factory: the field's ``default_factory``, if any.
        field_help: help string pulled from field metadata.
        arg_prefix: dotted prefix accumulated while recursing.
        help_prefix: help-text prefix accumulated while recursing.
        relaxed_parser: when True, silently skip unsupported field types
            instead of raising.

    Returns:
        The same parser, with zero or more arguments added.
    """
    has_default = False
    default = None
    if field_default:
        has_default = True
        default = field_default
    elif field_default_factory not in (None, _MISSING):
        has_default = True
        default = field_default_factory()

    if not has_default and not is_primitive_type(field_type) and not is_list(field_type):
        # aggregate types (fields with a Coqpit subclass as type) are not supported without None
        return parser
    arg_prefix = field_name if arg_prefix == "" else f"{arg_prefix}.{field_name}"
    help_prefix = field_help if help_prefix == "" else f"{help_prefix} - {field_help}"
    if is_dict(field_type):  # pylint: disable=no-else-raise
        # NOTE: accept any string in json format as input to dict field.
        parser.add_argument(
            f"--{arg_prefix}",
            dest=arg_prefix,
            default=json.dumps(field_default) if field_default else None,
            type=json.loads,
        )
    elif is_list(field_type):
        # TODO: We need a more clear help msg for lists.
        if hasattr(field_type, "__args__"):  # if the list is hinted
            if len(field_type.__args__) > 1 and not relaxed_parser:
                raise ValueError(" [!] Coqpit does not support multi-type hinted 'List'")
            list_field_type = field_type.__args__[0]
        else:
            raise ValueError(" [!] Coqpit does not support un-hinted 'List'")

        # TODO: handle list of lists
        if is_list(list_field_type) and relaxed_parser:
            return parser

        if not has_default or field_default_factory is list:
            if not is_primitive_type(list_field_type) and not relaxed_parser:
                raise NotImplementedError(" [!] Empty list with non primitive inner type is currently not supported.")

            # If the list's default value is None, the user can specify the entire list by passing multiple parameters
            parser.add_argument(
                f"--{arg_prefix}",
                nargs="*",
                type=list_field_type,
                help=f"Coqpit Field: {help_prefix}",
            )
        else:
            # If a default value is defined, just enable editing the values from argparse
            # TODO: allow inserting a new value/obj to the end of the list.
            # Recurse per element: each list item becomes "--prefix.<index>".
            for idx, fv in enumerate(default):
                parser = _init_argparse(
                    parser,
                    str(idx),
                    list_field_type,
                    fv,
                    field_default_factory,
                    field_help="",
                    help_prefix=f"{help_prefix} - ",
                    arg_prefix=f"{arg_prefix}",
                    relaxed_parser=relaxed_parser,
                )
    elif is_union(field_type):
        # TODO: currently I don't know how to handle Union type on argparse
        if not relaxed_parser:
            raise NotImplementedError(
                " [!] Parsing `Union` field from argparse is not yet implemented. Please create an issue."
            )
    elif issubclass(field_type, Serializable):
        # Nested Coqpit/Serializable: let the nested object register its own fields.
        return default.init_argparse(
            parser, arg_prefix=arg_prefix, help_prefix=help_prefix, relaxed_parser=relaxed_parser
        )
    elif isinstance(field_type(), bool):

        def parse_bool(x):
            # argparse's plain `type=bool` treats any non-empty string as True,
            # so booleans are parsed from explicit "true"/"false" strings instead.
            if x not in ("true", "false"):
                raise ValueError(f' [!] Value for boolean field must be either "true" or "false". Got "{x}".')
            return x == "true"

        parser.add_argument(
            f"--{arg_prefix}",
            type=parse_bool,
            default=field_default,
            help=f"Coqpit Field: {help_prefix}",
            metavar="true/false",
        )
    elif is_primitive_type(field_type):
        parser.add_argument(
            f"--{arg_prefix}",
            default=field_default,
            type=field_type,
            help=f"Coqpit Field: {help_prefix}",
        )
    else:
        if not relaxed_parser:
            raise NotImplementedError(f" [!] '{field_type}' is not supported by arg_parser. Please file a bug report.")
    return parser
Useful to prevent running some aux functions + at the initialization when no attribute has been defined.""" + return "_initialized" in vars(self) and self._initialized + + def __post_init__(self): + self._initialized = True + try: + self.check_values() + except AttributeError: + pass + + ## `dict` API functions + + def __iter__(self): + return iter(asdict(self)) + + def __len__(self): + return len(fields(self)) + + def __setitem__(self, arg: str, value: Any): + setattr(self, arg, value) + + def __getitem__(self, arg: str): + """Access class attributes with ``[arg]``.""" + return self.__dict__[arg] + + def __delitem__(self, arg: str): + delattr(self, arg) + + def _keytransform(self, key): # pylint: disable=no-self-use + return key + + ## end `dict` API functions + + def __getattribute__(self, arg: str): # pylint: disable=no-self-use + """Check if the mandatory field is defined when accessing it.""" + value = super().__getattribute__(arg) + if isinstance(value, str) and value == "???": + raise AttributeError(f" [!] MISSING field {arg} must be defined.") + return value + + def __contains__(self, arg: str): + return arg in self.to_dict() + + def get(self, key: str, default: Any = None): + if self.has(key): + return asdict(self)[key] + return default + + def items(self): + return asdict(self).items() + + def merge(self, coqpits: Union["Coqpit", List["Coqpit"]]): + """Merge a coqpit instance or a list of coqpit instances to self. + Note that it does not pass the fields and overrides attributes with + the last Coqpit instance in the given List. + TODO: find a way to merge instances with all the class internals. + + Args: + coqpits (Union[Coqpit, List[Coqpit]]): coqpit instance or list of instances to be merged. 
+ """ + + def _merge(coqpit): + self.__dict__.update(coqpit.__dict__) + self.__annotations__.update(coqpit.__annotations__) + self.__dataclass_fields__.update(coqpit.__dataclass_fields__) + + if isinstance(coqpits, list): + for coqpit in coqpits: + _merge(coqpit) + else: + _merge(coqpits) + + def check_values(self): + pass + + def has(self, arg: str) -> bool: + return arg in vars(self) + + def copy(self): + return replace(self) + + def update(self, new: dict, allow_new=False) -> None: + """Update Coqpit fields by the input ```dict```. + + Args: + new (dict): dictionary with new values. + allow_new (bool, optional): allow new fields to add. Defaults to False. + """ + for key, value in new.items(): + if allow_new: + setattr(self, key, value) + else: + if hasattr(self, key): + setattr(self, key, value) + else: + raise KeyError(f" [!] No key - {key}") + + def pprint(self) -> None: + """Print Coqpit fields in a format.""" + pprint(asdict(self)) + + def to_dict(self) -> dict: + # return asdict(self) + return self.serialize() + + def from_dict(self, data: dict) -> None: + self = self.deserialize(data) # pylint: disable=self-cls-assignment + + @classmethod + def new_from_dict(cls: Serializable, data: dict) -> "Coqpit": + return cls.deserialize_immutable(data) + + def to_json(self) -> str: + """Returns a JSON string representation.""" + return json.dumps(asdict(self), indent=4, default=_coqpit_json_default) + + def save_json(self, file_name: str) -> None: + """Save Coqpit to a json file. + + Args: + file_name (str): path to the output json file. + """ + with open(file_name, "w", encoding="utf8") as f: + json.dump(asdict(self), f, indent=4) + + def load_json(self, file_name: str) -> None: + """Load a json file and update matching config fields with type checking. + Non-matching parameters in the json file are ignored. + + Args: + file_name (str): path to the json file. + + Returns: + Coqpit: new Coqpit with updated config fields. 
+ """ + with open(file_name, "r", encoding="utf8") as f: + input_str = f.read() + dump_dict = json.loads(input_str) + # TODO: this looks stupid 💆 + self = self.deserialize(dump_dict) # pylint: disable=self-cls-assignment + self.check_values() + + @classmethod + def init_from_argparse( + cls, args: Optional[Union[argparse.Namespace, List[str]]] = None, arg_prefix: str = "coqpit" + ) -> "Coqpit": + """Create a new Coqpit instance from argparse input. + + Args: + args (namespace or list of str, optional): parsed argparse.Namespace or list of command line parameters. If unspecified will use a newly created parser with ```init_argparse()```. + arg_prefix: prefix to add to CLI parameters. Gets forwarded to ```init_argparse``` when ```args``` is not passed. + """ + if not args: + # If args was not specified, parse from sys.argv + parser = cls.init_argparse(cls, arg_prefix=arg_prefix) + args = parser.parse_args() # pylint: disable=E1120, E1111 + if isinstance(args, list): + # If a list was passed in (eg. the second result of `parse_known_args`, run that through argparse first to get a parsed Namespace + parser = cls.init_argparse(cls, arg_prefix=arg_prefix) + args = parser.parse_args(args) # pylint: disable=E1120, E1111 + + # Handle list and object attributes with defaults, which can be modified + # directly (eg. 
--coqpit.list.0.val_a 1), by constructing real objects + # from defaults and passing those to `cls.__init__` + args_with_lists_processed = {} + class_fields = fields(cls) + for field in class_fields: + has_default = False + default = None + field_default = field.default if field.default is not _MISSING else None + field_default_factory = field.default_factory if field.default_factory is not _MISSING else None + if field_default: + has_default = True + default = field_default + elif field_default_factory: + has_default = True + default = field_default_factory() + + if has_default and (not is_primitive_type(field.type) or is_list(field.type)): + args_with_lists_processed[field.name] = default + + args_dict = vars(args) + for k, v in args_dict.items(): + # Remove argparse prefix (eg. "--coqpit." if present) + if k.startswith(f"{arg_prefix}."): + k = k[len(f"{arg_prefix}.") :] + + rsetitem(args_with_lists_processed, k, v) + + return cls(**args_with_lists_processed) + + def parse_args( + self, args: Optional[Union[argparse.Namespace, List[str]]] = None, arg_prefix: str = "coqpit" + ) -> None: + """Update config values from argparse arguments with some meta-programming ✨. + + Args: + args (namespace or list of str, optional): parsed argparse.Namespace or list of command line parameters. If unspecified will use a newly created parser with ```init_argparse()```. + arg_prefix: prefix to add to CLI parameters. Gets forwarded to ```init_argparse``` when ```args``` is not passed. + """ + if not args: + # If args was not specified, parse from sys.argv + parser = self.init_argparse(arg_prefix=arg_prefix) + args = parser.parse_args() + if isinstance(args, list): + # If a list was passed in (eg. 
the second result of `parse_known_args`, run that through argparse first to get a parsed Namespace + parser = self.init_argparse(arg_prefix=arg_prefix) + args = parser.parse_args(args) + + args_dict = vars(args) + + for k, v in args_dict.items(): + if k.startswith(f"{arg_prefix}."): + k = k[len(f"{arg_prefix}.") :] + try: + rgetattr(self, k) + except (TypeError, AttributeError) as e: + raise Exception(f" [!] '{k}' not exist to override from argparse.") from e + + rsetattr(self, k, v) + + self.check_values() + + def parse_known_args( + self, + args: Optional[Union[argparse.Namespace, List[str]]] = None, + arg_prefix: str = "coqpit", + relaxed_parser=False, + ) -> List[str]: + """Update config values from argparse arguments. Ignore unknown arguments. + This is analog to argparse.ArgumentParser.parse_known_args (vs parse_args). + + Args: + args (namespace or list of str, optional): parsed argparse.Namespace or list of command line parameters. If unspecified will use a newly created parser with ```init_argparse()```. + arg_prefix: prefix to add to CLI parameters. Gets forwarded to ```init_argparse``` when ```args``` is not passed. + relaxed_parser (bool, optional): If True, do not force all the fields to have compatible types with the argparser. Defaults to False. + + Returns: + List of unknown parameters. + """ + if not args: + # If args was not specified, parse from sys.argv + parser = self.init_argparse(arg_prefix=arg_prefix, relaxed_parser=relaxed_parser) + args, unknown = parser.parse_known_args() + if isinstance(args, list): + # If a list was passed in (eg. 
the second result of `parse_known_args`, run that through argparse first to get a parsed Namespace + parser = self.init_argparse(arg_prefix=arg_prefix, relaxed_parser=relaxed_parser) + args, unknown = parser.parse_known_args(args) + + self.parse_args(args) + return unknown + + def init_argparse( + self, + parser: Optional[argparse.ArgumentParser] = None, + arg_prefix="coqpit", + help_prefix="", + relaxed_parser=False, + ) -> argparse.ArgumentParser: + """Pass Coqpit fields as argparse arguments. This allows to edit values through command-line. + + Args: + parser (argparse.ArgumentParser, optional): argparse.ArgumentParser instance. If unspecified a new one will be created. + arg_prefix (str, optional): Prefix to be used for the argument name. Defaults to 'coqpit'. + help_prefix (str, optional): Prefix to be used for the argument description. Defaults to ''. + relaxed_parser (bool, optional): If True, do not force all the fields to have compatible types with the argparser. Defaults to False. + + Returns: + argparse.ArgumentParser: parser instance with the new arguments. 
+ """ + if not parser: + parser = argparse.ArgumentParser() + class_fields = fields(self) + for field in class_fields: + if field.name in vars(self): + # use the current value of the field + # prevent dropping the current value + field_default = vars(self)[field.name] + else: + # use the default value of the field + field_default = field.default if field.default is not _MISSING else None + field_type = field.type + field_default_factory = field.default_factory + field_help = _get_help(field) + _init_argparse( + parser, + field.name, + field_type, + field_default, + field_default_factory, + field_help, + arg_prefix, + help_prefix, + relaxed_parser, + ) + return parser + + +def check_argument( + name, + c, + is_path: bool = False, + prerequest: str = None, + enum_list: list = None, + max_val: float = None, + min_val: float = None, + restricted: bool = False, + alternative: str = None, + allow_none: bool = True, +) -> None: + """Simple type and value checking for Coqpit. + It is intended to be used under ```__post_init__()``` of config dataclasses. + + Args: + name (str): name of the field to be checked. + c (dict): config dictionary. + is_path (bool, optional): if ```True``` check if the path is exist. Defaults to False. + prerequest (list or str, optional): a list of field name that are prerequestedby the target field name. + Defaults to ```[]```. + enum_list (list, optional): list of possible values for the target field. Defaults to None. + max_val (float, optional): maximum possible value for the target field. Defaults to None. + min_val (float, optional): minimum possible value for the target field. Defaults to None. + restricted (bool, optional): if ```True``` the target field has to be defined. Defaults to False. + alternative (str, optional): a field name superceding the target field. Defaults to None. + allow_none (bool, optional): if ```True``` allow the target field to be ```None```. Defaults to False. 
def check_argument(
    name,
    c,
    is_path: bool = False,
    prerequest: str = None,
    enum_list: list = None,
    max_val: float = None,
    min_val: float = None,
    restricted: bool = False,
    alternative: str = None,
    allow_none: bool = True,
) -> None:
    """Simple type and value checking for Coqpit.
    It is intended to be used under ```__post_init__()``` of config dataclasses.

    Args:
        name (str): name of the field to be checked.
        c (dict): config dictionary.
        is_path (bool, optional): if ```True``` check if the path is exist. Defaults to False.
        prerequest (list or str, optional): a list of field names that are prerequested by the target field name.
            Defaults to ```None```.
        enum_list (list, optional): list of possible values for the target field. Defaults to None.
        max_val (float, optional): maximum possible value for the target field. Defaults to None.
        min_val (float, optional): minimum possible value for the target field. Defaults to None.
        restricted (bool, optional): if ```True``` the target field has to be defined. Defaults to False.
        alternative (str, optional): a field name superceding the target field. Defaults to None.
        allow_none (bool, optional): if ```True``` allow the target field to be ```None```. Defaults to True.


    Example:
        >>> num_mels = 5
        >>> check_argument('num_mels', c, restricted=True, min_val=10, max_val=2056)
        >>> fft_size = 128
        >>> check_argument('fft_size', c, restricted=True, min_val=128, max_val=4058)
    """
    # check if None allowed
    if allow_none and c[name] is None:
        return
    if not allow_none:
        assert c[name] is not None, f" [!] None value is not allowed for {name}."
    # check if restricted and if it is check if it exists
    if isinstance(restricted, bool) and restricted:
        assert name in c.keys(), f" [!] {name} not defined in config.json"
    # check prerequest fields are defined
    if isinstance(prerequest, list):
        # BUG FIX: the original asserted `any(f not in c ...)`, which PASSED only
        # when at least one prerequisite was MISSING — the inverse of the intent.
        assert all(
            f in c.keys() for f in prerequest
        ), f" [!] prequested fields {prerequest} for {name} are not defined."
    else:
        assert (
            prerequest is None or prerequest in c.keys()
        ), f" [!] prequested fields {prerequest} for {name} are not defined."
    # check if the path exists
    if is_path:
        assert os.path.exists(c[name]), f' [!] path for {name} ("{c[name]}") does not exist.'
    # skip the rest if the alternative field is defined.
    if alternative in c.keys() and c[alternative] is not None:
        return
    # check value constraints
    if name in c.keys():
        if max_val is not None:
            assert c[name] <= max_val, f" [!] {name} is larger than max value {max_val}"
        if min_val is not None:
            assert c[name] >= min_val, f" [!] {name} is smaller than min value {min_val}"
        if enum_list is not None:
            assert c[name].lower() in enum_list, f" [!] {name} is not a valid value"
def copy_model_files(config: Coqpit, out_path, new_fields):
    """Copy config.json and other model files to training folder and add
    new fields.

    Args:
        config (Coqpit): Coqpit config defining the training run.
        out_path (str): output path to copy the file.
        new_fields (dict): new fields to be added or edited
            in the config file.
    """
    copy_config_path = os.path.join(out_path, "config.json")
    # add extra information fields
    config.update(new_fields, allow_new=True)
    # TODO: Revert to config.save_json() once Coqpit supports arbitrary paths.
    with fsspec.open(copy_config_path, "w", encoding="utf8") as f:
        json.dump(config.to_dict(), f, indent=4)

    # copy model stats file if available
    if config.audio.stats_path is not None:
        copy_stats_path = os.path.join(out_path, "scale_stats.npy")
        filesystem = fsspec.get_mapper(copy_stats_path).fs
        if not filesystem.exists(copy_stats_path):
            with fsspec.open(config.audio.stats_path, "rb") as source_file:
                with fsspec.open(copy_stats_path, "wb") as target_file:
                    shutil.copyfileobj(source_file, target_file)


def load_fsspec(
    path: str,
    map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None,
    **kwargs,
) -> Any:
    """Like torch.load but can load from other locations (e.g. s3:// , gs://).

    Args:
        path: Any path or url supported by fsspec.
        map_location: torch.device or str.
        **kwargs: Keyword arguments forwarded to torch.load.

    Returns:
        Object stored in path.
    """
    with fsspec.open(path, "rb") as f:
        # NOTE(review): torch.load unpickles arbitrary objects (no `weights_only`
        # guard) — only load trusted checkpoints here.
        return torch.load(f, map_location=map_location, **kwargs)


def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False):  # pylint: disable=redefined-builtin
    """Load a checkpoint into ``model`` and return ``(model, state)``.

    Args:
        model: torch module to receive the checkpoint's "model" state dict.
        checkpoint_path: any fsspec-supported path or URL.
        use_cuda (bool): move the model to GPU after loading.
        eval (bool): put the model in eval mode after loading.
    """
    try:
        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
    except ModuleNotFoundError:
        # Legacy checkpoints pickled classes under the old `mozilla_voice_tts`
        # module path; swap in RenamingUnpickler so they resolve to `TTS`.
        # NOTE(review): this monkey-patches the pickle module globally for the
        # rest of the process.
        pickle_tts.Unpickler = RenamingUnpickler
        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts)
    model.load_state_dict(state["model"])
    if use_cuda:
        model.cuda()
    if eval:
        model.eval()
    return model, state


def save_fsspec(state: Any, path: str, **kwargs):
    """Like torch.save but can save to other locations (e.g. s3:// , gs://).

    Args:
        state: State object to save
        path: Any path or url supported by fsspec.
        **kwargs: Keyword arguments forwarded to torch.save.
    """
    with fsspec.open(path, "wb") as f:
        torch.save(state, f, **kwargs)


def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs):
    """Assemble a checkpoint dict (config + model/optimizer/scaler states and
    training progress) and write it to ``output_path`` via fsspec.

    Extra ``**kwargs`` entries are merged into the saved state dict.
    """
    # unwrap DataParallel/DistributedDataParallel before reading the state dict
    if hasattr(model, "module"):
        model_state = model.module.state_dict()
    else:
        model_state = model.state_dict()
    # optimizer/scaler may be absent (None) or multiple (list)
    if isinstance(optimizer, list):
        optimizer_state = [optim.state_dict() for optim in optimizer]
    else:
        optimizer_state = optimizer.state_dict() if optimizer is not None else None

    if isinstance(scaler, list):
        scaler_state = [s.state_dict() for s in scaler]
    else:
        scaler_state = scaler.state_dict() if scaler is not None else None

    if isinstance(config, Coqpit):
        config = config.to_dict()

    state = {
        "config": config,
        "model": model_state,
        "optimizer": optimizer_state,
        "scaler": scaler_state,
        "step": current_step,
        "epoch": epoch,
        "date": datetime.date.today().strftime("%B %d, %Y"),
    }
    state.update(kwargs)
    save_fsspec(state, output_path)


def save_checkpoint(
    config,
    model,
    optimizer,
    scaler,
    current_step,
    epoch,
    output_folder,
    **kwargs,
):
    """Save a step-numbered checkpoint file ``checkpoint_<step>.pth.tar`` in ``output_folder``."""
    file_name = "checkpoint_{}.pth.tar".format(current_step)
    checkpoint_path = os.path.join(output_folder, file_name)
    print("\n > CHECKPOINT : {}".format(checkpoint_path))
    save_model(
        config,
        model,
        optimizer,
        scaler,
        current_step,
        epoch,
        checkpoint_path,
        **kwargs,
    )
is saved successfully + if not keep_all_best or (current_step < keep_after): + model_names = fs.glob(os.path.join(out_path, "best_model*.pth.tar")) + for model_name in model_names: + if os.path.basename(model_name) != best_model_name: + fs.rm(model_name) + # create a shortcut which always points to the currently best model + shortcut_name = "best_model.pth.tar" + shortcut_path = os.path.join(out_path, shortcut_name) + fs.copy(checkpoint_path, shortcut_path) + best_loss = current_loss + return best_loss diff --git a/speaker/utils/shared_configs.py b/speaker/utils/shared_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..a89d3a91c31679989b60b657fe8ef6ace5f02552 --- /dev/null +++ b/speaker/utils/shared_configs.py @@ -0,0 +1,342 @@ +from dataclasses import asdict, dataclass +from typing import List + +from .coqpit import Coqpit, check_argument + + +@dataclass +class BaseAudioConfig(Coqpit): + """Base config to definge audio processing parameters. It is used to initialize + ```TTS.utils.audio.AudioProcessor.``` + + Args: + fft_size (int): + Number of STFT frequency levels aka.size of the linear spectogram frame. Defaults to 1024. + + win_length (int): + Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match + ```fft_size```. Defaults to 1024. + + hop_length (int): + Number of audio samples between adjacent STFT columns. Defaults to 1024. + + frame_shift_ms (int): + Set ```hop_length``` based on milliseconds and sampling rate. + + frame_length_ms (int): + Set ```win_length``` based on milliseconds and sampling rate. + + stft_pad_mode (str): + Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'. + + sample_rate (int): + Audio sampling rate. Defaults to 22050. + + resample (bool): + Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```. + + preemphasis (float): + Preemphasis coefficient. Defaults to 0.0. 
@dataclass
class BaseAudioConfig(Coqpit):
    """Base config to define audio processing parameters. It is used to initialize
    ```TTS.utils.audio.AudioProcessor.```

    Args:
        fft_size (int):
            Number of STFT frequency levels aka. size of the linear spectrogram frame. Defaults to 1024.

        win_length (int):
            Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
            ```fft_size```. Defaults to 1024.

        hop_length (int):
            Number of audio samples between adjacent STFT columns. Defaults to 256.

        frame_shift_ms (int):
            Set ```hop_length``` based on milliseconds and sampling rate.

        frame_length_ms (int):
            Set ```win_length``` based on milliseconds and sampling rate.

        stft_pad_mode (str):
            Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.

        sample_rate (int):
            Audio sampling rate. Defaults to 22050.

        resample (bool):
            Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.

        preemphasis (float):
            Preemphasis coefficient. Defaults to 0.0.

        ref_level_db (int):
            Reference dB level to rebase the audio signal and ignore the level below. 20 dB is assumed the sound of air.
            Defaults to 20.

        do_sound_norm (bool):
            Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.

        log_func (str):
            Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.

        do_trim_silence (bool):
            Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.

        do_amp_to_db_linear (bool, optional):
            enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.

        do_amp_to_db_mel (bool, optional):
            enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.

        trim_db (int):
            Silence threshold used for silence trimming. Defaults to 45.

        power (float):
            Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
            artifacts in the synthesized voice. Defaults to 1.5.

        griffin_lim_iters (int):
            Number of Griffin-Lim iterations. Defaults to 60.

        num_mels (int):
            Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.

        mel_fmin (float):
            Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
            It needs to be adjusted for a dataset. Defaults to 0.

        mel_fmax (float):
            Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.

        spec_gain (int):
            Gain applied when converting amplitude to DB. Defaults to 20.

        signal_norm (bool):
            enable/disable signal normalization. Defaults to True.

        min_level_db (int):
            minimum db threshold for the computed melspectrograms. Defaults to -100.

        symmetric_norm (bool):
            enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
            [0, k], Defaults to True.

        max_norm (float):
            ```k``` defining the normalization range. Defaults to 4.0.

        clip_norm (bool):
            enable/disable clipping the out of range values in the normalized audio signal. Defaults to True.

        stats_path (str):
            Path to the computed stats file. Defaults to None.
    """

    # stft parameters
    fft_size: int = 1024
    win_length: int = 1024
    hop_length: int = 256
    frame_shift_ms: int = None
    frame_length_ms: int = None
    stft_pad_mode: str = "reflect"
    # audio processing parameters
    sample_rate: int = 22050
    resample: bool = False
    preemphasis: float = 0.0
    ref_level_db: int = 20
    do_sound_norm: bool = False
    log_func: str = "np.log10"
    # silence trimming
    do_trim_silence: bool = True
    trim_db: int = 45
    # griffin-lim params
    power: float = 1.5
    griffin_lim_iters: int = 60
    # mel-spec params
    num_mels: int = 80
    mel_fmin: float = 0.0
    mel_fmax: float = None
    spec_gain: int = 20
    do_amp_to_db_linear: bool = True
    do_amp_to_db_mel: bool = True
    # normalization params
    signal_norm: bool = True
    min_level_db: int = -100
    symmetric_norm: bool = True
    max_norm: float = 4.0
    clip_norm: bool = True
    stats_path: str = None

    def check_values(
        self,
    ):
        """Check config fields"""
        c = asdict(self)
        check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
        check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
        check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
        check_argument(
            "frame_length_ms",
            c,
            restricted=True,
            min_val=10,
            max_val=1000,
            alternative="win_length",
        )
        check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
        check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
        check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
        check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
        check_argument("power", c, restricted=True, min_val=1, max_val=5)
        check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)

        # normalization parameters
        check_argument("signal_norm", c, restricted=True)
        check_argument("symmetric_norm", c, restricted=True)
        check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
        check_argument("clip_norm", c, restricted=True)
        check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
        check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
        check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
        check_argument("do_trim_silence", c, restricted=True)
        check_argument("trim_db", c, restricted=True)
+ """ + + name: str = "" + path: str = "" + meta_file_train: str = "" + ununsed_speakers: List[str] = None + meta_file_val: str = "" + meta_file_attn_mask: str = "" + + def check_values( + self, + ): + """Check config fields""" + c = asdict(self) + check_argument("name", c, restricted=True) + check_argument("path", c, restricted=True) + check_argument("meta_file_train", c, restricted=True) + check_argument("meta_file_val", c, restricted=False) + check_argument("meta_file_attn_mask", c, restricted=False) + + +@dataclass +class BaseTrainingConfig(Coqpit): + """Base config to define the basic training parameters that are shared + among all the models. + + Args: + model (str): + Name of the model that is used in the training. + + run_name (str): + Name of the experiment. This prefixes the output folder name. Defaults to `coqui_tts`. + + run_description (str): + Short description of the experiment. + + epochs (int): + Number training epochs. Defaults to 10000. + + batch_size (int): + Training batch size. + + eval_batch_size (int): + Validation batch size. + + mixed_precision (bool): + Enable / Disable mixed precision training. It reduces the VRAM use and allows larger batch sizes, however + it may also cause numerical unstability in some cases. + + scheduler_after_epoch (bool): + If true, run the scheduler step after each epoch else run it after each model step. + + run_eval (bool): + Enable / Disable evaluation (validation) run. Defaults to True. + + test_delay_epochs (int): + Number of epochs before starting to use evaluation runs. Initially, models do not generate meaningful + results, hence waiting for a couple of epochs might save some time. + + print_eval (bool): + Enable / Disable console logging for evalutaion steps. If disabled then it only shows the final values at + the end of the evaluation. Default to ```False```. + + print_step (int): + Number of steps required to print the next training log. 
+
+        dashboard_logger (str): "tensorboard" or "wandb"
+            Set the experiment tracking tool
+
+        plot_step (int):
+            Number of steps required to log training on Tensorboard.
+
+        model_param_stats (bool):
+            Enable / Disable logging internal model stats for model diagnostic. It might be useful for model debugging.
+            Defaults to ```False```.
+
+        project_name (str):
+            Name of the project. Defaults to config.model
+
+        wandb_entity (str):
+            Name of W&B entity/team. Enables collaboration across a team or org.
+
+        log_model_step (int):
+            Number of steps required to log a checkpoint as W&B artifact
+
+        save_step (int):
+            Number of steps required to save the next checkpoint.
+
+        checkpoint (bool):
+            Enable / Disable checkpointing.
+
+        keep_all_best (bool):
+            Enable / Disable keeping all the saved best models instead of overwriting the previous one. Defaults
+            to ```False```.
+
+        keep_after (int):
+            Number of steps to wait before saving all the best models. In use if ```keep_all_best == True```. Defaults
+            to 10000.
+
+        num_loader_workers (int):
+            Number of workers for training time dataloader.
+
+        num_eval_loader_workers (int):
+            Number of workers for evaluation time dataloader.
+
+        output_path (str):
+            Path for training output folder, either a local file path or other
+            URLs supported by both fsspec and tensorboardX, e.g. GCS (gs://) or
+            S3 (s3://) paths. The nonexistent part of the given path is created
+            automatically. All training artefacts are saved there.
+ """ + + model: str = None + run_name: str = "coqui_tts" + run_description: str = "" + # training params + epochs: int = 10000 + batch_size: int = None + eval_batch_size: int = None + mixed_precision: bool = False + scheduler_after_epoch: bool = False + # eval params + run_eval: bool = True + test_delay_epochs: int = 0 + print_eval: bool = False + # logging + dashboard_logger: str = "tensorboard" + print_step: int = 25 + plot_step: int = 100 + model_param_stats: bool = False + project_name: str = None + log_model_step: int = None + wandb_entity: str = None + # checkpointing + save_step: int = 10000 + checkpoint: bool = True + keep_all_best: bool = False + keep_after: int = 10000 + # dataloading + num_loader_workers: int = 0 + num_eval_loader_workers: int = 0 + use_noise_augment: bool = False + # paths + output_path: str = None + # distributed + distributed_backend: str = "nccl" + distributed_url: str = "tcp://localhost:54321" diff --git a/submit_automated_pipeline.sh b/submit_automated_pipeline.sh new file mode 100644 index 0000000000000000000000000000000000000000..db99399f6c4857922913cb86b32daf8864c9e8a2 --- /dev/null +++ b/submit_automated_pipeline.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +# Simple sbatch submission wrapper for the automated pipeline +# Usage: ./submit_automated_pipeline.sh [OPTIONS] +# Example: ./submit_automated_pipeline.sh --partition a30 --time 72:00:00 + +PROJECT_DIR="/vol/bitbucket/hl3025/cfm_svc" +SCRIPT="${PROJECT_DIR}/automated_pipeline.sh" + +# Default values +PARTITION="a100" +TIME="120:00:00" +GPUS="1" +CPUS="8" +MEM="64G" +JOB_NAME="cfm_full_pipeline" +EMAIL="hl3025@imperial.ac.uk" + +# Parse command-line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --partition) + PARTITION="$2" + shift 2 + ;; + --time) + TIME="$2" + shift 2 + ;; + --gpus) + GPUS="$2" + shift 2 + ;; + --cpus) + CPUS="$2" + shift 2 + ;; + --mem) + MEM="$2" + shift 2 + ;; + --job-name) + JOB_NAME="$2" + shift 2 + ;; + --email) + EMAIL="$2" + shift 2 + ;; + --help|-h) 
+ echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " --partition PARTITION GPU partition (default: a100)" + echo " --time TIME Time limit HH:MM:SS (default: 120:00:00)" + echo " --gpus NUM Number of GPUs (default: 1)" + echo " --cpus NUM CPU cores per task (default: 8)" + echo " --mem MEMORY Memory allocation (default: 64G)" + echo " --job-name NAME Job name (default: cfm_full_pipeline)" + echo " --email EMAIL Email for notifications (default: hl3025@imperial.ac.uk)" + echo " --help Show this help message" + echo "" + echo "Examples:" + echo " $0 # Submit with defaults" + echo " $0 --partition a30 --time 24:00:00 # Submit to a30 partition" + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +# Verify script exists +if [[ ! -f "$SCRIPT" ]]; then + echo "Error: Pipeline script not found at $SCRIPT" + exit 1 +fi + +# Make script executable +chmod +x "$SCRIPT" + +# Display submission details +echo "==================================================" +echo "Submitting CFM Automated Pipeline to SLURM" +echo "==================================================" +echo "Partition: $PARTITION" +echo "Time Limit: $TIME" +echo "GPUs: $GPUS" +echo "CPUs: $CPUS" +echo "Memory: $MEM" +echo "Job Name: $JOB_NAME" +echo "Email: $EMAIL" +echo "Script: $SCRIPT" +echo "==================================================" + +# Submit the job +sbatch \ + --partition="$PARTITION" \ + --time="$TIME" \ + --gres="gpu:${GPUS}" \ + --cpus-per-task="$CPUS" \ + --mem="$MEM" \ + --job-name="$JOB_NAME" \ + --mail-user="$EMAIL" \ + "$SCRIPT" + +echo "" +echo "Job submitted successfully!" 
+echo "Check status with: squeue -u \$USER" +echo "View logs at: $PROJECT_DIR/logs/pipeline_.out" diff --git a/submit_f5_stage1.sh b/submit_f5_stage1.sh new file mode 100755 index 0000000000000000000000000000000000000000..3cd73f9633ce05dff6e48b3857e00f11cd5495e2 --- /dev/null +++ b/submit_f5_stage1.sh @@ -0,0 +1,51 @@ +#!/bin/bash +#SBATCH --job-name=f5svc_s1 +#SBATCH --partition=a100 +#SBATCH --gres=gpu:1 +#SBATCH --time=24:00:00 +#SBATCH --output=logs/f5svc_s1_%j.out +#SBATCH --error=logs/f5svc_s1_%j.err +#SBATCH --mail-type=ALL +#SBATCH --mail-user=hl3025@imperial.ac.uk + +# Navigate to project directory +cd /vol/bitbucket/hl3025/cfm_svc + +# Activate environment +source .venv_linux/bin/activate + +export PIP_CACHE_DIR=/vol/bitbucket/hl3025/pip_cache +export TMPDIR=/vol/bitbucket/hl3025/tmp + +# Prevent BLAS/OpenMP from spawning too many threads per PyTorch dataloader worker +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 +export MKL_NUM_THREADS=1 +export VECLIB_MAXIMUM_THREADS=1 +export NUMEXPR_NUM_THREADS=1 + +# Force Python output to be unbuffered so logs stream instantly +export PYTHONUNBUFFERED=1 + +echo "Starting F5-SVC Stage 1 (singing adaptation)..." +python train_f5_stage1.py \ + --f5tts_ckpt /vol/bitbucket/hl3025/cfm_svc/chkpt_f5svc/model_1200000.safetensors \ + --audio_dir ./data_svc/waves-32k \ + --ppg_dir ./data_svc/whisper \ + --hubert_dir ./data_svc/hubert \ + --f0_dir ./data_svc/pitch \ + --spk_dir ./data_svc/speaker \ + --outdir ./chkpt_f5svc \ + --epochs 50 \ + --batch_size 32 \ + --lr 1e-4 \ + --grad_accum 1 \ + --grad_clip 1.0 \ + --num_workers 8 \ + --save_interval 5 \ + --log_interval 50 \ + --max_frames 800 \ + --ref_frames 280 \ + --lora_rank 16 + +echo "Stage 1 complete." 
diff --git a/submit_f5_stage2.sh b/submit_f5_stage2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..61e4a8c3b083b6857dbcf201f6e8e8751a5a906b
--- /dev/null
+++ b/submit_f5_stage2.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+#SBATCH --job-name=f5svc_s2
+#SBATCH --partition=a100
+#SBATCH --gres=gpu:1
+#SBATCH --time=12:00:00
+#SBATCH --output=logs/f5svc_s2_%j.out
+#SBATCH --error=logs/f5svc_s2_%j.err
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=hl3025@imperial.ac.uk
+
+# Navigate to project directory
+cd /vol/bitbucket/hl3025/cfm_svc
+
+# Activate environment
+source .venv_linux/bin/activate
+
+export PIP_CACHE_DIR=/vol/bitbucket/hl3025/pip_cache
+export TMPDIR=/vol/bitbucket/hl3025/tmp
+
+# Prevent BLAS/OpenMP from spawning too many threads per PyTorch dataloader worker
+export OMP_NUM_THREADS=1
+export OPENBLAS_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+export VECLIB_MAXIMUM_THREADS=1
+export NUMEXPR_NUM_THREADS=1
+
+# Force Python output to be unbuffered so logs stream instantly
+export PYTHONUNBUFFERED=1
+
+# ─── Configuration ─────────────────────────────────────────────────── 
+# Set these to match your target speaker:
+# NOTE(review): submit_f5_stage1.sh launches stage 1 with --epochs 50 and
+# --save_interval 5, so a checkpoint named "stage1_epoch_200.pt" may not
+# exist — confirm the filename against the actual contents of chkpt_f5svc/
+# before submitting.
+STAGE1_CKPT="./chkpt_f5svc/stage1_epoch_200.pt"
+SPEAKER_ID="my_speaker"
+AUDIO_DIR="./data_svc/audio/${SPEAKER_ID}"
+# ───────────────────────────────────────────────────────────────────── 
+
+echo "Starting F5-SVC Stage 2 (speaker fine-tuning: ${SPEAKER_ID})..."
+python train_f5_stage2.py \
+    --stage1_ckpt "${STAGE1_CKPT}" \
+    --audio_dir "${AUDIO_DIR}" \
+    --speaker_id "${SPEAKER_ID}" \
+    --ppg_dir ./data_svc/whisper \
+    --hubert_dir ./data_svc/hubert \
+    --f0_dir ./data_svc/pitch \
+    --spk_dir ./data_svc/speaker \
+    --outdir ./chkpt_f5svc \
+    --epochs 50 \
+    --batch_size 8 \
+    --lr 5e-5 \
+    --num_workers 4 \
+    --log_interval 20 \
+    --max_frames 800 \
+    --ref_frames 280 \
+    --lora_rank 16
+
+echo "Stage 2 complete."
diff --git a/submit_preprocess.sh b/submit_preprocess.sh new file mode 100644 index 0000000000000000000000000000000000000000..c5c1aebc656e9a231821a53235ff9b45a28b8238 --- /dev/null +++ b/submit_preprocess.sh @@ -0,0 +1,23 @@ +#!/bin/bash +#SBATCH --job-name=cfm_distill +#SBATCH --partition=a30 +#SBATCH --gres=gpu:1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=32G +#SBATCH --time=24:00:00 +#SBATCH --output=logs/distill_%j.out +#SBATCH --error=logs/distill_%j.err + +# Load any necessary modules or activate your environment here +# source .venv/bin/activate + +echo "Starting Offline Teacher Distillation..." +python preprocess_teacher.py \ + --teacher_ckpt vits_pretrain/sovits5.0.pretrain.pth \ + --teacher_config configs/base.yaml \ + --codec_target_dir ./data_svc/codec_targets \ + --data_root ./data_svc \ + --out_dir ./data_svc/teacher_codec_targets \ + --log_interval 200 + +echo "Offline distillation complete." diff --git a/submit_train.sh b/submit_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..59351dc02494e79c62d3e87cbed0bc90221d53f9 --- /dev/null +++ b/submit_train.sh @@ -0,0 +1,46 @@ +#!/bin/bash +#SBATCH --job-name=cfm_train +#SBATCH --partition=a100 +#SBATCH --gres=gpu:1 +#SBATCH --time=71:59:00 +#SBATCH --output=logs/train_%j.out +#SBATCH --error=logs/train_%j.err +#SBATCH --mail-type=ALL +#SBATCH --mail-user=hl3025@imperial.ac.uk + +# Navigate to project director +cd /vol/bitbucket/hl3025/cfm_svc + +# Activate environment +source .venv_linux/bin/activate + +export PIP_CACHE_DIR=/vol/bitbucket/hl3025/pip_cache +export TMPDIR=/vol/bitbucket/hl3025/tmp + +# Prevent BLAS/OpenMP from spawning too many threads per PyTorch dataloader worker +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 +export MKL_NUM_THREADS=1 +export VECLIB_MAXIMUM_THREADS=1 +export NUMEXPR_NUM_THREADS=1 + +# Force Python output to be unbuffered so logs stream instantly +export PYTHONUNBUFFERED=1 + +echo "Starting CFM Training with Teacher Distillation..." 
+python train_cfm.py \ + --data_dir ./data_svc/codec_targets \ + --teacher_target_dir ./data_svc/teacher_codec_targets \ + --lambda_teacher 0 \ + --batch_size 64 \ + --lr 1e-5 \ + --adam_eps 1e-6 \ + --loss_baseline 1.0 \ + --num_workers 8 \ + --epochs 250 \ + --log_interval 50 \ + --save_interval 5 \ + --grad_accum 1 \ + --grad_clip 1.0 + +echo "Training complete." diff --git a/svc_data/__init__.py b/svc_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/svc_data/mel_svc_dataset.py b/svc_data/mel_svc_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..04af23dcbe992477ca2b52a995f519e34a63eeec --- /dev/null +++ b/svc_data/mel_svc_dataset.py @@ -0,0 +1,223 @@ +""" +MelSVCDataset: training data for F5-SVC. + +Each sample returns: + target_mel (T_mel, 100) log-mel spectrogram of the full clip + ref_mel (T_mel, 100) first ref_frames unmasked, rest zeroed (F5-TTS style) + ppg (T_mel, 1280) Whisper PPG resampled to mel frame rate + hubert (T_mel, 256) HuBERT resampled to mel frame rate + f0 (T_mel, 1) log-F0 resampled to mel frame rate + spk (256,) speaker d-vector + ref_len int number of reference (non-zero) frames + +Expected directory layout (same as v1): + data_svc/ + audio//.wav + whisper//.ppg.npy + hubert//.vec.npy + pitch//.pit.npy + speaker//.spk.npy +""" + +from __future__ import annotations + +import glob +import os + +import numpy as np +import torch +import torch.nn.functional as F +from torch.utils.data import Dataset +import torchaudio +import torchaudio.transforms as T + +# Mel parameters — must match Vocos (charactr/vocos-mel-24khz) exactly +SAMPLE_RATE = 24_000 +HOP_LENGTH = 256 +N_FFT = 1_024 +WIN_LENGTH = 1_024 +N_MELS = 100 +F_MIN = 0 +F_MAX = None # = sr/2 = 12 kHz (Vocos default) +MEL_FRAME_RATE = SAMPLE_RATE / HOP_LENGTH # 93.75 Hz + + +def _build_mel_transform(sample_rate: int = SAMPLE_RATE) -> T.MelSpectrogram: + return 
T.MelSpectrogram( + sample_rate=sample_rate, + n_fft=N_FFT, + hop_length=HOP_LENGTH, + n_mels=N_MELS, + win_length=WIN_LENGTH, + f_min=F_MIN, + f_max=F_MAX, + power=1.0, # amplitude (matches Vocos) + center=True, # matches Vocos + # No norm, no mel_scale — use PyTorch defaults (htk) to match Vocos + ) + + +def _resample_to(seq: torch.Tensor, target_len: int) -> torch.Tensor: + """(T, D) → (target_len, D) via linear interpolation.""" + if seq.shape[0] == target_len: + return seq + x = seq.unsqueeze(0).transpose(1, 2) + x = F.interpolate(x, size=target_len, mode="linear", align_corners=False) + return x.squeeze(0).transpose(0, 1) + + +class MelSVCDataset(Dataset): + def __init__( + self, + audio_dir: str = "./data_svc/audio", + ppg_dir: str = "./data_svc/whisper", + hubert_dir: str = "./data_svc/hubert", + f0_dir: str = "./data_svc/pitch", + spk_dir: str = "./data_svc/speaker", + packed_dir: str | None = None, # if set, load from packed .pt files (NFS-efficient) + max_frames: int = 800, # ~8.5 sec at 93.75 Hz + ref_frames: int = 280, # ~3 sec reference region + sample_rate: int = SAMPLE_RATE, + strict: bool = True, + ): + self.max_frames = max_frames + self.ref_frames = ref_frames + self.sample_rate = sample_rate + self.packed = packed_dir is not None + + if self.packed: + # One .pt file per sample — single NFS read, mel already computed + pt_files = glob.glob(os.path.join(packed_dir, "**", "*.pt"), recursive=True) + if not pt_files: + raise RuntimeError(f"No .pt files found under {packed_dir}. 
" + f"Run: python prepare/preprocess_pack.py -w -o {packed_dir}") + self.samples: list[dict] = [{"packed": p} for p in pt_files] + print(f"MelSVCDataset (packed): {len(self.samples)} samples from {packed_dir}") + else: + self.mel_tf = _build_mel_transform(sample_rate) + wav_files = ( + glob.glob(os.path.join(audio_dir, "**", "*.wav"), recursive=True) + + glob.glob(os.path.join(audio_dir, "**", "*.flac"), recursive=True) + ) + if not wav_files: + raise RuntimeError(f"No .wav/.flac files found under {audio_dir}") + + self.samples = [] + skipped = 0 + for wav_path in wav_files: + file_id = os.path.splitext(os.path.basename(wav_path))[0] + spk_name = os.path.basename(os.path.dirname(wav_path)) + + ppg_path = os.path.join(ppg_dir, spk_name, f"{file_id}.ppg.npy") + hbt_path = os.path.join(hubert_dir, spk_name, f"{file_id}.vec.npy") + f0_path = os.path.join(f0_dir, spk_name, f"{file_id}.pit.npy") + spk_path = os.path.join(spk_dir, spk_name, f"{file_id}.spk.npy") + + if strict and any(not os.path.isfile(p) for p in [ppg_path, hbt_path, f0_path, spk_path]): + skipped += 1 + continue + + self.samples.append(dict(wav=wav_path, ppg=ppg_path, hubert=hbt_path, + f0=f0_path, spk=spk_path)) + + if not self.samples: + raise RuntimeError(f"No valid samples (skipped={skipped}). 
Check directory layout.") + print(f"MelSVCDataset: {len(self.samples)} samples (skipped={skipped})") + + def __len__(self) -> int: + return len(self.samples) + + def __getitem__(self, idx: int): + s = self.samples[idx] + + if self.packed: + # --- Packed path: single load, mel already computed --- + data = torch.load(s["packed"], weights_only=True) + mel = data["mel"] # (T_mel, N_MELS) + ppg = data["ppg"] # (T_feat, 1280) + hubert = data["hubert"] # (T_feat, 256) + f0 = data["f0"].unsqueeze(-1) # (T_feat, 1) + spk = data["spk"] # (256,) + else: + # --- Unpacked path: load wav + 4 npy files, compute mel --- + wav, sr = torchaudio.load(s["wav"]) + if wav.shape[0] > 1: + wav = wav.mean(dim=0, keepdim=True) + if sr != self.sample_rate: + wav = torchaudio.functional.resample(wav, sr, self.sample_rate) + mel = torch.log(self.mel_tf(wav).clamp(min=1e-5)).squeeze(0).T # (T_mel, N_MELS) + + try: + ppg = torch.tensor(np.load(s["ppg"])).float() + except Exception: + ppg = torch.zeros(mel.shape[0], 1280) + try: + hubert = torch.tensor(np.load(s["hubert"])).float() + except Exception: + hubert = torch.zeros(mel.shape[0], 256) + try: + f0_raw = torch.tensor(np.load(s["f0"])).float() + f0 = torch.where(f0_raw > 0, + torch.log(f0_raw.clamp(min=1.0)), + torch.zeros_like(f0_raw)).unsqueeze(-1) + except Exception: + f0 = torch.zeros(mel.shape[0], 1) + try: + spk = torch.tensor(np.load(s["spk"])).float() + except Exception: + spk = torch.zeros(256) + + # Resample features from their native rate (50 Hz) to mel frame rate + # (93.75 Hz) BEFORE cropping, so crop indices are consistent across + # mel and all features. 
+ t_mel = mel.shape[0] + ppg = _resample_to(ppg, t_mel) + hubert = _resample_to(hubert, t_mel) + f0 = _resample_to(f0, t_mel) + + # Random crop to max_frames (all tensors are now at mel frame rate) + if t_mel > self.max_frames: + start = torch.randint(0, t_mel - self.max_frames, (1,)).item() + mel = mel[start: start + self.max_frames] + ppg = ppg[start: start + self.max_frames] + hubert = hubert[start: start + self.max_frames] + f0 = f0[start: start + self.max_frames] + t_mel = self.max_frames + + # Reference region (F5-TTS inpainting convention) + ref_len = min(self.ref_frames, t_mel) + ref_mel = torch.zeros_like(mel) + ref_mel[:ref_len] = mel[:ref_len] + + return mel, ref_mel, ppg, hubert, f0, spk, ref_len + + +def collate_fn(batch): + mels, ref_mels, ppgs, huberts, f0s, spks, ref_lens = zip(*batch) + + lengths = [m.shape[0] for m in mels] + max_len = max(lengths) + bsz = len(batch) + + mel_padded = torch.zeros(bsz, max_len, N_MELS) + ref_padded = torch.zeros(bsz, max_len, N_MELS) + ppg_padded = torch.zeros(bsz, max_len, ppgs[0].shape[1]) + hbt_padded = torch.zeros(bsz, max_len, huberts[0].shape[1]) + f0_padded = torch.zeros(bsz, max_len, 1) + mask = torch.zeros(bsz, max_len, dtype=torch.bool) + + for i, ln in enumerate(lengths): + mel_padded[i, :ln] = mels[i] + ref_padded[i, :ln] = ref_mels[i] + ppg_padded[i, :ln] = ppgs[i] + hbt_padded[i, :ln] = huberts[i] + f0_padded[i, :ln] = f0s[i] + mask[i, :ln] = True + + return ( + mel_padded, ref_padded, + ppg_padded, hbt_padded, f0_padded, + torch.stack(spks), + mask, + torch.tensor(list(ref_lens), dtype=torch.long), + ) diff --git a/svc_eva.py b/svc_eva.py new file mode 100644 index 0000000000000000000000000000000000000000..905d34e7e432299d2aa3bf9a500b178569bbd96f --- /dev/null +++ b/svc_eva.py @@ -0,0 +1,20 @@ +import os +import numpy as np + +# average -> ave -> eva :haha + +eva_conf = { + './configs/singers/singer0022.npy': 0, + './configs/singers/singer0030.npy': 0, + './configs/singers/singer0047.npy': 0.5, + 
'./configs/singers/singer0051.npy': 0.5, +} + +if __name__ == "__main__": + + eva = np.zeros(256) + for k, v in eva_conf.items(): + assert os.path.isfile(k), k + spk = np.load(k) + eva = eva + spk * v + np.save("eva.spk.npy", eva, allow_pickle=False) diff --git a/svc_export.py b/svc_export.py new file mode 100644 index 0000000000000000000000000000000000000000..13dea0c9a8f9aedfe9cfb77d1d1b81fcb5b922bb --- /dev/null +++ b/svc_export.py @@ -0,0 +1,68 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +import torch +import argparse +from omegaconf import OmegaConf + +from vits.models import SynthesizerInfer + + +def load_model(checkpoint_path, model): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + saved_state_dict = checkpoint_dict["model_g"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + new_state_dict[k] = v + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + return model + + +def save_pretrain(checkpoint_path, save_path): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + torch.save({ + 'model_g': checkpoint_dict['model_g'], + 'model_d': checkpoint_dict['model_d'], + }, save_path) + + +def save_model(model, checkpoint_path): + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save({'model_g': state_dict}, checkpoint_path) + + +def main(args): + hp = OmegaConf.load(args.config) + model = SynthesizerInfer( + hp.data.filter_length // 2 + 1, + hp.data.segment_size // hp.data.hop_length, + hp) + + # save_pretrain(args.checkpoint_path, "sovits5.0.pretrain.pth") + load_model(args.checkpoint_path, 
model) + save_model(model, "sovits5.0.pth") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, required=True, + help="yaml file for config. will use hp_str from checkpoint if not given.") + parser.add_argument('-p', '--checkpoint_path', type=str, required=True, + help="path of checkpoint pt file for evaluation") + args = parser.parse_args() + + main(args) diff --git a/svc_inference.py b/svc_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..5b9c9fd552e4ebc76d8f564b4051a20e97701072 --- /dev/null +++ b/svc_inference.py @@ -0,0 +1,194 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +import torch +import argparse +import numpy as np + +from omegaconf import OmegaConf +from scipy.io.wavfile import write +from vits.models import SynthesizerInfer +from pitch import load_csv_pitch + + +def load_svc_model(checkpoint_path, model): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + saved_state_dict = checkpoint_dict["model_g"] + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + print("%s is not in the checkpoint" % k) + new_state_dict[k] = v + model.load_state_dict(new_state_dict) + return model + + +def main(args): + if (args.ppg == None): + args.ppg = "svc_tmp.ppg.npy" + print(f"Auto run : python whisper/inference.py -w {args.wave} -p {args.ppg}") + os.system(f"{sys.executable} whisper/inference.py -w {args.wave} -p {args.ppg}") + + if (args.vec == None): + args.vec = "svc_tmp.vec.npy" + print(f"Auto run : python hubert/inference.py -w {args.wave} -v {args.vec}") + os.system(f"{sys.executable} hubert/inference.py -w {args.wave} -v {args.vec}") + + if (args.pit == None): + args.pit = "svc_tmp.pit.csv" + print(f"Auto run : python pitch/inference.py -w {args.wave} -p {args.pit}") + 
os.system(f"{sys.executable} pitch/inference.py -w {args.wave} -p {args.pit}") + + if torch.cuda.is_available(): + device = torch.device('cuda') + elif torch.backends.mps.is_available(): + device = torch.device('mps') + else: + device = torch.device('cpu') + hp = OmegaConf.load(args.config) + model = SynthesizerInfer( + hp.data.filter_length // 2 + 1, + hp.data.segment_size // hp.data.hop_length, + hp) + load_svc_model(args.model, model) + model.eval() + model.to(device) + + spk = np.load(args.spk) + spk = torch.FloatTensor(spk) + + ppg = np.load(args.ppg) + ppg = np.repeat(ppg, 2, 0) # 320 PPG -> 160 * 2 + ppg = torch.FloatTensor(ppg) + # ppg = torch.zeros_like(ppg) + + vec = np.load(args.vec) + vec = np.repeat(vec, 2, 0) # 320 PPG -> 160 * 2 + vec = torch.FloatTensor(vec) + # vec = torch.zeros_like(vec) + + pit = load_csv_pitch(args.pit) + print("pitch shift: ", args.shift) + if (args.shift == 0): + pass + else: + pit = np.array(pit) + source = pit[pit > 0] + source_ave = source.mean() + source_min = source.min() + source_max = source.max() + print(f"source pitch statics: mean={source_ave:0.1f}, \ + min={source_min:0.1f}, max={source_max:0.1f}") + shift = args.shift + shift = 2 ** (shift / 12) + pit = pit * shift + + pit = torch.FloatTensor(pit) + + len_pit = pit.size()[0] + len_vec = vec.size()[0] + len_ppg = ppg.size()[0] + len_min = min(len_pit, len_vec) + len_min = min(len_min, len_ppg) + pit = pit[:len_min] + vec = vec[:len_min, :] + ppg = ppg[:len_min, :] + + with torch.no_grad(): + + spk = spk.unsqueeze(0).to(device) + source = pit.unsqueeze(0).to(device) + source = model.pitch2source(source) + pitwav = model.source2wav(source) + write("svc_out_pit.wav", hp.data.sampling_rate, pitwav) + + hop_size = hp.data.hop_length + all_frame = len_min + hop_frame = 10 + out_chunk = 2500 # 25 S + out_index = 0 + out_audio = [] + has_audio = False + + while (out_index + out_chunk < all_frame): + has_audio = True + if (out_index == 0): # start frame + cut_s = 0 + 
cut_s_out = 0 + else: + cut_s = out_index - hop_frame + cut_s_out = hop_frame * hop_size + + if (out_index + out_chunk + hop_frame > all_frame): # end frame + cut_e = out_index + out_chunk + cut_e_out = 0 + else: + cut_e = out_index + out_chunk + hop_frame + cut_e_out = -1 * hop_frame * hop_size + + sub_ppg = ppg[cut_s:cut_e, :].unsqueeze(0).to(device) + sub_vec = vec[cut_s:cut_e, :].unsqueeze(0).to(device) + sub_pit = pit[cut_s:cut_e].unsqueeze(0).to(device) + sub_len = torch.LongTensor([cut_e - cut_s]).to(device) + sub_har = source[:, :, cut_s * + hop_size:cut_e * hop_size].to(device) + sub_out = model.inference(sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har) + sub_out = sub_out[0, 0].data.cpu().detach().numpy() + + sub_out = sub_out[cut_s_out:cut_e_out] + out_audio.extend(sub_out) + out_index = out_index + out_chunk + + if (out_index < all_frame): + if (has_audio): + cut_s = out_index - hop_frame + cut_s_out = hop_frame * hop_size + else: + cut_s = 0 + cut_s_out = 0 + sub_ppg = ppg[cut_s:, :].unsqueeze(0).to(device) + sub_vec = vec[cut_s:, :].unsqueeze(0).to(device) + sub_pit = pit[cut_s:].unsqueeze(0).to(device) + sub_len = torch.LongTensor([all_frame - cut_s]).to(device) + sub_har = source[:, :, cut_s * hop_size:].to(device) + sub_out = model.inference(sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har) + sub_out = sub_out[0, 0].data.cpu().detach().numpy() + + sub_out = sub_out[cut_s_out:] + out_audio.extend(sub_out) + out_audio = np.asarray(out_audio) + + write("svc_out.wav", hp.data.sampling_rate, out_audio) + + if (args.clean): + os.remove("svc_out_pit.wav") + os.remove("svc_tmp.pit.csv") + os.remove("svc_tmp.ppg.npy") + os.remove("svc_tmp.vec.npy") + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--config', type=str, required=True, + help="yaml file for config.") + parser.add_argument('--model', type=str, required=True, + help="path of model for evaluation") + parser.add_argument('--wave', type=str, required=True, + 
help="Path of raw audio.") + parser.add_argument('--spk', type=str, required=True, + help="Path of speaker.") + parser.add_argument('--ppg', type=str, + help="Path of content vector.") + parser.add_argument('--vec', type=str, + help="Path of hubert vector.") + parser.add_argument('--pit', type=str, + help="Path of pitch csv file.") + parser.add_argument('--shift', type=int, default=0, + help="Pitch shift key.") + parser.add_argument('--clean', action='store_true', + help="Clean intermediate files.") + args = parser.parse_args() + + main(args) diff --git a/svc_preprocessing.py b/svc_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..c6cf773e9b6071741478883dda3ab4d44da2cbd3 --- /dev/null +++ b/svc_preprocessing.py @@ -0,0 +1,76 @@ +import argparse +import os +import shlex +import subprocess +import sys + + +REQUIRED_PRETRAINED = { + "whisper_pretrain/large-v2.pt": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt", + "hubert_pretrain/hubert-soft-0d54a1f4.pt": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt", + "speaker_pretrain/best_model.pth.tar": "https://drive.google.com/uc?id=1UPjQ2LVSIt3o-9QMKMJcdzT8aZRZCI-E", + "speaker_pretrain/config.json": "https://raw.githubusercontent.com/PlayVoice/so-vits-svc-5.0/9d415f9d7c7c7a131b89ec6ff633be10739f41ed/speaker_pretrain/config.json", + "crepe/assets/full.pth": "https://github.com/maxrmorrison/torchcrepe/raw/master/torchcrepe/assets/full.pth", +} + + +def count_wavs(root): + wav_count = 0 + for _, _, files in os.walk(root): + wav_count += sum(1 for name in files if name.lower().endswith(".wav")) + return wav_count + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-t", type=int, default=0, help="thread count for preprocess_crepe") + args = parser.parse_args() + + dataset_root = "./dataset_raw" + if not os.path.isdir(dataset_root): + print(f"Error: 
dataset directory not found: {dataset_root}") + return 1 + + wav_count = count_wavs(dataset_root) + if wav_count == 0: + print(f"Error: no .wav files found under {dataset_root}") + return 1 + + print(f"Found {wav_count} wav files under {dataset_root}") + + missing = [(path, url) for path, url in REQUIRED_PRETRAINED.items() if not os.path.isfile(path)] + if missing: + print("Error: missing required pretrained files for preprocessing:") + for path, url in missing: + print(f" - {path}") + print(f" download: {url}") + return 1 + + commands = [ + ("Resample 16k", [sys.executable, "prepare/preprocess_a.py", "-w", "./dataset_raw", "-o", "./data_svc/waves-16k", "-s", "16000", "-t", "0"]), + ("Resample 32k", [sys.executable, "prepare/preprocess_a.py", "-w", "./dataset_raw", "-o", "./data_svc/waves-32k", "-s", "32000", "-t", "0"]), + ("Extract pitch (CREPE)", [sys.executable, "prepare/preprocess_crepe.py", "-w", "data_svc/waves-16k/", "-p", "data_svc/pitch", "-t", str(args.t)]), + ("Extract PPG (Whisper)", [sys.executable, "prepare/preprocess_ppg.py", "-w", "data_svc/waves-16k/", "-p", "data_svc/whisper"]), + ("Extract HuBERT vectors", [sys.executable, "prepare/preprocess_hubert.py", "-w", "data_svc/waves-16k/", "-v", "data_svc/hubert", "-t", "1"]), + ("Extract speaker embeddings", [sys.executable, "prepare/preprocess_speaker.py", "data_svc/waves-16k/", "data_svc/speaker", "-t", "0"]), + ("Average speaker embeddings", [sys.executable, "prepare/preprocess_speaker_ave.py", "data_svc/speaker/", "data_svc/singer", "-t", "0"]), + ("Extract spectrograms", [sys.executable, "prepare/preprocess_spec.py", "-w", "data_svc/waves-32k/", "-s", "data_svc/specs", "-t", "0"]), + ("Build train/valid index", [sys.executable, "prepare/preprocess_train.py"]), + ("Validate dataset", [sys.executable, "prepare/preprocess_zzz.py"]), + ] + + total = len(commands) + for i, (step_name, command) in enumerate(commands, start=1): + print(f"\n[{i}/{total}] {step_name}") + print(f"Command: 
{shlex.join(command)}") + result = subprocess.run(command) + if result.returncode != 0: + print(f"Error: step failed with exit code {result.returncode}: {step_name}") + return result.returncode + + print("\nAll preprocessing steps completed successfully.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/svc_trainer.py b/svc_trainer.py new file mode 100755 index 0000000000000000000000000000000000000000..f77cc7fde79198d9167944fff0a591356c116345 --- /dev/null +++ b/svc_trainer.py @@ -0,0 +1,44 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +import argparse +import torch +import torch.multiprocessing as mp +from omegaconf import OmegaConf + +from vits_extend.train import train + +torch.backends.cudnn.benchmark = True + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, required=True, + help="yaml file for configuration") + parser.add_argument('-p', '--checkpoint_path', type=str, default=None, + help="path of checkpoint pt file to resume training") + parser.add_argument('-n', '--name', type=str, required=True, + help="name of the model for logging, saving checkpoint") + args = parser.parse_args() + + hp = OmegaConf.load(args.config) + with open(args.config, 'r') as f: + hp_str = ''.join(f.readlines()) + + assert hp.data.hop_length == 320, \ + 'hp.data.hop_length must be equal to 320, got %d' % hp.data.hop_length + + args.num_gpus = 0 + torch.manual_seed(hp.train.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(hp.train.seed) + args.num_gpus = torch.cuda.device_count() + print('Batch size per GPU :', hp.train.batch_size) + elif torch.backends.mps.is_available(): + args.num_gpus = 1 + print('Using MPS for training! 
Batch size per GPU :', hp.train.batch_size) + + if args.num_gpus > 1: + mp.spawn(train, nprocs=args.num_gpus, + args=(args, args.checkpoint_path, hp, hp_str,)) + else: + train(0, args, args.checkpoint_path, hp, hp_str) diff --git a/test_codec_robotic.py b/test_codec_robotic.py new file mode 100644 index 0000000000000000000000000000000000000000..78d6493f102717df8a5247fc1a38990a7e788ad4 --- /dev/null +++ b/test_codec_robotic.py @@ -0,0 +1,20 @@ +import dac +import torch +import soundfile as sf +from torchaudio.functional import resample +import numpy as np + +codec = dac.utils.load_model(tag='latest', model_type='44khz').eval().cuda() +wav_tensor, sr = sf.read('data_svc_infer/waves-16k/speaker0/poem.wav') +wav_tensor = torch.from_numpy(wav_tensor[0:32000]).unsqueeze(0).unsqueeze(0).float().cuda() +wav_tensor = resample(wav_tensor, sr, 44100) + +z, codes, latents, _, _ = codec.encode(wav_tensor) +z_q, _, _, _, _ = codec.quantizer(z, codec.sample_rate) + +out_z = codec.decode(z).squeeze().cpu().detach().numpy() +out_zq = codec.decode(z_q).squeeze().cpu().detach().numpy() + +diff = np.abs(out_z - out_zq).mean() +print('Mean diff between decoded z and z_q:', diff) +print('Max out_z:', np.abs(out_z).max(), 'Max out_zq:', np.abs(out_zq).max()) diff --git a/test_overfit.py b/test_overfit.py new file mode 100644 index 0000000000000000000000000000000000000000..857d523e08bbba8c2843da486172d409dbc5d141 --- /dev/null +++ b/test_overfit.py @@ -0,0 +1,105 @@ +import torch +import torch.nn.functional as F +import numpy as np +import soundfile as sf +import os +from tqdm import tqdm + +from models.cfm import DiT +from samplers.ode import ODESampler +from models.cond_encoder import CondEncoder +from models.codec_wrapper import CodecWrapper +from losses.cfm_loss import CFMLoss + +device = 'cuda' + +# 1. 
Initialize fresh models + Positional Encoding +dit = DiT(in_channels=1024, cond_dim=1024, hidden_dim=512, depth=8).to(device) +cond_enc = CondEncoder(ppg_dim=1280, hubert_dim=256, f0_dim=1, spk_dim=256).to(device) +codec_wrapper = CodecWrapper(backend='dac', latent_dim=1024).to(device) + +# Load existing latent normalizations +norm = torch.load('chkpt_cfm/latent_norm.pt', map_location=device, weights_only=True) +z_mean = norm['mean'] +z_std = norm['std'] + +# 2. Get a single sample +file_id = "16_你不知道的事_0" +spk_dir = "singer_0005" + +z_target = torch.load(f"data_svc/codec_targets/{spk_dir}/{file_id}_ztarget.pt", weights_only=True) +z_target_norm = (z_target.squeeze(0).transpose(0, 1).float().to(device) - z_mean) / z_std + +ppg = torch.tensor(np.load(f"data_svc/whisper/{spk_dir}/{file_id}.ppg.npy")).float().unsqueeze(0).to(device) +hubert = torch.tensor(np.load(f"data_svc/hubert/{spk_dir}/{file_id}.vec.npy")).float().unsqueeze(0).to(device) +f0 = torch.tensor(np.load(f"data_svc/pitch/{spk_dir}/{file_id}.pit.npy")).float().unsqueeze(-1).unsqueeze(0).to(device) +spk = torch.tensor(np.load(f"data_svc/speaker/{spk_dir}/{file_id}.spk.npy")).float().unsqueeze(0).to(device) + +t_len = z_target_norm.shape[0] + +def _resample_to(seq, target_len): + x = seq.transpose(1, 2) + x = F.interpolate(x, size=target_len, mode="linear", align_corners=False) + return x.transpose(1, 2) + +ppg = _resample_to(ppg, t_len) +hubert = _resample_to(hubert, t_len) +f0 = _resample_to(f0, t_len) + +# Truncate to first 3 seconds (approx 258 frames) to speed up overfitting tremendously +max_t = min(258, t_len) +z_target_norm = z_target_norm[:max_t].unsqueeze(0) +ppg = ppg[:, :max_t, :] +hubert = hubert[:, :max_t, :] +f0 = f0[:, :max_t, :] + +t_len = max_t + +# 3. 
Setup Optimizer +optimizer = torch.optim.AdamW(list(dit.parameters()) + list(cond_enc.parameters()) + list(codec_wrapper.projector.parameters()), lr=1e-4) +criterion = CFMLoss(lambda_proj=1.0).to(device) + +dit.train() +cond_enc.train() +codec_wrapper.projector.train() + +print("Overfitting on a single 3-second segment for 1000 iterations...") +from tqdm import trange + +for step in trange(1000): + optimizer.zero_grad() + c = cond_enc(ppg, hubert, f0, spk, target_seq_len=t_len) + + loss, flow_loss, proj_loss = criterion(z_target_norm, c, dit, codec_wrapper.projector) + + loss.backward() + optimizer.step() + + if (step + 1) % 100 == 0: + print(f"Step {step+1}: Total Loss={loss.item():.4f}, Flow={flow_loss.item():.4f}, Proj={proj_loss.item():.4f}") + +# 4. Generate +print("Generating from overfitted model...") +dit.eval() +cond_enc.eval() +codec_wrapper.eval() + +with torch.no_grad(): + c = cond_enc(ppg, hubert, f0, spk, target_seq_len=t_len) + sampler = ODESampler(dit, steps=64, solver='rk4') + z_noise = torch.randn(1, t_len, 1024).to(device) + u_hat = sampler.sample(z_noise, c) + + u_hat_transposed = u_hat.transpose(1, 2) + z_hat_norm = codec_wrapper.forward_project(u_hat_transposed) + + z_hat_norm_transposed = z_hat_norm.transpose(1, 2) + z_hat_denorm = (z_hat_norm_transposed * z_std) + z_mean + z_hat = z_hat_denorm.transpose(1, 2) + + import warnings + warnings.filterwarnings("ignore") + wav_out = codec_wrapper.codec.decode(z_hat).cpu().squeeze().detach().numpy() + +sf.write('test_overfit_pe.wav', wav_out, 44100) +print("Saved purely memorized segment to test_overfit_pe.wav") diff --git a/test_stats.py b/test_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..2f8c910fdd13df4a3b4711fdbd65927812fbfd82 --- /dev/null +++ b/test_stats.py @@ -0,0 +1,23 @@ +import torch +import numpy as np + +# Load ground truth z +spk_dir = "singer_0005" +file_id = "16_你不知道的事_0" +z_gt = torch.load(f"data_svc/codec_targets/{spk_dir}/{file_id}_ztarget.pt", 
weights_only=True) +print("GT z shape:", z_gt.shape) +print("GT z mean:", z_gt.mean().item(), "std:", z_gt.std().item()) +print("GT z min:", z_gt.min().item(), "max:", z_gt.max().item()) + +# Load norm +norm = torch.load("chkpt/latent_norm.pt", map_location="cpu", weights_only=True) +z_mean = norm["mean"] +z_std = norm["std"] +print("\nGlobal norm mean:", z_mean.mean().item()) +print("Global norm std:", z_std.mean().item()) + +# Normalize GT +z_gt_norm = (z_gt.squeeze(0).transpose(0,1) - z_mean) / z_std +print("\nNormed GT z mean:", z_gt_norm.mean().item(), "std:", z_gt_norm.std().item()) +print("Normed GT z min:", z_gt_norm.min().item(), "max:", z_gt_norm.max().item()) + diff --git a/test_teacher_codec.py b/test_teacher_codec.py new file mode 100644 index 0000000000000000000000000000000000000000..1620e8d83e78608d7d3f4cbce44a0c9db7b64e67 --- /dev/null +++ b/test_teacher_codec.py @@ -0,0 +1,37 @@ +import dac +import torch +import soundfile as sf +import numpy as np +import os +import glob +import argparse + +def decode_teacher_targets(args): + device = "cuda" if torch.cuda.is_available() else "cpu" + codec = dac.utils.load_model(tag='latest', model_type='44khz').eval().to(device) + + files = glob.glob(os.path.join(args.dir, "**", "*.pt"), recursive=True) + os.makedirs(args.out_dir, exist_ok=True) + + count = 0 + for f in files[:args.num]: + z = torch.load(f).to(device) # Shape: (1, 1024, T) + + with torch.no_grad(): + out_wav = codec.decode(z).squeeze().cpu().numpy() + + base_name = os.path.basename(f).replace('.pt', '.wav') + out_path = os.path.join(args.out_dir, base_name) + sf.write(out_path, out_wav, 44100) + print(f"Decoded {f} -> {out_path}") + count += 1 + + print(f"Decoded {count} files.") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--dir", type=str, default="./data_svc/teacher_codec_targets") + parser.add_argument("--out_dir", type=str, default="./test_teacher_audio") + parser.add_argument("--num", type=int, 
default=10) + args = parser.parse_args() + decode_teacher_targets(args) diff --git a/test_train_exact.py b/test_train_exact.py new file mode 100644 index 0000000000000000000000000000000000000000..7994e5e0ce4cf4a3aa99b15806f5361c92c46aaa --- /dev/null +++ b/test_train_exact.py @@ -0,0 +1,73 @@ +import torch +import numpy as np +import soundfile as sf +import dac + +from models.cfm import DiT +from samplers.ode import ODESampler +from models.cond_encoder import CondEncoder +from models.codec_wrapper import CodecWrapper + +device = 'cuda' + +dit = DiT(in_channels=1024, cond_dim=1024, hidden_dim=512, depth=8).to(device) +cond_enc = CondEncoder(ppg_dim=1280, hubert_dim=256, f0_dim=1, spk_dim=256).to(device) +codec_wrapper = CodecWrapper(backend='dac', latent_dim=1024).to(device) + +dit.load_state_dict(torch.load('chkpt_cfm/dit_epoch_200.pt', map_location=device, weights_only=True)) +cond_enc.load_state_dict(torch.load('chkpt_cfm/cond_encoder_epoch_200.pt', map_location=device, weights_only=True)) +codec_wrapper.projector.load_state_dict(torch.load('chkpt_cfm/projector_epoch_200.pt', map_location=device, weights_only=True)) + +dit.eval() +cond_enc.eval() +codec_wrapper.eval() + +norm = torch.load('chkpt_cfm/latent_norm.pt', map_location=device, weights_only=True) +z_mean = norm['mean'] +z_std = norm['std'] + +# Load precisely from training folder +file_id = "16_你不知道的事_0" +spk_dir = "singer_0005" + +z_target = torch.load(f"data_svc/codec_targets/{spk_dir}/{file_id}_ztarget.pt", weights_only=True) +z_target_norm = (z_target.squeeze(0).transpose(0, 1).float().to(device) - z_mean) / z_std + +ppg = torch.tensor(np.load(f"data_svc/whisper/{spk_dir}/{file_id}.ppg.npy")).float().unsqueeze(0).to(device) +hubert = torch.tensor(np.load(f"data_svc/hubert/{spk_dir}/{file_id}.vec.npy")).float().unsqueeze(0).to(device) +f0 = torch.tensor(np.load(f"data_svc/pitch/{spk_dir}/{file_id}.pit.npy")).float().unsqueeze(-1).unsqueeze(0).to(device) +spk = 
torch.tensor(np.load(f"data_svc/speaker/{spk_dir}/{file_id}.spk.npy")).float().unsqueeze(0).to(device) + +# Exactly like train_cfm.py interpolation +import torch.nn.functional as F +t_len = z_target_norm.shape[0] + +def _resample_to(seq, target_len): + x = seq.transpose(1, 2) + x = F.interpolate(x, size=target_len, mode="linear", align_corners=False) + return x.transpose(1, 2) + +ppg = _resample_to(ppg, t_len) +hubert = _resample_to(hubert, t_len) +f0 = _resample_to(f0, t_len) + +c = cond_enc(ppg, hubert, f0, spk, target_seq_len=t_len) + +sampler = ODESampler(dit, steps=64, solver='rk4') +z_noise = torch.randn(1, t_len, 1024).to(device) * 0.7 + +u_hat = sampler.sample(z_noise, c) + +u_hat_transposed = u_hat.transpose(1, 2) +z_hat_norm = codec_wrapper.forward_project(u_hat_transposed) + +z_hat_norm_transposed = z_hat_norm.transpose(1, 2) +z_hat_denorm = (z_hat_norm_transposed * z_std) + z_mean +z_hat = z_hat_denorm.transpose(1, 2) + +import warnings +warnings.filterwarnings("ignore") +wav_out = codec_wrapper.codec.decode(z_hat).cpu().squeeze().detach().numpy() + +sf.write('test_train_exact.wav', wav_out, 44100) +print("Done reconstructing training point.") diff --git a/test_train_gt.py b/test_train_gt.py new file mode 100644 index 0000000000000000000000000000000000000000..559011f36eab507b61b7ebd522bd168ef1140d74 --- /dev/null +++ b/test_train_gt.py @@ -0,0 +1,15 @@ +import torch +import soundfile as sf +import dac +from models.codec_wrapper import CodecWrapper + +device = 'cuda' +codec_wrapper = CodecWrapper(backend='dac', latent_dim=1024).to(device) + +file_id = "16_你不知道的事_0" +spk_dir = "singer_0005" +z_target = torch.load(f"data_svc/codec_targets/{spk_dir}/{file_id}_ztarget.pt", weights_only=True).to(device) + +wav_gt = codec_wrapper.codec.decode(z_target).cpu().squeeze().detach().numpy() +sf.write('test_train_gt.wav', wav_gt, 44100) +print("Done writing GT.") diff --git a/train_cfm.py b/train_cfm.py new file mode 100644 index 
0000000000000000000000000000000000000000..51908d2dc170db066d372e51cff70beed9f2d661 --- /dev/null +++ b/train_cfm.py @@ -0,0 +1,592 @@ +import argparse +import copy +import glob +import math +import os + +import torch +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import DataLoader, Dataset + +from losses.cfm_loss import CFMLoss +from models.cfm import DiT +from models.codec_wrapper import CodecWrapper +from models.cond_encoder import CondEncoder + + +class EMA: + """Exponential moving average of model parameters.""" + + def __init__(self, model, decay=0.9999): + self.decay = decay + self.shadow = copy.deepcopy(model) + self.shadow.eval() + for p in self.shadow.parameters(): + p.requires_grad_(False) + + @torch.no_grad() + def update(self, model): + # We handle unwrapping compiled models dynamically if needed + unwrapped_model = model._orig_mod if hasattr(model, "_orig_mod") else model + for s, m in zip(self.shadow.parameters(), unwrapped_model.parameters()): + s.data.mul_(self.decay).add_(m.data, alpha=1 - self.decay) + for s, m in zip(self.shadow.buffers(), unwrapped_model.buffers()): + s.data.copy_(m.data) + + def state_dict(self): + return self.shadow.state_dict() + + def load_state_dict(self, sd): + self.shadow.load_state_dict(sd, strict=False) + + +def _load_codec_latent(path): + # Saved shape is expected as (1, 1024, T) + z = torch.load(path, weights_only=True).squeeze(0).transpose(0, 1).float() # (T, 1024) + return z + + +class RealDACDataset(Dataset): + """ + Strict dataset for CFM-SVC training. + - Uses offline extracted codec targets. + - Requires aligned conditioning assets. + - Optionally requires offline teacher codec targets for distillation. 
+ """ + + def __init__( + self, + data_dir="./data_svc/codec_targets", + teacher_target_dir=None, + strict=True, + max_target_len=500, + freeze_norm=False, + ): + self.data_dir = data_dir + self.teacher_target_dir = teacher_target_dir + self.strict = strict + self.max_target_len = max_target_len + self.freeze_norm = freeze_norm + + files = glob.glob(os.path.join(data_dir, "**", "*.pt"), recursive=True) + if not files: + raise RuntimeError(f"No codec targets found under {data_dir}") + + self.samples = [] + skipped = 0 + for z_path in files: + file_id = os.path.basename(z_path).replace(".pt", "").replace("_ztarget", "") + parent_dir = os.path.basename(os.path.dirname(z_path)) + speaker_dir = parent_dir if parent_dir != "codec_targets" else "speaker0" + + ppg_path = f"./data_svc/whisper/{speaker_dir}/{file_id}.ppg.npy" + hubert_path = f"./data_svc/hubert/{speaker_dir}/{file_id}.vec.npy" + f0_path = f"./data_svc/pitch/{speaker_dir}/{file_id}.pit.npy" + spk_path = f"./data_svc/speaker/{speaker_dir}/{file_id}.spk.npy" + + required = [ppg_path, hubert_path, f0_path, spk_path] + missing = [p for p in required if not os.path.isfile(p)] + if missing: + if self.strict: + skipped += 1 + continue + else: + pass + + teacher_path = None + if teacher_target_dir: + rel = os.path.relpath(z_path, data_dir) + teacher_path = os.path.join(teacher_target_dir, rel) + if self.strict and not os.path.isfile(teacher_path): + skipped += 1 + continue + + self.samples.append( + { + "z_path": z_path, + "ppg_path": ppg_path, + "hubert_path": hubert_path, + "f0_path": f0_path, + "spk_path": spk_path, + "teacher_path": teacher_path, + } + ) + + if not self.samples: + raise RuntimeError( + f"No valid training samples after filtering. 
" + f"Skipped={skipped}, data_dir={data_dir}, teacher_target_dir={teacher_target_dir}" + ) + + print(f"Dataset samples: {len(self.samples)} (skipped: {skipped})") + + import hashlib + manifest_str = "".join(sorted([s["z_path"] for s in self.samples])) + manifest_hash = hashlib.md5(manifest_str.encode()).hexdigest() + + # Check for cached statistics to skip 43k file scan on restarts + cached_stats_path = "chkpt_cfm/latent_norm.pt" + if os.path.isfile(cached_stats_path): + print(f"Found cached latent statistics at {cached_stats_path}. Loading instantly...") + stats = torch.load(cached_stats_path, map_location="cpu", weights_only=True) + if self.freeze_norm or stats.get("manifest_hash") == manifest_hash: + if self.freeze_norm and stats.get("manifest_hash") != manifest_hash: + print("Bypassing dataset manifest hash check due to --freeze_norm. Using exact base model statistics!") + self.mean = stats["mean"] + self.std = stats["std"] + else: + print("Cached statistics mismatch (dataset manifest changed). Recomputing...") + self.mean = None + else: + self.mean = None + + if getattr(self, "mean", None) is None: + # Latent normalization stats from student targets only. 
+ print("Calculating latent statistics for normalization (First Run)...") + sum_z = torch.zeros(1024, dtype=torch.float64) + sum_sq_z = torch.zeros(1024, dtype=torch.float64) + total_frames = 0 + for i, s in enumerate(self.samples): + try: + z = _load_codec_latent(s["z_path"]) + if z.shape[0] < 5000: + sum_z += z.sum(dim=0).double() + sum_sq_z += (z.double() ** 2).sum(dim=0) + total_frames += z.shape[0] + except Exception: + continue + + if (i + 1) % 5000 == 0: + print(f"Processed {i + 1}/{len(self.samples)} files for normalization stat computation...") + + if total_frames == 0: + raise RuntimeError("Failed to compute latent normalization statistics.") + + self.mean = (sum_z / total_frames).float() + var = (sum_sq_z / total_frames) - (sum_z / total_frames)**2 + self.std = torch.sqrt(var.clamp(min=1e-5)).float() + os.makedirs("chkpt_cfm", exist_ok=True) + print("Caching normalization statistics to chkpt_cfm/latent_norm.pt for future runs...") + torch.save({"mean": self.mean, "std": self.std, "manifest_hash": manifest_hash}, cached_stats_path) + + def __len__(self): + return len(self.samples) + + def _normalize(self, z): + return (z - self.mean) / self.std + + def _resample_to(self, seq, target_len): + if seq.shape[0] == target_len: + return seq + x = seq.unsqueeze(0).transpose(1, 2) + x = F.interpolate(x, size=target_len, mode="linear", align_corners=False) + return x.squeeze(0).transpose(0, 1) + + def __getitem__(self, idx): + import numpy as np + s = self.samples[idx] + + z_target = _load_codec_latent(s["z_path"]) + z_target = self._normalize(z_target) + if z_target.shape[0] > self.max_target_len: + z_target = z_target[: self.max_target_len, :] + t_len = z_target.shape[0] + + try: + ppg = torch.tensor(np.load(s["ppg_path"])).float() + except Exception: + ppg = torch.randn(max(1, t_len), 1280) + + try: + hubert = torch.tensor(np.load(s["hubert_path"])).float() + except Exception: + hubert = torch.randn(max(1, t_len), 256) + + try: + f0_raw = 
torch.tensor(np.load(s["f0_path"])).float() + f0 = torch.where(f0_raw > 0, torch.log(f0_raw.clamp(min=1.0)), torch.zeros_like(f0_raw)).unsqueeze(-1) + except Exception: + f0 = torch.zeros(max(1, t_len), 1) + + try: + spk = torch.tensor(np.load(s["spk_path"])).float() + except Exception: + spk = torch.randn(256) + + ppg = self._resample_to(ppg, t_len) + hubert = self._resample_to(hubert, t_len) + f0 = self._resample_to(f0, t_len) + + teacher_target = None + if s["teacher_path"] and os.path.isfile(s["teacher_path"]): + teacher_target = _load_codec_latent(s["teacher_path"]) + teacher_target = self._normalize(teacher_target) + teacher_target = self._resample_to(teacher_target, t_len) + if teacher_target.shape[0] > self.max_target_len: + teacher_target = teacher_target[: self.max_target_len, :] + z_target = z_target[: teacher_target.shape[0], :] + ppg = ppg[: teacher_target.shape[0], :] + hubert = hubert[: teacher_target.shape[0], :] + f0 = f0[: teacher_target.shape[0], :] + + return z_target, ppg, hubert, f0, spk, teacher_target + + +def collate_fn(batch): + z_targets, ppgs, huberts, f0s, spks, teacher_targets = zip(*batch) + + lengths = [z.shape[0] for z in z_targets] + max_len = max(lengths) + bsz = len(batch) + + z_padded = torch.zeros(bsz, max_len, 1024) + ppg_padded = torch.zeros(bsz, max_len, 1280) + hubert_padded = torch.zeros(bsz, max_len, 256) + f0_padded = torch.zeros(bsz, max_len, 1) + mask = torch.zeros(bsz, max_len, dtype=torch.float32) + + any_teacher = any(t is not None for t in teacher_targets) + teacher_padded = torch.zeros(bsz, max_len, 1024) if any_teacher else None + teacher_mask = torch.zeros(bsz, max_len, dtype=torch.float32) if any_teacher else None + + for i in range(bsz): + seq_len = lengths[i] + z_padded[i, :seq_len, :] = z_targets[i] + ppg_padded[i, :seq_len, :] = ppgs[i][:seq_len] + hubert_padded[i, :seq_len, :] = huberts[i][:seq_len] + f0_padded[i, :seq_len, :] = f0s[i][:seq_len] + mask[i, :seq_len] = 1.0 + if any_teacher and 
teacher_targets[i] is not None: + teacher_padded[i, :seq_len, :] = teacher_targets[i][:seq_len] + teacher_mask[i, :seq_len] = 1.0 + + return z_padded, ppg_padded, hubert_padded, f0_padded, torch.stack(spks), mask, teacher_padded, teacher_mask + + +def _masked_mse(pred, target, mask): + d = pred.shape[-1] + mask3 = mask.unsqueeze(-1) + valid = mask3.sum().clamp(min=1.0) + return (((pred - target) ** 2) * mask3).sum() / (valid * d) + + +def _distill_loss_from_teacher(z_target, c, mask, dit, projector, z_teacher_target, teacher_mask=None): + b, t, d = z_target.shape + z_noise = torch.randn_like(z_target) + tau = torch.sigmoid(torch.randn((b, 1), device=z_target.device)) # logit-normal + tau_expand = tau.unsqueeze(-1).expand(-1, t, d) + z_t = (1 - tau_expand) * z_noise + tau_expand * z_target + v_pred = dit(z_t, tau, c, mask=mask) + z1_pred = z_t + (1 - tau_expand) * v_pred + z_hat = projector(z1_pred.detach().transpose(1, 2)).transpose(1, 2) + effective_mask = mask + if teacher_mask is not None: + effective_mask = mask * teacher_mask + return _masked_mse(z_hat, z_teacher_target, effective_mask) + + +def train(args): + if torch.cuda.is_available(): + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + + print(f"Initializing Dataset on {device}... 
Searching for target PT files (this may take a few minutes)...", flush=True) + dataset = RealDACDataset( + data_dir=args.data_dir, + teacher_target_dir=args.teacher_target_dir, + strict=not args.allow_missing_features, + max_target_len=args.max_target_len, + freeze_norm=args.freeze_norm, + ) + dataloader = DataLoader( + dataset, + batch_size=args.batch_size, + shuffle=True, + collate_fn=collate_fn, + num_workers=args.num_workers, + pin_memory=(device.type == "cuda"), + persistent_workers=(args.num_workers > 0), + prefetch_factor=2 if args.num_workers > 0 else None, + ) + + codec_wrapper = CodecWrapper(latent_dim=1024).to(device) + cond_enc = CondEncoder( + ppg_dim=1280, hubert_dim=256, f0_dim=1, spk_dim=256, cond_out_dim=1024 + ).to(device) + dit = DiT( + in_channels=1024, cond_dim=1024, hidden_dim=512, depth=8, use_checkpointing=args.use_checkpointing + ).to(device) + + # Initialize EMA *before* torch.compile, because copy.deepcopy fails heavily on Dynamo wrappers! + ema_dit = EMA(dit, decay=args.ema_decay) + ema_cond_enc = EMA(cond_enc, decay=args.ema_decay) + ema_projector = EMA(codec_wrapper.projector, decay=args.ema_decay) + + if device.type == "cuda": + print("Compiling models with torch.compile for faster training...") + # REMOVED mode="reduce-overhead" because your batch outputs dynamic seq_lens (max_target_len pad trick) + # reduce-overhead runs CUDA Graph tracing which will crash instantly with dynamic shapes! 
+ dit = torch.compile(dit, dynamic=True) + cond_enc = torch.compile(cond_enc, dynamic=True) + codec_wrapper.projector = torch.compile(codec_wrapper.projector, dynamic=True) + + criterion = CFMLoss(lambda_proj=1.0) + optimizer = optim.AdamW( + list(dit.parameters()) + list(cond_enc.parameters()) + list(codec_wrapper.projector.parameters()), + lr=args.lr, + eps=args.adam_eps, + fused=(device.type == "cuda"), + ) + scaler = torch.amp.GradScaler(device.type) if device.type == "cuda" else None + + total_steps = (args.epochs * len(dataloader)) // args.grad_accum # update steps, not micro-steps + warmup_steps = max(1, int(total_steps * args.warmup_frac)) + + def lr_lambda(step): + if step < warmup_steps: + return step / warmup_steps + progress = (step - warmup_steps) / max(1, total_steps - warmup_steps) + return 0.5 * (1.0 + math.cos(math.pi * progress)) + + scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) + + start_epoch = 0 + if os.path.exists("chkpt_cfm"): + import re + saved_epochs = [] + for f in os.listdir("chkpt_cfm"): + match = re.search(r"dit_epoch_(\d+)\.pt", f) + if match: + saved_epochs.append(int(match.group(1))) + + if saved_epochs: + start_epoch = max(saved_epochs) + print(f"Found checkpoint for epoch {start_epoch}, resuming training...") + # We must use proper strict loading for the base model, unwrapping compiled prefix via logic if dict has 'orig_mod' + dit_sd = torch.load(f"chkpt_cfm/dit_epoch_{start_epoch}.pt", weights_only=True, map_location=device) + if hasattr(dit, "_orig_mod") and not any("_orig_mod" in k for k in dit_sd.keys()): + dit._orig_mod.load_state_dict(dit_sd, strict=False) + else: + dit.load_state_dict(dit_sd, strict=False) + + cond_enc_sd = torch.load(f"chkpt_cfm/cond_encoder_epoch_{start_epoch}.pt", weights_only=True, map_location=device) + if hasattr(cond_enc, "_orig_mod") and not any("_orig_mod" in k for k in cond_enc_sd.keys()): + cond_enc._orig_mod.load_state_dict(cond_enc_sd, strict=False) + else: + 
cond_enc.load_state_dict(cond_enc_sd, strict=False) + + proj_sd = torch.load(f"chkpt_cfm/projector_epoch_{start_epoch}.pt", weights_only=True, map_location=device) + if hasattr(codec_wrapper.projector, "_orig_mod") and not any("_orig_mod" in k for k in proj_sd.keys()): + codec_wrapper.projector._orig_mod.load_state_dict(proj_sd, strict=False) + else: + codec_wrapper.projector.load_state_dict(proj_sd, strict=False) + + if os.path.exists(f"chkpt_cfm/ema_dit_epoch_{start_epoch}.pt"): + ema_dit.load_state_dict(torch.load(f"chkpt_cfm/ema_dit_epoch_{start_epoch}.pt", weights_only=True, map_location=device)) + if os.path.exists(f"chkpt_cfm/ema_cond_encoder_epoch_{start_epoch}.pt"): + ema_cond_enc.load_state_dict(torch.load(f"chkpt_cfm/ema_cond_encoder_epoch_{start_epoch}.pt", weights_only=True, map_location=device)) + if os.path.exists(f"chkpt_cfm/ema_projector_epoch_{start_epoch}.pt"): + ema_projector.load_state_dict(torch.load(f"chkpt_cfm/ema_projector_epoch_{start_epoch}.pt", weights_only=True, map_location=device)) + + # Detect architecture changes by checking if the DiT state dict had mismatched keys + # If so, the old optimizer momentum buffers have wrong shapes and MUST NOT be loaded + # (PyTorch's load_state_dict won't error, but fused AdamW will crash during .step()) + dit_sd_check = torch.load(f"chkpt_cfm/dit_epoch_{start_epoch}.pt", weights_only=True, map_location="cpu") + clean_keys = {k.replace("_orig_mod.", "") for k in dit_sd_check.keys()} + expected_keys = set(dit.__dict__.get("_orig_mod", dit).state_dict().keys()) if not hasattr(dit, "_orig_mod") else set(dit._orig_mod.state_dict().keys()) + arch_changed = clean_keys != expected_keys + + if arch_changed: + print("WARNING: Architecture change detected (mismatched keys in DiT). Skipping optimizer/scheduler/scaler loading.") + print("Using fresh optimizer state. 
Model weights were loaded successfully.") + else: + if os.path.exists(f"chkpt_cfm/optimizer_epoch_{start_epoch}.pt"): + optimizer.load_state_dict(torch.load(f"chkpt_cfm/optimizer_epoch_{start_epoch}.pt", weights_only=False, map_location=device)) + if os.path.exists(f"chkpt_cfm/scheduler_epoch_{start_epoch}.pt"): + scheduler.load_state_dict(torch.load(f"chkpt_cfm/scheduler_epoch_{start_epoch}.pt", weights_only=False, map_location=device)) + if scaler is not None and os.path.exists(f"chkpt_cfm/scaler_epoch_{start_epoch}.pt"): + scaler.load_state_dict(torch.load(f"chkpt_cfm/scaler_epoch_{start_epoch}.pt", weights_only=False, map_location=device)) + + # Always enforce the current --lr, even when resuming from a checkpoint + # that was trained with a different LR. Without this, the old optimizer's + # base_lr silently overrides the new --lr flag. + for pg in optimizer.param_groups: + pg['lr'] = args.lr + pg['initial_lr'] = args.lr + # Rebuild scheduler with the correct base LR (keeps the step counter from loaded state) + scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch=scheduler.last_epoch) + + print(f"Successfully loaded weights! 
Resuming from epoch {start_epoch} to {args.epochs}...") + print(f" LR enforced to {args.lr} (scheduler step={scheduler.last_epoch})") + + print("Starting training loop...") + grad_accum = args.grad_accum + stable_loss_baseline = args.loss_baseline + for epoch in range(start_epoch, args.epochs): + accum_total = accum_flow = accum_proj = accum_teacher = 0.0 + optimizer.zero_grad(set_to_none=True) + + for step, (z_target, ppg, hubert, f0, spk, mask, teacher_target, teacher_mask) in enumerate(dataloader): + z_target = z_target.to(device, non_blocking=True) + ppg = ppg.to(device, non_blocking=True) + hubert = hubert.to(device, non_blocking=True) + f0 = f0.to(device, non_blocking=True) + spk = spk.to(device, non_blocking=True) + mask = mask.to(device, non_blocking=True) + if teacher_target is not None: + teacher_target = teacher_target.to(device, non_blocking=True) + if teacher_mask is not None: + teacher_mask = teacher_mask.to(device, non_blocking=True) + + is_last_step = (step + 1 == len(dataloader)) + is_update_step = ((step + 1) % grad_accum == 0) or is_last_step + micro_steps_in_window = (step % grad_accum) + 1 if is_update_step else grad_accum + + with ( + torch.autocast(device_type="cuda", dtype=torch.bfloat16) + if device.type == "cuda" + else torch.autocast(device_type="mps") + if device.type == "mps" + else torch.autocast(device_type="cpu", enabled=False) + ): + # Conditioning perturbation (from SoVITS 5.0) — prevents overfitting + # to exact feature values and improves inference robustness + ppg_noisy = ppg + torch.randn_like(ppg) * 0.1 + hubert_noisy = hubert + torch.randn_like(hubert) * 0.2 + c = cond_enc(ppg_noisy, hubert_noisy, f0, spk, target_seq_len=z_target.shape[1]) + loss, flow_loss, proj_loss = criterion(z_target, c, dit, codec_wrapper.projector, mask) + + teacher_loss = torch.tensor(0.0, device=device) + if teacher_target is not None and args.lambda_teacher > 0: + teacher_loss = _distill_loss_from_teacher( + z_target, c, mask, dit, 
codec_wrapper.projector, teacher_target, teacher_mask + ) + total_loss = loss + args.lambda_teacher * teacher_loss + scaled_loss = total_loss / grad_accum + + # Spike detection: hard absolute cap + relative threshold + # The old 10x threshold was too generous — a 3x spike at loss=3.0 slipped through + # and triggered cascading explosion. Also, the EMA tracked spikes upward. + loss_val = total_loss.item() + is_spike = ( + not torch.isfinite(total_loss) + or loss_val > 3.0 # hard cap + or loss_val > 3.0 * max(stable_loss_baseline, 0.5) # 3x relative + ) + if is_spike: + if torch.isfinite(total_loss): + print(f"WARNING: Loss spike at Epoch {epoch}, Step {step}: {loss_val:.2f} (baseline={stable_loss_baseline:.2f}). Skipping.") + else: + print(f"WARNING: Non-finite loss at Epoch {epoch}, Step {step}. Skipping.") + optimizer.zero_grad(set_to_none=True) + accum_total = accum_flow = accum_proj = accum_teacher = 0.0 + continue + + # Only update baseline with GOOD losses (don't let spikes poison the baseline) + stable_loss_baseline = 0.99 * stable_loss_baseline + 0.01 * loss_val + + if scaler: + scaler.scale(scaled_loss).backward() + else: + scaled_loss.backward() + + accum_total += total_loss.item() + accum_flow += flow_loss.item() + accum_proj += proj_loss.item() + accum_teacher += teacher_loss.item() + + if is_update_step: + if scaler: + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(dit.parameters(), args.grad_clip) + torch.nn.utils.clip_grad_norm_(cond_enc.parameters(), args.grad_clip) + torch.nn.utils.clip_grad_norm_(codec_wrapper.projector.parameters(), args.grad_clip) + if scaler: + scaler.step(optimizer) + scaler.update() + else: + optimizer.step() + + scheduler.step() + ema_dit.update(dit) + ema_cond_enc.update(cond_enc) + ema_projector.update(codec_wrapper.projector) + optimizer.zero_grad(set_to_none=True) + + # We must move the logging logic outside of 'if is_update_step' OR change the math + # because step % 50 == 0 will NEVER happen when step % 4 
== 3! + # Instead, we track logic by effective update steps or just log on update step. + if is_update_step: + # Calculate the effective update number + update_step = (step + 1) // grad_accum + if update_step % args.log_interval == 0 or update_step == 1: + lr_now = scheduler.get_last_lr()[0] + avg = lambda x: x / micro_steps_in_window + print( + f"Epoch {epoch}, Micro-Step {step}, Update {update_step}, Total: {avg(accum_total):.4f}, " + f"Flow: {avg(accum_flow):.4f}, Proj: {avg(accum_proj):.4f}, " + f"Teacher: {avg(accum_teacher):.4f}, LR: {lr_now:.2e}" + ) + + accum_total = accum_flow = accum_proj = accum_teacher = 0.0 + + if (epoch + 1) % args.save_interval == 0: + os.makedirs("chkpt_cfm", exist_ok=True) + ep = epoch + 1 + print(f"Saving intermediate checkpoints at epoch {ep}...") + torch.save(dit.state_dict(), f"chkpt_cfm/dit_epoch_{ep}.pt") + torch.save(cond_enc.state_dict(), f"chkpt_cfm/cond_encoder_epoch_{ep}.pt") + torch.save(codec_wrapper.projector.state_dict(), f"chkpt_cfm/projector_epoch_{ep}.pt") + torch.save(ema_dit.state_dict(), f"chkpt_cfm/ema_dit_epoch_{ep}.pt") + torch.save(ema_cond_enc.state_dict(), f"chkpt_cfm/ema_cond_encoder_epoch_{ep}.pt") + torch.save(ema_projector.state_dict(), f"chkpt_cfm/ema_projector_epoch_{ep}.pt") + torch.save(optimizer.state_dict(), f"chkpt_cfm/optimizer_epoch_{ep}.pt") + torch.save(scheduler.state_dict(), f"chkpt_cfm/scheduler_epoch_{ep}.pt") + if scaler is not None: + torch.save(scaler.state_dict(), f"chkpt_cfm/scaler_epoch_{ep}.pt") + + os.makedirs("chkpt_cfm", exist_ok=True) + print("Saving checkpoints...") + torch.save(dit.state_dict(), "chkpt_cfm/dit_final.pt") + torch.save(cond_enc.state_dict(), "chkpt_cfm/cond_encoder_final.pt") + torch.save(codec_wrapper.projector.state_dict(), "chkpt_cfm/projector_final.pt") + torch.save(ema_dit.state_dict(), "chkpt_cfm/ema_dit_final.pt") + torch.save(ema_cond_enc.state_dict(), "chkpt_cfm/ema_cond_encoder_final.pt") + torch.save(ema_projector.state_dict(), 
"chkpt_cfm/ema_projector_final.pt") + torch.save(optimizer.state_dict(), "chkpt_cfm/optimizer_final.pt") + torch.save(scheduler.state_dict(), "chkpt_cfm/scheduler_final.pt") + if scaler is not None: + torch.save(scaler.state_dict(), "chkpt_cfm/scaler_final.pt") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data_dir", type=str, default="./data_svc/codec_targets") + parser.add_argument("--teacher_target_dir", type=str, default=None) + parser.add_argument("--lambda_teacher", type=float, default=0.0) + parser.add_argument("--epochs", type=int, default=1000) + parser.add_argument("--batch_size", type=int, default=2) + parser.add_argument("--lr", type=float, default=1e-4) + parser.add_argument("--adam_eps", type=float, default=1e-6, help="AdamW epsilon (1e-6 prevents late-training adaptive scaling blowup)") + parser.add_argument("--loss_baseline", type=float, default=1.0, help="Initial loss baseline for spike detection") + parser.add_argument("--max_target_len", type=int, default=500) + parser.add_argument("--num_workers", type=int, default=0) + parser.add_argument("--log_interval", type=int, default=10) + parser.add_argument("--save_interval", type=int, default=1, help="Save checkpoints every X epochs") + parser.add_argument("--grad_accum", type=int, default=1, help="Gradient accumulation steps (effective batch = batch_size * grad_accum)") + parser.add_argument("--warmup_frac", type=float, default=0.05, help="Fraction of total steps used for LR warmup") + parser.add_argument("--ema_decay", type=float, default=0.9999, help="EMA decay rate for model weights") + parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping max norm") + parser.add_argument("--use_checkpointing", action="store_true", help="Enable gradient checkpointing (saves VRAM, slower)") + parser.add_argument("--freeze_norm", action="store_true", help="Freeze latent normalization statistics across new datasets (Required for proper 
Fine-Tuning)") + parser.add_argument( + "--allow_missing_features", + action="store_true", + help="Allow missing conditioning files (not recommended for production).", + ) + args = parser.parse_args() + train(args) diff --git a/train_f5_stage1.py b/train_f5_stage1.py new file mode 100644 index 0000000000000000000000000000000000000000..357435c27cbee2853a0fba679408a22044fe3859 --- /dev/null +++ b/train_f5_stage1.py @@ -0,0 +1,301 @@ +""" +Stage 1: Adapt F5-TTS to Singing Voice Conversion. + +What trains: SVCCondAdapter + LoRA in DiT attention projections +What freezes: All other F5-TTS weights + +Loss: rectified flow matching on log-mel spectrograms + (same paradigm as v1, but in mel space instead of codec latent space) + +Training task: self-reconstruction + Given a clip, use the first ~3 sec as the speaker reference (cond), + and train the model to reconstruct the full clip from SVC features + reference. + At inference: swap the reference for target speaker clips → voice conversion. + +Usage: + python train_f5_stage1.py \ + --f5tts_ckpt /path/to/F5TTS_Base_model_1200000.safetensors \ + --audio_dir ./data_svc/audio \ + --epochs 200 \ + --batch_size 16 \ + --lr 1e-4 +""" + +import argparse +import copy +import math +import os +import re + +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader + +from svc_data.mel_svc_dataset import MelSVCDataset, collate_fn +from models.f5_svc import build_f5svc +from models.lora_utils import get_lora_state_dict, get_trainable_state_dict + + +# --------------------------------------------------------------------------- +# EMA +# --------------------------------------------------------------------------- + +class EMA: + def __init__(self, model: nn.Module, decay: float = 0.9999): + self.decay = decay + self.shadow = copy.deepcopy(model) + self.shadow.eval() + for p in self.shadow.parameters(): + p.requires_grad_(False) + + @torch.no_grad() + def update(self, model: nn.Module) -> 
None: + src = model._orig_mod if hasattr(model, "_orig_mod") else model + for s, m in zip(self.shadow.parameters(), src.parameters()): + s.data.mul_(self.decay).add_(m.data, alpha=1 - self.decay) + + def state_dict(self): + return self.shadow.state_dict() + + def load_state_dict(self, sd): + self.shadow.load_state_dict(sd, strict=False) + + +# --------------------------------------------------------------------------- +# Flow matching loss (rectified flow on mel) +# --------------------------------------------------------------------------- + +def _flow_loss( + model: nn.Module, + target_mel: torch.Tensor, # (B, T, N_MELS) + ref_mel: torch.Tensor, # (B, T, N_MELS) reference mel, zeros at target region + ppg: torch.Tensor, + hubert: torch.Tensor, + f0: torch.Tensor, + spk: torch.Tensor, + mask: torch.Tensor, # (B, T) bool + ref_lens: torch.Tensor, # (B,) int — number of reference frames +) -> torch.Tensor: + B, T, D = target_mel.shape + + # Logit-normal timestep sampling + t = torch.sigmoid(torch.randn(B, device=target_mel.device)) # (B,) + t_exp = t[:, None, None] + + noise = torch.randn_like(target_mel) + x_t = (1 - t_exp) * noise + t_exp * target_mel + v_target = target_mel - noise + + # Keep reference region clean (not noised) in the DiT input + for i, rl in enumerate(ref_lens): + x_t[i, :rl] = target_mel[i, :rl] + + v_pred = model( + x=x_t, cond=ref_mel, + ppg=ppg, hubert=hubert, f0=f0, spk=spk, + time=t, mask=mask, + ) + + # Loss only on the target region (exclude reference frames) + target_mask = mask.clone().float() + for i, rl in enumerate(ref_lens): + target_mask[i, :rl] = 0.0 + target_mask = target_mask.unsqueeze(-1) + + valid = target_mask.sum().clamp(min=1.0) + return (((v_pred - v_target) ** 2) * target_mask).sum() / (valid * D) + + +# --------------------------------------------------------------------------- +# Training +# --------------------------------------------------------------------------- + +def train(args): + device = torch.device("cuda" if 
def train(args):
    """Stage 1 training loop: adapt F5-TTS to SVC via adapter + LoRA.

    Builds the dataset/loader, constructs the F5SVC model (frozen base +
    trainable adapter/LoRA), resumes from the latest checkpoint in
    ``args.outdir`` if present, then runs rectified-flow training with
    gradient accumulation, loss-spike skipping, cosine LR with warmup, and
    an EMA shadow of the trainable weights.

    Fix vs. previous revision: a non-finite first-batch loss used to
    initialize ``stable_baseline`` with NaN; since every NaN comparison is
    False and the EMA update propagates NaN, relative spike detection was
    then permanently disabled. Non-finite losses are now rejected BEFORE the
    baseline is initialized or updated.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    import time as _time

    print("Scanning dataset...")
    _t0 = _time.time()
    dataset = MelSVCDataset(
        audio_dir=args.audio_dir,
        ppg_dir=args.ppg_dir,
        hubert_dir=args.hubert_dir,
        f0_dir=args.f0_dir,
        spk_dir=args.spk_dir,
        packed_dir=args.packed_dir,
        max_frames=args.max_frames,
        ref_frames=args.ref_frames,
    )
    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=args.num_workers,
        pin_memory=(device.type == "cuda"),
        persistent_workers=(args.num_workers > 0),
        prefetch_factor=2 if args.num_workers > 0 else None,
    )
    print(f"Dataset ready: {len(dataset)} samples, {len(loader)} batches/epoch ({_time.time() - _t0:.1f}s)")

    print("Building F5SVCModel...")
    _t0 = _time.time()
    model = build_f5svc(f5tts_ckpt_path=args.f5tts_ckpt, lora_rank=args.lora_rank).to(device)
    model.set_stage1_trainable()
    print(f"Model built ({_time.time() - _t0:.1f}s)")

    # EMA wraps the *uncompiled* model; EMA.update unwraps _orig_mod later.
    ema = EMA(model)

    if device.type == "cuda":
        print("Compiling model with torch.compile (first epoch will be slower)...")
        model = torch.compile(model, dynamic=True)

    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.AdamW(trainable, lr=args.lr, eps=1e-6, weight_decay=1e-2)

    # Cosine schedule with linear warmup over the first 5% of update steps.
    total_steps = (args.epochs * len(loader)) // args.grad_accum
    warmup_steps = max(1, int(total_steps * 0.05))

    def lr_lambda(step: int) -> float:
        if step < warmup_steps:
            return step / warmup_steps
        p = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return 0.5 * (1.0 + math.cos(math.pi * p))

    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    scaler = torch.amp.GradScaler(device.type) if device.type == "cuda" else None

    # Resume from the highest-numbered stage1 checkpoint, if any.
    start_epoch = 0
    os.makedirs(args.outdir, exist_ok=True)
    saved = [int(m.group(1)) for f in os.listdir(args.outdir)
             if (m := re.search(r"stage1_epoch_(\d+)\.pt", f))]
    if saved:
        start_epoch = max(saved)
        ckpt = torch.load(f"{args.outdir}/stage1_epoch_{start_epoch}.pt",
                          map_location=device, weights_only=False)
        raw = model._orig_mod if hasattr(model, "_orig_mod") else model
        # Load trainable weights (adapter + LoRA); fall back to full state for old ckpts
        weights = ckpt.get("trainable", ckpt.get("model"))
        raw.load_state_dict(weights, strict=False)
        optimizer.load_state_dict(ckpt["optimizer"])
        scheduler.load_state_dict(ckpt["scheduler"])
        if "ema_trainable" in ckpt:
            ema.shadow.load_state_dict(ckpt["ema_trainable"], strict=False)
        elif "ema" in ckpt:
            ema.load_state_dict(ckpt["ema"])
        print(f"Resumed from epoch {start_epoch}")

    stable_baseline = None  # auto-initialize from first *finite* batch
    grad_accum = args.grad_accum
    print(f"\nStarting training: epochs {start_epoch}→{args.epochs}, "
          f"{len(loader)} batches/epoch, grad_accum={grad_accum}")

    for epoch in range(start_epoch, args.epochs):
        epoch_t0 = _time.time()
        optimizer.zero_grad(set_to_none=True)

        for step, batch in enumerate(loader):
            target_mel, ref_mel, ppg, hubert, f0, spk, mask, ref_lens = [
                b.to(device, non_blocking=True) if torch.is_tensor(b) else b
                for b in batch
            ]

            is_update = ((step + 1) % grad_accum == 0) or (step + 1 == len(loader))

            with torch.autocast(device_type=device.type, dtype=torch.bfloat16,
                                enabled=(device.type == "cuda")):
                loss = _flow_loss(model, target_mel, ref_mel, ppg, hubert, f0,
                                  spk, mask, ref_lens)
                scaled = loss / grad_accum

            loss_val = loss.item()

            # Reject non-finite losses BEFORE touching the baseline: a NaN
            # baseline would make every later comparison False and the EMA
            # update would keep it NaN, silently disabling spike detection.
            if not torch.isfinite(loss):
                print(f"Epoch {epoch} step {step}: non-finite loss, skipping")
                optimizer.zero_grad(set_to_none=True)
                continue

            # Auto-init baseline from first valid batch
            if stable_baseline is None:
                stable_baseline = loss_val
                print(f"Initial loss baseline: {stable_baseline:.3f}")

            is_spike = loss_val > 5.0 * max(stable_baseline, 0.1)
            if is_spike:
                print(f"Epoch {epoch} step {step}: spike {loss_val:.3f} "
                      f"(baseline={stable_baseline:.3f}), skipping")
                optimizer.zero_grad(set_to_none=True)
                continue

            # Only fold GOOD losses into the baseline.
            stable_baseline = 0.99 * stable_baseline + 0.01 * loss_val

            if scaler:
                scaler.scale(scaled).backward()
            else:
                scaled.backward()

            if is_update:
                if scaler:
                    scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(trainable, args.grad_clip)
                if scaler:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                scheduler.step()
                ema.update(model)
                optimizer.zero_grad(set_to_none=True)

                update_idx = (step + 1) // grad_accum
                if update_idx % args.log_interval == 0 or update_idx == 1:
                    print(f"Epoch {epoch:3d} update {update_idx:5d} "
                          f"loss {loss_val:.4f} baseline {stable_baseline:.4f} "
                          f"lr {scheduler.get_last_lr()[0]:.2e}")

        elapsed = _time.time() - epoch_t0
        print(f"Epoch {epoch:3d} done ({elapsed:.0f}s, {elapsed / len(loader):.2f}s/batch)")

        # Save checkpoint at intervals and at the final epoch
        is_save = ((epoch + 1) % args.save_interval == 0) or (epoch + 1 == args.epochs)
        if is_save:
            raw = model._orig_mod if hasattr(model, "_orig_mod") else model
            ema_raw = ema.shadow
            torch.save({
                "trainable": get_trainable_state_dict(raw),
                "lora_only": get_lora_state_dict(raw),
                "ema_trainable": get_trainable_state_dict(ema_raw),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
                "epoch": epoch + 1,
            }, f"{args.outdir}/stage1_epoch_{epoch + 1}.pt")
            ckpt_mb = os.path.getsize(f"{args.outdir}/stage1_epoch_{epoch + 1}.pt") / 1e6
            print(f"Saved epoch {epoch + 1} ({ckpt_mb:.0f} MB)")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--f5tts_ckpt", type=str, default=None,
                        help="Path to F5-TTS Base checkpoint (.safetensors or .pt)")
    parser.add_argument("--audio_dir", type=str, default="./data_svc/audio")
    parser.add_argument("--ppg_dir", type=str, default="./data_svc/whisper")
    parser.add_argument("--hubert_dir", type=str, default="./data_svc/hubert")
    parser.add_argument("--f0_dir", type=str, default="./data_svc/pitch")
    parser.add_argument("--spk_dir", type=str, default="./data_svc/speaker")
    parser.add_argument("--packed_dir", type=str, default=None,
                        help="Packed .pt directory from preprocess_pack.py (NFS-efficient, overrides other dirs)")
    parser.add_argument("--outdir", type=str, default="./chkpt_f5svc")
    parser.add_argument("--epochs", type=int, default=50)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--grad_accum", type=int, default=1)
    parser.add_argument("--grad_clip", type=float, default=1.0)
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument("--save_interval", type=int, default=10,
                        help="Save checkpoint every N epochs")
    parser.add_argument("--log_interval", type=int, default=50)
    parser.add_argument("--max_frames", type=int, default=800)
    parser.add_argument("--ref_frames", type=int, default=280)
    parser.add_argument("--lora_rank", type=int, default=16)
    args = parser.parse_args()
    train(args)
Usage:
    python train_f5_stage2.py \
        --stage1_ckpt ./chkpt_f5svc/stage1_epoch_200.pt \
        --audio_dir ./data_svc/target_speaker/audio \
        --speaker_id ada_wong \
        --epochs 50
"""

import argparse
import math
import os

import torch
import torch.optim as optim
from torch.utils.data import DataLoader

from svc_data.mel_svc_dataset import MelSVCDataset, collate_fn
from models.f5_svc import build_f5svc
from train_f5_stage1 import _flow_loss


def train(args):
    """Stage 2 per-speaker fine-tuning loop.

    Loads a Stage 1 checkpoint into the F5SVC model, then trains only the
    speaker branch (spk_proj + a second, small-rank LoRA pair) on the target
    speaker's clips using the same rectified-flow loss as Stage 1. Saves a
    single speaker-specific checkpoint at the end.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    dataset = MelSVCDataset(
        audio_dir=args.audio_dir,
        ppg_dir=args.ppg_dir,
        hubert_dir=args.hubert_dir,
        f0_dir=args.f0_dir,
        spk_dir=args.spk_dir,
        packed_dir=args.packed_dir,
        max_frames=args.max_frames,
        ref_frames=args.ref_frames,
    )
    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=args.num_workers,
        pin_memory=(device.type == "cuda"),
    )

    # lora_rank must match the rank the Stage 1 checkpoint was trained with,
    # otherwise the LoRA weights silently fail to load under strict=False.
    model = build_f5svc(f5tts_ckpt_path=args.f5tts_ckpt, lora_rank=args.lora_rank).to(device)

    # Load Stage 1 weights (adapter + LoRA); fall back to full state for old ckpts
    ckpt = torch.load(args.stage1_ckpt, map_location=device, weights_only=False)
    weights = ckpt.get("trainable", ckpt.get("model", ckpt))
    model.load_state_dict(weights, strict=False)
    print(f"Loaded Stage 1 checkpoint: {args.stage1_ckpt}")

    # Stage 2: spk_proj + new LoRA pair (A₂B₂) for speaker adaptation
    model.set_stage2_trainable(stage2_rank=args.stage2_rank)
    model.to(device)

    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.AdamW(trainable, lr=args.lr, eps=1e-6)

    # NOTE(review): unlike Stage 1, total_steps here counts micro-steps, not
    # optimizer updates — with grad_accum > 1 the cosine schedule finishes
    # early. Harmless for the default grad_accum=1; confirm before changing.
    total_steps = args.epochs * len(loader)
    warmup_steps = max(1, total_steps // 20)

    def lr_lambda(step: int) -> float:
        # Linear warmup, then cosine decay to zero.
        if step < warmup_steps:
            return step / warmup_steps
        p = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return 0.5 * (1.0 + math.cos(math.pi * p))

    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    scaler = torch.amp.GradScaler(device.type) if device.type == "cuda" else None

    os.makedirs(args.outdir, exist_ok=True)

    for epoch in range(args.epochs):
        for step, batch in enumerate(loader):
            target_mel, ref_mel, ppg, hubert, f0, spk, mask, ref_lens = [
                b.to(device, non_blocking=True) if torch.is_tensor(b) else b
                for b in batch
            ]

            with torch.autocast(device_type=device.type, dtype=torch.bfloat16,
                                enabled=(device.type == "cuda")):
                loss = _flow_loss(model, target_mel, ref_mel, ppg, hubert, f0,
                                  spk, mask, ref_lens)
                scaled = loss / args.grad_accum

            # Flush on accumulation boundaries and on the last batch of the epoch.
            is_update = ((step + 1) % args.grad_accum == 0) or (step + 1 == len(loader))

            if scaler:
                scaler.scale(scaled).backward()
                if is_update:
                    # unscale_ must precede clipping so the norm is in true units.
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(trainable, 1.0)
                    scaler.step(optimizer)
                    scaler.update()
            else:
                scaled.backward()
                if is_update:
                    torch.nn.utils.clip_grad_norm_(trainable, 1.0)
                    optimizer.step()

            if is_update:
                scheduler.step()
                optimizer.zero_grad(set_to_none=True)

            if step % args.log_interval == 0:
                print(f"Epoch {epoch:3d} step {step:4d} "
                      f"loss {loss.item():.4f} lr {scheduler.get_last_lr()[0]:.2e}")

    # Save speaker-specific checkpoint (Stage 2 weights only: spk_proj + A₂B₂)
    from models.lora_utils import get_stage2_state_dict
    out_path = os.path.join(args.outdir, f"stage2_{args.speaker_id}.pt")
    torch.save({
        "stage2": get_stage2_state_dict(model),
        "speaker_id": args.speaker_id,
        "stage2_rank": args.stage2_rank,
    }, out_path)
    print(f"Saved speaker checkpoint: {out_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--stage1_ckpt", type=str, required=True)
    parser.add_argument("--f5tts_ckpt", type=str,
                        default="./chkpt_f5svc/model_1200000.safetensors",
                        help="Path to pretrained F5-TTS base checkpoint")
    parser.add_argument("--audio_dir", type=str, required=True)
parser.add_argument("--speaker_id", type=str, required=True) + parser.add_argument("--ppg_dir", type=str, default="./data_svc/whisper") + parser.add_argument("--hubert_dir", type=str, default="./data_svc/hubert") + parser.add_argument("--f0_dir", type=str, default="./data_svc/pitch") + parser.add_argument("--spk_dir", type=str, default="./data_svc/speaker") + parser.add_argument("--packed_dir", type=str, default=None, + help="Packed .pt directory from preprocess_pack.py (NFS-efficient)") + parser.add_argument("--outdir", type=str, default="./chkpt_f5svc") + parser.add_argument("--epochs", type=int, default=50) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--grad_accum", type=int, default=1) + parser.add_argument("--lr", type=float, default=5e-5) + parser.add_argument("--num_workers", type=int, default=4) + parser.add_argument("--log_interval", type=int, default=20) + parser.add_argument("--max_frames", type=int, default=800) + parser.add_argument("--ref_frames", type=int, default=280) + parser.add_argument("--lora_rank", type=int, default=16, + help="Stage 1 LoRA rank (must match Stage 1 checkpoint)") + parser.add_argument("--stage2_rank", type=int, default=4, + help="Stage 2 LoRA rank for speaker adaptation") + args = parser.parse_args() + train(args) diff --git a/ui_cfm.py b/ui_cfm.py new file mode 100644 index 0000000000000000000000000000000000000000..e3831cb29bebbd746d960f7aca6bfbd493f73e1f --- /dev/null +++ b/ui_cfm.py @@ -0,0 +1,606 @@ +import gradio as gr +import glob +import math +import os +import re +import subprocess +import sys +import tempfile +import time + +import numpy as np +import soundfile as sf +import torch + + +# ────────────────────────────────────────────────────────────────────────────── +# Helpers +# ────────────────────────────────────────────────────────────────────────────── + +PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__)) + + +def _get_device(): + if torch.cuda.is_available(): + return 
torch.device("cuda") + elif torch.backends.mps.is_available(): + return torch.device("mps") + return torch.device("cpu") + + +def _available_checkpoints(): + """Return sorted list of available checkpoint epochs (integers).""" + epochs = set() + for f in glob.glob(os.path.join(PROJECT_ROOT, "chkpt_cfm", "dit_epoch_*.pt")): + m = re.search(r"dit_epoch_(\d+)\.pt$", f) + if m: + epochs.add(int(m.group(1))) + if os.path.isfile(os.path.join(PROJECT_ROOT, "chkpt_cfm", "dit_final.pt")): + epochs.add(-1) # sentinel for "final" + return sorted(epochs) + + +def _checkpoint_choices(): + """Return human-readable dropdown choices for checkpoints.""" + epochs = _available_checkpoints() + choices = [] + for e in epochs: + if e == -1: + choices.append("final") + else: + choices.append(f"epoch {e}") + if not choices: + choices = ["(no checkpoints found)"] + return choices + + +def _parse_epoch_choice(choice: str): + """Convert a dropdown choice back to the epoch int (or None for final).""" + if choice.startswith("epoch"): + return int(choice.split()[-1]) + return None # final + + +# ────────────────────────────────────────────────────────────────────────────── +# 1. Data Preprocessing (streaming) +# ────────────────────────────────────────────────────────────────────────────── + +def process_data(dataset_path): + if not os.path.isdir(dataset_path): + yield f"Error: Dataset path '{dataset_path}' does not exist." + return + + abs_dataset = os.path.abspath(dataset_path) + abs_default = os.path.abspath(os.path.join(PROJECT_ROOT, "dataset_raw")) + if abs_dataset != abs_default: + yield ( + "Error: this UI currently runs `svc_preprocessing.py`, which reads from `./dataset_raw`.\n" + f"Current path: {dataset_path}\n" + "Move your dataset to ./dataset_raw or change the textbox to ./dataset_raw." 
def process_data(dataset_path):
    """Stream preprocessing progress for the dataset at *dataset_path*.

    Yields cumulative log text. Validates the path, then shells out to
    `svc_preprocessing.py` (which always reads from ./dataset_raw) and
    relays its stdout line by line.
    """
    if not os.path.isdir(dataset_path):
        yield f"Error: Dataset path '{dataset_path}' does not exist."
        return

    abs_dataset = os.path.abspath(dataset_path)
    abs_default = os.path.abspath(os.path.join(PROJECT_ROOT, "dataset_raw"))
    if abs_dataset != abs_default:
        yield (
            "Error: this UI currently runs `svc_preprocessing.py`, which reads from `./dataset_raw`.\n"
            f"Current path: {dataset_path}\n"
            "Move your dataset to ./dataset_raw or change the textbox to ./dataset_raw."
        )
        return

    wav_count = sum(
        1
        for _, _, filenames in os.walk(dataset_path)
        for fname in filenames
        if fname.lower().endswith(".wav")
    )
    if wav_count == 0:
        yield f"Error: No .wav files found under '{dataset_path}'."
        return

    log_lines = []

    def push(line):
        # Bounded log buffer: trim the oldest 100 entries past 400 lines.
        log_lines.append(line)
        if len(log_lines) > 400:
            del log_lines[:100]
        return "\n".join(log_lines)

    yield push(f"Started data preprocessing for {dataset_path}")
    yield push(f"Detected {wav_count} wav files.")
    if not torch.cuda.is_available():
        yield push("CUDA is not available. Preprocessing will run on CPU/MPS and can be much slower.")

    cmd = [sys.executable, "-u", "svc_preprocessing.py", "-t", "2"]
    yield push(f"Running: {' '.join(cmd)}")

    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        cwd=PROJECT_ROOT,
    )

    assert proc.stdout is not None
    for raw in proc.stdout:
        stripped = raw.rstrip()
        if not stripped:
            continue
        print(stripped, flush=True)
        yield push(stripped)

    exit_code = proc.wait()
    if exit_code == 0:
        yield push("Preprocessing completed successfully.")
    else:
        yield push(f"Preprocessing failed with exit code {exit_code}.")
def start_training(epochs, batch_size, learning_rate, grad_accum, save_interval):
    """Launch train_cfm.py as a subprocess and stream its stdout back."""
    buffer = []

    def push(line):
        # Bounded log buffer: drop the oldest 200 entries past 600 lines.
        buffer.append(line)
        if len(buffer) > 600:
            del buffer[:200]
        return "\n".join(buffer)

    cmd = [
        sys.executable, "-u", "train_cfm.py",
        "--epochs", str(int(epochs)),
        "--batch_size", str(int(batch_size)),
        "--lr", str(learning_rate),
        "--grad_accum", str(int(grad_accum)),
        "--save_interval", str(int(save_interval)),
    ]

    yield push(f"Launching training: {' '.join(cmd)}")
    yield push(f"Device: {_get_device()}")

    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        cwd=PROJECT_ROOT,
    )

    assert proc.stdout is not None
    for raw in proc.stdout:
        stripped = raw.rstrip()
        if not stripped:
            continue
        print(stripped, flush=True)
        yield push(stripped)

    exit_code = proc.wait()
    if exit_code == 0:
        yield push("\n✅ Training completed successfully.")
    else:
        yield push(f"\n❌ Training failed with exit code {exit_code}.")
+ """ + infer_root = os.path.join(PROJECT_ROOT, "data_svc_infer") + waves_16k_dir = os.path.join(infer_root, "waves-16k", "speaker0") + os.makedirs(waves_16k_dir, exist_ok=True) + + # Resample to 16kHz first + dst_16k = os.path.join(waves_16k_dir, f"{file_id}.wav") + + wav, sr = sf.read(wav_path) + if wav.ndim > 1: + wav = wav.mean(axis=1) + if sr != 16000: + from scipy.signal import resample + target_len = int(round(len(wav) * 16000 / sr)) + wav = resample(wav, target_len) + # Normalise to prevent clipping + peak = np.abs(wav).max() + if peak > 0: + wav = wav / peak * 0.6 + sf.write(dst_16k, wav.astype(np.float32), 16000) + + log = [] + + # Run each feature extractor + steps = [ + ("Extracting pitch (CREPE)", [ + sys.executable, "prepare/preprocess_crepe.py", + "-w", os.path.join(infer_root, "waves-16k") + "/", + "-p", os.path.join(infer_root, "pitch"), + "-t", "0", + ]), + ("Extracting PPG (Whisper)", [ + sys.executable, "prepare/preprocess_ppg.py", + "-w", os.path.join(infer_root, "waves-16k") + "/", + "-p", os.path.join(infer_root, "whisper"), + ]), + ("Extracting HuBERT vectors", [ + sys.executable, "prepare/preprocess_hubert.py", + "-w", os.path.join(infer_root, "waves-16k") + "/", + "-v", os.path.join(infer_root, "hubert"), + "-t", "1", + ]), + ("Extracting speaker embeddings", [ + sys.executable, "prepare/preprocess_speaker.py", + os.path.join(infer_root, "waves-16k") + "/", + os.path.join(infer_root, "speaker"), + "-t", "0", + ]), + ] + + for step_name, cmd in steps: + log.append(f" → {step_name}...") + result = subprocess.run(cmd, capture_output=True, text=True, cwd=PROJECT_ROOT) + if result.returncode != 0: + log.append(f" ⚠ Warning: {step_name} exited with code {result.returncode}") + if result.stderr: + log.append(f" stderr: {result.stderr[:300]}") + + return "\n".join(log) + + +def run_inference(source_audio, target_speaker_embed, checkpoint_choice, steps, solver): + """ + Full inference pipeline: + 1. 
       Preprocess source audio to extract conditioning features
    2. Load trained models from checkpoint
    3. Run ODE sampling
    4. Decode to waveform

    Returns (output_wav_path | None, log_text).
    """
    if source_audio is None:
        return None, "❌ Error: Please upload a source audio file."

    if checkpoint_choice == "(no checkpoints found)":
        return None, "❌ Error: No trained checkpoints found in chkpt_cfm/. Train a model first."

    log_lines = []

    def push(line):
        # Append to the running log and return the full text for the UI.
        log_lines.append(line)
        return "\n".join(log_lines)

    device = _get_device()
    epoch = _parse_epoch_choice(checkpoint_choice)
    push(f"--- Inference Pipeline Started ---")
    push(f"Device: {device}")
    push(f"Checkpoint: {checkpoint_choice}")
    push(f"Solver: {solver.upper()}, Steps: {steps}")

    # ── Step 1: Extract conditioning features ──
    push("\n1. Extracting conditioning features from source audio...")

    # Determine file_id from uploaded path
    file_id = os.path.splitext(os.path.basename(source_audio))[0]
    # Sanitise file_id for filesystem safety
    file_id = re.sub(r"[^\w\-.]", "_", file_id)

    try:
        preproc_log = _preprocess_source_audio(source_audio, file_id)
        push(preproc_log)
        push("   Feature extraction complete.")
    except Exception as e:
        push(f"   ❌ Feature extraction failed: {e}")
        return None, "\n".join(log_lines)

    # ── Step 2: Load models ──
    push("\n2. Loading trained models...")
    try:
        from models.cond_encoder import CondEncoder
        from models.codec_wrapper import CodecWrapper
        from models.cfm import DiT
        from samplers.ode import ODESampler

        # Dimensions must match the training configuration of train_cfm.py.
        codec_wrapper = CodecWrapper(latent_dim=1024).to(device)
        cond_enc = CondEncoder(
            ppg_dim=1280, hubert_dim=256, f0_dim=1, spk_dim=256, cond_out_dim=1024
        ).to(device)
        dit = DiT(in_channels=1024, cond_dim=1024, hidden_dim=512, depth=8).to(device)

        # Resolve paths
        if epoch is not None:
            dit_path = os.path.join(PROJECT_ROOT, f"chkpt_cfm/dit_epoch_{epoch}.pt")
            cond_path = os.path.join(PROJECT_ROOT, f"chkpt_cfm/cond_encoder_epoch_{epoch}.pt")
            proj_path = os.path.join(PROJECT_ROOT, f"chkpt_cfm/projector_epoch_{epoch}.pt")
        else:
            dit_path = os.path.join(PROJECT_ROOT, "chkpt_cfm/dit_final.pt")
            cond_path = os.path.join(PROJECT_ROOT, "chkpt_cfm/cond_encoder_final.pt")
            proj_path = os.path.join(PROJECT_ROOT, "chkpt_cfm/projector_final.pt")

        def clean_sd(sd):
            # Strip torch.compile's "_orig_mod." prefix from checkpoint keys.
            return {k.replace("_orig_mod.", ""): v for k, v in sd.items()}

        dit.load_state_dict(clean_sd(torch.load(dit_path, map_location=device, weights_only=True)))
        cond_enc.load_state_dict(clean_sd(torch.load(cond_path, map_location=device, weights_only=True)))
        codec_wrapper.projector.load_state_dict(clean_sd(torch.load(proj_path, map_location=device, weights_only=True)))

        # Load normalisation stats
        # Fall back to identity normalisation (mean 0, std 1) if no stats file.
        norm_path = os.path.join(PROJECT_ROOT, "chkpt_cfm/latent_norm.pt")
        if os.path.isfile(norm_path):
            norm_data = torch.load(norm_path, map_location=device, weights_only=True)
            z_mean = norm_data["mean"].to(device)
            z_std = norm_data["std"].to(device)
        else:
            z_mean = torch.zeros(1024, device=device)
            z_std = torch.ones(1024, device=device)

        codec_wrapper.eval()
        cond_enc.eval()
        dit.eval()

        push("   Models loaded successfully.")
    except Exception as e:
        push(f"   ❌ Model loading failed: {e}")
        import traceback
        push(traceback.format_exc())
        return None, "\n".join(log_lines)

    # ── Step 3: Load conditioning features & run chunked inference ──
    push(f"\n3. Running chunked ODE sampling ({solver.upper()}, {steps} steps)...")

    try:
        wav_data, sr = sf.read(source_audio)
        if wav_data.ndim > 1:
            wav_data = wav_data.mean(axis=1)
        # Latent frame count at the codec's rate: 44.1 kHz with a 512-sample hop.
        total_T_latent = math.ceil(len(wav_data) / sr * 44100 / 512)
        push(f"   Source audio: {len(wav_data) / sr:.1f}s, T_latent={total_T_latent}")

        infer_root = os.path.join(PROJECT_ROOT, "data_svc_infer")

        # Load full features
        ppg_path = os.path.join(infer_root, "whisper", "speaker0", f"{file_id}.ppg.npy")
        hubert_path = os.path.join(infer_root, "hubert", "speaker0", f"{file_id}.vec.npy")
        f0_path = os.path.join(infer_root, "pitch", "speaker0", f"{file_id}.pit.npy")
        spk_path = os.path.join(infer_root, "speaker", "speaker0", f"{file_id}.spk.npy")

        # Verify features exist
        missing = []
        for p in [ppg_path, hubert_path, f0_path, spk_path]:
            if not os.path.isfile(p):
                missing.append(os.path.basename(p))
        if missing:
            push(f"   ❌ Missing features: {', '.join(missing)}")
            push("   Feature extraction may have failed. Check that pretrained models are available.")
            return None, "\n".join(log_lines)

        ppg_full = np.load(ppg_path)
        hubert_full = np.load(hubert_path)
        f0_full = np.load(f0_path)

        # Use the uploaded target-speaker embedding if supplied, otherwise
        # fall back to the source speaker's own embedding (reconstruction).
        if target_speaker_embed is not None:
            spk_full = np.load(target_speaker_embed)
        else:
            spk_full = np.load(spk_path)

        # Chunked inference with overlap-add crossfade
        max_frames = 400
        overlap_frames = 50
        step_frames = max_frames - overlap_frames

        final_audio = None

        with torch.no_grad():
            for chunk_start in range(0, total_T_latent, step_frames):
                T_latent = min(max_frames, total_T_latent - chunk_start)
                if T_latent <= 0:
                    break

                # Convert latent frame indices to seconds, then to feature
                # frames. Assumed feature rates: PPG/HuBERT 50 Hz, F0 100 Hz
                # — TODO(review) confirm against the preprocessing scripts.
                time_start = chunk_start * 512 / 44100.0
                time_end = (chunk_start + T_latent) * 512 / 44100.0

                ppg_start, ppg_end = int(time_start * 50), int(time_end * 50)
                hubert_start, hubert_end = int(time_start * 50), int(time_end * 50)
                f0_start, f0_end = int(time_start * 100), int(time_end * 100)

                ppg = torch.tensor(ppg_full[max(0, ppg_start):max(1, ppg_end)]).float().unsqueeze(0).to(device)
                hubert = torch.tensor(hubert_full[max(0, hubert_start):max(1, hubert_end)]).float().unsqueeze(0).to(device)

                # Log-scale voiced F0; unvoiced frames (f0 <= 0) stay zero.
                f0_raw = torch.tensor(f0_full[max(0, f0_start):max(1, f0_end)]).float()
                f0 = torch.where(
                    f0_raw > 0,
                    torch.log(f0_raw.clamp(min=1.0)),
                    torch.zeros_like(f0_raw),
                ).unsqueeze(-1).unsqueeze(0).to(device)

                spk = torch.tensor(spk_full).float().unsqueeze(0).to(device)

                # Safety: handle empty slices
                if ppg.shape[1] == 0:
                    ppg = torch.randn(1, max(1, T_latent // 2), 1280, device=device)
                if hubert.shape[1] == 0:
                    hubert = torch.randn(1, max(1, T_latent // 2), 256, device=device)
                if f0.shape[1] == 0:
                    f0 = torch.zeros(1, T_latent, 1, device=device)

                c = cond_enc(ppg, hubert, f0, spk, target_seq_len=T_latent)

                sampler = ODESampler(dit, steps=int(steps), solver=solver.lower())
                z_noise = torch.randn(1, T_latent, 1024, device=device)
                u_hat = sampler.sample(z_noise, c)

                # Project
                u_hat_t = u_hat.transpose(1, 2)
                z_hat_norm = codec_wrapper.forward_project(u_hat_t)

                # Denormalise
                z_hat_norm_t = z_hat_norm.transpose(1, 2)
                z_hat_denorm = (z_hat_norm_t * z_std) + z_mean
                z_hat = z_hat_denorm.transpose(1, 2)

                # Decode
                wav_chunk = codec_wrapper.decode(z_hat).cpu().squeeze().numpy()

                # Overlap-add crossfade
                # Linear fade between the tail of the accumulated audio and
                # the head of the new chunk to hide chunk-boundary seams.
                if final_audio is None:
                    final_audio = wav_chunk
                else:
                    overlap_samples = overlap_frames * 512
                    if len(wav_chunk) >= overlap_samples and len(final_audio) >= overlap_samples:
                        fade_in = np.linspace(0, 1, overlap_samples)
                        fade_out = 1 - fade_in
                        final_audio[-overlap_samples:] = (
                            final_audio[-overlap_samples:] * fade_out
                            + wav_chunk[:overlap_samples] * fade_in
                        )
                        final_audio = np.concatenate([final_audio, wav_chunk[overlap_samples:]])
                    else:
                        final_audio = np.concatenate([final_audio, wav_chunk])

                push(f"   Chunk {chunk_start}–{chunk_start + T_latent} done.")

                # A short chunk means we consumed the remaining frames.
                if T_latent < max_frames:
                    break

        push(f"\n4. Inference complete! Output: {len(final_audio)} samples ({len(final_audio) / 44100:.1f}s)")

        # Write to temp file
        out_path = os.path.join(tempfile.gettempdir(), f"cfm_output_{int(time.time())}.wav")
        sf.write(out_path, final_audio, 44100)
        push(f"   Saved to: {out_path}")

        return out_path, "\n".join(log_lines)

    except Exception as e:
        push(f"\n❌ Inference failed: {e}")
        import traceback
        push(traceback.format_exc())
        return None, "\n".join(log_lines)


# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────────────────────────────────────

with gr.Blocks(title="CFM-based SVC", theme=gr.themes.Soft()) as app:
    gr.Markdown(
        """
        # 🎙️ CFM-based Singing Voice Conversion
        Welcome to the Continuous Normalizing Flow (CFM) voice conversion interface.
        This tool runs the real training and inference pipeline end-to-end.
+ """ + ) + + with gr.Tabs(): + # ─── Tab 1: Inference ────────────────────────────────────────────── + with gr.TabItem("1. Convert Voice (Inference)"): + with gr.Row(): + with gr.Column(): + gr.Markdown("### Source Settings") + audio_in = gr.Audio( + label="Upload Voice to Convert", + type="filepath", + ) + speaker_in = gr.File( + label="Upload Target Speaker Embedding (.npy) (Optional)", + file_types=[".npy"], + ) + + gr.Markdown("### Model & Generation") + ckpt_dropdown = gr.Dropdown( + choices=_checkpoint_choices(), + value=_checkpoint_choices()[-1] if _checkpoint_choices() else "(no checkpoints found)", + label="Checkpoint", + ) + refresh_btn = gr.Button("🔄 Refresh Checkpoints", size="sm") + solver_dropdown = gr.Dropdown( + choices=["euler", "heun", "rk4"], + value="heun", + label="ODE Solver", + ) + steps_slider = gr.Slider( + minimum=5, maximum=100, step=1, value=12, + label="Solver Steps", + ) + + convert_btn = gr.Button("🎤 Convert Audio", variant="primary") + + with gr.Column(): + gr.Markdown("### Output") + audio_out = gr.Audio(label="Converted Audio", type="filepath") + log_out = gr.Textbox(label="Process Log", lines=16, interactive=False) + + def _refresh_ckpts(): + choices = _checkpoint_choices() + return gr.Dropdown(choices=choices, value=choices[-1] if choices else "(no checkpoints found)") + + refresh_btn.click(fn=_refresh_ckpts, outputs=[ckpt_dropdown]) + + convert_btn.click( + fn=run_inference, + inputs=[audio_in, speaker_in, ckpt_dropdown, steps_slider, solver_dropdown], + outputs=[audio_out, log_out], + ) + + # ─── Tab 2: Data Preprocessing ───────────────────────────────────── + with gr.TabItem("2. Data Preparation"): + gr.Markdown("### Offline Data Preprocessing") + gr.Markdown( + "Before training, you must extract offline conditioning and codec targets " + "to prevent CPU bottlenecks." 
+ ) + + dataset_folder = gr.Textbox( + value="./dataset_raw", label="Raw Dataset Directory" + ) + preprocess_btn = gr.Button("⚙️ Start Preprocessing") + prep_log = gr.Textbox( + label="Preprocessing Log", lines=12, interactive=False + ) + + preprocess_btn.click( + fn=process_data, + inputs=[dataset_folder], + outputs=[prep_log], + queue=True, + ) + + # ─── Tab 3: Training ────────────────────────────────────────────── + with gr.TabItem("3. Model Training"): + gr.Markdown("### Train CFM Model") + gr.Markdown( + "Train the Diffusion Transformer and Projection network on " + "your pre-extracted codec targets. Output streams live below." + ) + + with gr.Row(): + epoch_slider = gr.Slider( + minimum=1, maximum=2000, step=10, value=100, label="Epochs" + ) + batch_slider = gr.Slider( + minimum=1, maximum=64, step=1, value=2, label="Batch Size" + ) + with gr.Row(): + lr_number = gr.Number(value=1e-4, label="Learning Rate") + ga_slider = gr.Slider( + minimum=1, maximum=32, step=1, value=1, + label="Gradient Accumulation Steps", + ) + save_slider = gr.Slider( + minimum=1, maximum=200, step=1, value=10, + label="Save Checkpoint Every N Epochs", + ) + + train_btn = gr.Button("🚀 Start Training", variant="primary") + train_log = gr.Textbox( + label="Training Log", lines=20, interactive=False + ) + + train_btn.click( + fn=start_training, + inputs=[epoch_slider, batch_slider, lr_number, ga_slider, save_slider], + outputs=[train_log], + queue=True, + ) + + +if __name__ == "__main__": + print("Starting Gradio Web Interface...") + app.queue().launch(server_name="0.0.0.0", server_port=7860, share=False) diff --git a/ui_f5svc.py b/ui_f5svc.py new file mode 100644 index 0000000000000000000000000000000000000000..0e1f9856831d065b517cc919fc541e2006976a2d --- /dev/null +++ b/ui_f5svc.py @@ -0,0 +1,699 @@ +""" +Gradio Web UI for F5-SVC. + +Tabs: + 1. Convert Voice (Inference) – Stage 1 or Stage 2 checkpoint + 2. Preprocess Speaker – extract features for a new speaker + 3. 
Stage 2 Training – per-speaker fine-tuning (Stage 1 checkpoint is pre-trained) +""" + +import glob +import math +import os +import re +import subprocess +import sys +import tempfile +import time + +import numpy as np +import soundfile as sf +import torch +import torchaudio.functional as TAF +import gradio as gr + +PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__)) +INFER_ROOT = os.path.join(PROJECT_ROOT, "data_svc_infer_f5") +CHKPT_DIR = os.path.join(PROJECT_ROOT, "chkpt_f5svc") +SINGER_DIR = os.path.join(PROJECT_ROOT, "data_svc", "singer") +SAMPLE_RATE = 24000 + + +# ────────────────────────────────────────────────────────────────────────────── +# Helpers +# ────────────────────────────────────────────────────────────────────────────── + +def _get_device(): + if torch.cuda.is_available(): + return torch.device("cuda") + if torch.backends.mps.is_available(): + return torch.device("mps") + return torch.device("cpu") + + +def _stage1_checkpoints(): + choices = [] + for f in sorted(glob.glob(os.path.join(CHKPT_DIR, "stage1_epoch_*.pt"))): + m = re.search(r"stage1_epoch_(\d+)\.pt$", f) + if m: + choices.append(f"stage1 epoch {m.group(1)}") + if not choices: + choices = ["(no stage1 checkpoints found)"] + return choices + + +def _stage2_checkpoints(): + choices = ["(none — Stage 1 only)"] + for f in sorted(glob.glob(os.path.join(CHKPT_DIR, "stage2_*.pt"))): + name = os.path.splitext(os.path.basename(f))[0] # e.g. 
stage2_obama + choices.append(name) + return choices + + +def _speaker_choices(): + choices = [] + if os.path.isdir(SINGER_DIR): + for f in sorted(glob.glob(os.path.join(SINGER_DIR, "*.spk.npy"))): + choices.append(os.path.splitext(os.path.basename(f).replace(".spk", ""))[0]) + return choices if choices else ["(no speakers found)"] + + +def _resolve_stage1_path(choice: str) -> str | None: + m = re.search(r"stage1 epoch (\d+)", choice) + if m: + return os.path.join(CHKPT_DIR, f"stage1_epoch_{m.group(1)}.pt") + return None + + +def _resolve_stage2_path(choice: str) -> str | None: + if choice.startswith("(none") or not choice: + return None + return os.path.join(CHKPT_DIR, f"{choice}.pt") + + +# ────────────────────────────────────────────────────────────────────────────── +# Model cache (avoids reloading on every inference) +# ────────────────────────────────────────────────────────────────────────────── + +_MODEL_CACHE: dict = {} + + +def _detect_lora_rank(state_dict: dict) -> int: + """Infer LoRA rank from the lora_A weight shape in a Stage 1 state dict.""" + for k, v in state_dict.items(): + if "lora_A" in k and hasattr(v, "ndim") and v.ndim == 2: + return v.shape[0] + return 16 # safe fallback + + +def _load_model(stage1_path: str, stage2_path: str | None, device): + cache_key = (stage1_path, stage2_path, str(device)) + if cache_key in _MODEL_CACHE: + return _MODEL_CACHE[cache_key] + + from models.f5_svc import build_f5svc + f5tts_ckpt = os.path.join(CHKPT_DIR, "model_1200000.safetensors") + + s1_raw = torch.load(stage1_path, map_location=device, weights_only=False) + s1_weights = s1_raw.get("trainable", s1_raw.get("model", s1_raw)) + lora_rank = s1_raw.get("lora_rank", _detect_lora_rank(s1_weights)) + + model = build_f5svc(f5tts_ckpt_path=f5tts_ckpt, lora_rank=lora_rank).to(device) + + # Load Stage 1 + model.load_state_dict(s1_weights, strict=False) + + # Optionally load Stage 2 on top + if stage2_path and os.path.isfile(stage2_path): + from models.lora_utils 
import inject_lora_stage2 + s2 = torch.load(stage2_path, map_location=device, weights_only=False) + stage2_rank = s2.get("stage2_rank", 4) + inject_lora_stage2(model.transformer, rank=stage2_rank) + model.load_state_dict(s2["stage2"], strict=False) + + model.eval() + _MODEL_CACHE[cache_key] = model + return model + + +# ────────────────────────────────────────────────────────────────────────────── +# 1. Source feature extraction for inference +# ────────────────────────────────────────────────────────────────────────────── + +def _save_wav_24k(src_path: str, dst_path: str): + arr, sr = sf.read(src_path, always_2d=True) + arr = arr.mean(axis=1).astype(np.float32) + if sr != SAMPLE_RATE: + t = torch.tensor(arr).unsqueeze(0) + t = TAF.resample(t, sr, SAMPLE_RATE) + arr = t.squeeze(0).numpy() + sf.write(dst_path, arr, SAMPLE_RATE, subtype="PCM_16") + + +def _extract_features(wav_path: str, speaker_name: str, file_id: str, log_fn): + """Extract PPG / HuBERT / F0 / speaker for one file into INFER_ROOT.""" + waves_dir = os.path.join(INFER_ROOT, "waves-32k", speaker_name) + os.makedirs(waves_dir, exist_ok=True) + dst_wav = os.path.join(waves_dir, f"{file_id}.wav") + _save_wav_24k(wav_path, dst_wav) + log_fn(f" Saved 24kHz wav → {dst_wav}") + + steps = [ + ("Whisper PPG", [ + sys.executable, "prepare/preprocess_ppg.py", + "-w", os.path.join(INFER_ROOT, "waves-32k"), + "-p", os.path.join(INFER_ROOT, "whisper"), + ]), + ("HuBERT", [ + sys.executable, "prepare/preprocess_hubert.py", + "-w", os.path.join(INFER_ROOT, "waves-32k"), + "-v", os.path.join(INFER_ROOT, "hubert"), + "-t", "1", + ]), + ("F0 (CREPE)", [ + sys.executable, "prepare/preprocess_crepe.py", + "-w", os.path.join(INFER_ROOT, "waves-32k"), + "-p", os.path.join(INFER_ROOT, "pitch"), + "-t", "1", + ]), + ] + + for step_name, cmd in steps: + log_fn(f" → {step_name}...") + result = subprocess.run(cmd, capture_output=True, text=True, cwd=PROJECT_ROOT) + if result.returncode != 0: + log_fn(f" Warning: {step_name} 
exited {result.returncode}") + if result.stderr: + log_fn(f" {result.stderr[:200]}") + else: + log_fn(f" {step_name} done.") + + +def _load_source_features(speaker_name: str, file_id: str, f0_shift: float, device): + """Load extracted features and compute mel for source wav.""" + from svc_data.mel_svc_dataset import ( + SAMPLE_RATE as SR, HOP_LENGTH, N_MELS, _build_mel_transform, _resample_to, + ) + + wav_path = os.path.join(INFER_ROOT, "waves-32k", speaker_name, f"{file_id}.wav") + arr, sr = sf.read(wav_path, always_2d=True) + arr = arr.mean(axis=1).astype(np.float32) + if sr != SR: + t = torch.tensor(arr).unsqueeze(0) + t = TAF.resample(t, sr, SR) + arr = t.squeeze(0).numpy() + + wav_t = torch.tensor(arr).unsqueeze(0) + mel_tf = _build_mel_transform() + mel = mel_tf(wav_t).squeeze(0).T + mel = torch.log(mel.clamp(min=1e-5)) + t_mel = mel.shape[0] + + def _load_npy(path, fallback_shape): + if os.path.isfile(path): + return torch.tensor(np.load(path)).float() + return torch.zeros(*fallback_shape) + + ppg_path = os.path.join(INFER_ROOT, "whisper", speaker_name, f"{file_id}.ppg.npy") + hbt_path = os.path.join(INFER_ROOT, "hubert", speaker_name, f"{file_id}.vec.npy") + f0_path = os.path.join(INFER_ROOT, "pitch", speaker_name, f"{file_id}.pit.npy") + + ppg = _load_npy(ppg_path, (t_mel, 1280)) + hubert = _load_npy(hbt_path, (t_mel, 256)) + f0_raw = _load_npy(f0_path, (t_mel,)) + + if f0_shift != 0.0: + f0_raw = f0_raw * math.pow(2.0, f0_shift / 12.0) + + f0 = torch.where(f0_raw > 0, + torch.log(f0_raw.clamp(min=1.0)), + torch.zeros_like(f0_raw)).unsqueeze(-1) + + ppg = _resample_to(ppg, t_mel) + hubert = _resample_to(hubert, t_mel) + f0 = _resample_to(f0, t_mel) + + return (mel.unsqueeze(0).to(device), + ppg.unsqueeze(0).to(device), + hubert.unsqueeze(0).to(device), + f0.unsqueeze(0).to(device)) + + +# ────────────────────────────────────────────────────────────────────────────── +# 2. 
def run_inference(
    source_audio,
    speaker_choice,
    custom_spk_file,
    ref_audio,
    stage1_choice,
    stage2_choice,
    solver,
    steps,
    ref_sec,
    f0_shift,
    inpaint_mode,
):
    """End-to-end F5-SVC conversion for the Gradio "Convert Voice" tab.

    Pipeline: extract source features → load model → resolve speaker
    embedding → build conditioning (optionally prefixed with reference mel
    for inpainting) → ODE-sample a mel → decode with Vocos.

    Returns (output_wav_path | None, log_text).
    """
    if source_audio is None:
        return None, "Error: please upload a source audio file."
    # Guard `not stage1_choice` first: an unpopulated dropdown passes None,
    # and None.startswith(...) would raise AttributeError.
    if not stage1_choice or stage1_choice.startswith("(no stage1"):
        return None, "Error: no Stage 1 checkpoint found. Train first."

    log_lines = []

    def push(line):
        # Accumulate the process log; the joined text is returned to the UI.
        log_lines.append(line)
        return "\n".join(log_lines)

    device = _get_device()
    push(f"Device: {device}")

    # ── Step 1: feature extraction ──
    push("\n1. Extracting source features...")
    # Sanitize the file stem so it is safe as a path component.
    file_id = re.sub(r"[^\w\-.]", "_", os.path.splitext(os.path.basename(source_audio))[0])
    speaker_name = "source_infer"

    try:
        _extract_features(source_audio, speaker_name, file_id, lambda l: push(f"  {l}"))
    except Exception as e:
        push(f"  Feature extraction failed: {e}")
        return None, "\n".join(log_lines)

    # ── Step 2: load model ──
    push("\n2. Loading model...")
    stage1_path = _resolve_stage1_path(stage1_choice)
    stage2_path = _resolve_stage2_path(stage2_choice)
    try:
        model = _load_model(stage1_path, stage2_path, device)
        label = stage1_choice
        if stage2_path:
            label += f" + {stage2_choice}"
        push(f"  Loaded: {label}")
    except Exception as e:
        import traceback
        push(f"  Model load failed: {e}\n{traceback.format_exc()}")
        return None, "\n".join(log_lines)

    # ── Step 3: resolve speaker embedding ──
    # Priority: uploaded .npy > dropdown speaker > zero vector.
    push("\n3. Resolving speaker embedding...")
    try:
        if custom_spk_file is not None:
            spk_arr = np.load(custom_spk_file)
            push("  Using uploaded speaker embedding.")
        elif speaker_choice and not speaker_choice.startswith("(no speakers"):
            spk_path = os.path.join(SINGER_DIR, f"{speaker_choice}.spk.npy")
            spk_arr = np.load(spk_path)
            push(f"  Using speaker: {speaker_choice}")
        else:
            push("  Warning: no speaker embedding — using zeros.")
            spk_arr = np.zeros(256, dtype=np.float32)
        spk = torch.tensor(spk_arr).float().unsqueeze(0).to(device)
    except Exception as e:
        push(f"  Speaker load failed: {e}")
        return None, "\n".join(log_lines)

    # ── Step 4: load source features and build sequence ──
    push("\n4. Building conditioning sequence...")
    try:
        from svc_data.mel_svc_dataset import (
            SAMPLE_RATE as SR, HOP_LENGTH, N_MELS, _build_mel_transform,
        )

        source_mel, ppg, hubert, f0 = _load_source_features(
            speaker_name, file_id, float(f0_shift), device
        )
        T = source_mel.shape[1]
        push(f"  Source: {T} mel frames ({T * HOP_LENGTH / SR:.1f}s)")

        # Reference audio mel (target-speaker timbre prompt).
        ref_frames = 0
        ref_mel_raw = None
        ref_audio_path = ref_audio
        if ref_audio_path and os.path.isfile(ref_audio_path):
            arr, sr = sf.read(ref_audio_path, always_2d=True)
            arr = arr.mean(axis=1).astype(np.float32)
            if sr != SR:
                t = torch.tensor(arr).unsqueeze(0)
                arr = TAF.resample(t, sr, SR).squeeze(0).numpy()
            wav_t = torch.tensor(arr).unsqueeze(0)
            mel_tf = _build_mel_transform()
            ref_mel_raw = torch.log(mel_tf(wav_t).clamp(min=1e-5)).squeeze(0).T
            # Cap the prompt at ref_sec seconds (or the clip length).
            ref_frames = min(int(float(ref_sec) * SR / HOP_LENGTH), ref_mel_raw.shape[0])
            push(f"  Reference: {ref_frames} frames ({ref_frames * HOP_LENGTH / SR:.1f}s)")

        T_total = ref_frames + T

        # Build cond_mel: [ref_mel | zeros]
        cond_mel = torch.zeros(1, T_total, N_MELS, device=device)
        if ref_frames > 0:
            cond_mel[0, :ref_frames] = ref_mel_raw[:ref_frames].to(device)

        # Pad feature tensors with zeros for the reference region so they
        # stay time-aligned with cond_mel.
        if ref_frames > 0:
            pad_ppg = torch.zeros(1, ref_frames, ppg.shape[2], device=device)
            pad_hbt = torch.zeros(1, ref_frames, hubert.shape[2], device=device)
            pad_f0 = torch.zeros(1, ref_frames, f0.shape[2], device=device)
            ppg_full = torch.cat([pad_ppg, ppg], dim=1)
            hubert_full = torch.cat([pad_hbt, hubert], dim=1)
            f0_full = torch.cat([pad_f0, f0], dim=1)
        else:
            ppg_full, hubert_full, f0_full = ppg, hubert, f0

        # Inpaint mask marks the known (reference) frames.
        inpaint_mask = None
        if ref_frames > 0:
            inpaint_mask = torch.zeros(1, T_total, dtype=torch.bool, device=device)
            inpaint_mask[0, :ref_frames] = True

    except Exception as e:
        import traceback
        push(f"  Feature loading failed: {e}\n{traceback.format_exc()}")
        return None, "\n".join(log_lines)

    # ── Step 5: ODE sampling ──
    push(f"\n5. ODE sampling ({solver.upper()}, {steps} steps, T={T_total})...")
    try:
        from infer_f5_svc import ode_sample
        pred_mel = ode_sample(
            model=model,
            ref_mel=cond_mel,
            ppg=ppg_full,
            hubert=hubert_full,
            f0=f0_full,
            spk=spk,
            inpaint_mask=inpaint_mask,
            steps=int(steps),
            method=solver,
            inpaint_mode=inpaint_mode,
            device=device,
        )
        # Trim the reference region — only the converted source is returned.
        if ref_frames > 0:
            pred_mel = pred_mel[:, ref_frames:]
        push(f"  Output: {pred_mel.shape[1]} frames ({pred_mel.shape[1] * HOP_LENGTH / SR:.1f}s)")
    except Exception as e:
        import traceback
        push(f"  ODE sampling failed: {e}\n{traceback.format_exc()}")
        return None, "\n".join(log_lines)

    # ── Step 6: Vocos decode ──
    push("\n6. Decoding mel → audio (Vocos)...")
    try:
        from vocos import Vocos
        vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
        vocos.eval()
        with torch.no_grad():
            mel_input = pred_mel.transpose(1, 2)  # (1, N_MELS, T)
            wav_out = vocos.decode(mel_input).squeeze(0).cpu().numpy()

        out_path = os.path.join(tempfile.gettempdir(), f"f5svc_out_{int(time.time())}.wav")
        sf.write(out_path, wav_out, SR)
        push(f"  Done. {len(wav_out) / SR:.1f}s saved to {out_path}")
        return out_path, "\n".join(log_lines)
    except Exception as e:
        import traceback
        push(f"  Vocos decode failed: {e}\n{traceback.format_exc()}")
        return None, "\n".join(log_lines)
def preprocess_speaker(audio_dir, speaker_id):
    """Generator: run the offline feature-extraction pipeline for one speaker.

    Streams log text to the Gradio textbox. Expects wav files already placed
    under data_svc/waves-32k/<speaker_id>/ (a note is emitted otherwise).
    """
    # `not audio_dir` first: os.path.isdir(None) raises TypeError, and an
    # empty Gradio textbox can yield None/"".
    if not audio_dir or not os.path.isdir(audio_dir):
        yield f"Error: directory '{audio_dir}' not found."
        return
    wav_count = sum(
        1 for _, _, fs in os.walk(audio_dir)
        for f in fs if f.lower().endswith(".wav")
    )
    if wav_count == 0:
        yield f"Error: no .wav files found under {audio_dir}."
        return

    log_lines = []

    def push(line):
        # Bounded log: drop the oldest 100 lines past 500 to keep the UI snappy.
        log_lines.append(line)
        if len(log_lines) > 500:
            del log_lines[:100]
        return "\n".join(log_lines)

    # Guard against a None speaker_id from the UI; fall back to "speaker".
    spk_id = (speaker_id or "").strip() or "speaker"
    yield push(f"Preprocessing speaker '{spk_id}' from {audio_dir} ({wav_count} wavs)")

    # The preprocessing scripts resolve paths relative to data_svc/ — warn if
    # the given directory is not already the expected location.
    abs_audio = os.path.abspath(audio_dir)
    target_waves = os.path.join(PROJECT_ROOT, "data_svc", "waves-32k", spk_id)
    if abs_audio != os.path.abspath(target_waves):
        yield push(f"Note: pass audio already inside data_svc/waves-32k/{spk_id}/ for correct path resolution.")

    steps = [
        ("Speaker embeddings", [
            sys.executable, "-u", "prepare/preprocess_speaker.py",
            "data_svc/waves-32k", "data_svc/speaker",
        ]),
        ("Averaged speaker .spk.npy", [
            sys.executable, "-u", "prepare/preprocess_speaker_ave.py",
            "data_svc/speaker", "data_svc/singer",
        ]),
        ("Whisper PPG", [
            sys.executable, "-u", "prepare/preprocess_ppg.py",
            "-w", "data_svc/waves-32k", "-p", "data_svc/whisper",
        ]),
        ("HuBERT", [
            sys.executable, "-u", "prepare/preprocess_hubert.py",
            "-w", "data_svc/waves-32k", "-v", "data_svc/hubert", "-t", "2",
        ]),
        ("F0 (CREPE)", [
            sys.executable, "-u", "prepare/preprocess_crepe.py",
            "-w", "data_svc/waves-32k", "-p", "data_svc/pitch", "-t", "2",
        ]),
    ]

    for step_name, cmd in steps:
        yield push(f"\n--- {step_name} ---")
        # -u + line buffering streams subprocess output live into the log.
        process = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            text=True, bufsize=1, cwd=PROJECT_ROOT,
        )
        assert process.stdout is not None
        for raw_line in process.stdout:
            line = raw_line.rstrip()
            if line:
                yield push(line)
        rc = process.wait()
        if rc != 0:
            yield push(f"  Warning: exited with code {rc}")

    yield push(f"\nPreprocessing complete. Speaker embedding: data_svc/singer/{spk_id}.spk.npy")


# ──────────────────────────────────────────────────────────────────────────────
# 4. Stage 2 Training (streaming subprocess)
# ──────────────────────────────────────────────────────────────────────────────

def start_stage2(stage1_choice, speaker_id, audio_dir, epochs, lr, stage2_rank):
    """Generator: launch train_f5_stage2.py and stream its output to the UI."""
    log_lines = []

    def push(line):
        # Bounded log: drop the oldest 200 lines past 600.
        log_lines.append(line)
        if len(log_lines) > 600:
            del log_lines[:200]
        return "\n".join(log_lines)

    stage1_path = _resolve_stage1_path(stage1_choice)
    if stage1_path is None:
        yield "Error: please select a valid Stage 1 checkpoint."
        return

    # Guard against a None speaker_id from an empty textbox.
    spk_id = (speaker_id or "").strip()
    if not spk_id:
        yield "Error: please enter a speaker ID."
        return

    cmd = [
        sys.executable, "-u", "train_f5_stage2.py",
        "--stage1_ckpt", stage1_path,
        "--audio_dir", audio_dir,
        "--speaker_id", spk_id,
        "--epochs", str(int(epochs)),
        "--lr", str(lr),
        "--stage2_rank", str(int(stage2_rank)),
    ]

    yield push(f"Launching Stage 2 for speaker '{spk_id}': {' '.join(cmd)}")
    process = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
        text=True, bufsize=1, cwd=PROJECT_ROOT,
    )
    assert process.stdout is not None
    for raw_line in process.stdout:
        line = raw_line.rstrip()
        if line:
            yield push(line)

    rc = process.wait()
    if rc == 0:
        yield push(f"\nStage 2 complete. Checkpoint: chkpt_f5svc/stage2_{spk_id}.pt")
    else:
        yield push(f"\nStage 2 failed (exit code {rc}).")
# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────────────────────────────────────
# Top-level UI script: builds the three-tab F5-SVC interface and wires each
# control to the functions defined above (run_inference, preprocess_speaker,
# start_stage2).

with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
    gr.Markdown(
        """
        # F5-SVC — Singing Voice Conversion
        F5-TTS backbone fine-tuned with LoRA for content-preserving voice conversion.
        """
    )

    with gr.Tabs():

        # ─── Tab 1: Inference ─────────────────────────────────────────────────
        # Source audio + target speaker + checkpoint selection → converted audio.
        with gr.TabItem("1. Convert Voice"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Source")
                    audio_in = gr.Audio(label="Source Singing", type="filepath")
                    ref_audio_in = gr.Audio(
                        label="Reference Audio (target speaker timbre)",
                        type="filepath",
                    )

                    gr.Markdown("### Target Speaker")
                    speaker_dd = gr.Dropdown(
                        choices=_speaker_choices(),
                        value=_speaker_choices()[0],
                        label="Speaker (from data_svc/singer/)",
                    )
                    custom_spk = gr.File(
                        label="Upload custom .spk.npy (overrides dropdown)",
                        file_types=[".npy"],
                    )

                    gr.Markdown("### Checkpoint")
                    # Stage 1 defaults to the newest checkpoint (sorted, last = latest);
                    # Stage 2 defaults to the "(none — Stage 1 only)" sentinel.
                    stage1_dd = gr.Dropdown(
                        choices=_stage1_checkpoints(),
                        value=_stage1_checkpoints()[-1],
                        label="Stage 1 Checkpoint",
                    )
                    stage2_dd = gr.Dropdown(
                        choices=_stage2_checkpoints(),
                        value=_stage2_checkpoints()[0],
                        label="Stage 2 Checkpoint (optional)",
                    )
                    refresh_btn = gr.Button("Refresh Checkpoints & Speakers", size="sm")

                with gr.Column():
                    gr.Markdown("### Generation Settings")
                    solver_dd = gr.Dropdown(
                        choices=["euler", "heun", "rk4"],
                        value="heun",
                        label="ODE Solver",
                    )
                    steps_sl = gr.Slider(
                        minimum=4, maximum=128, step=1, value=32,
                        label="ODE Steps",
                    )
                    ref_sec_sl = gr.Slider(
                        minimum=1.0, maximum=10.0, step=0.5, value=3.0,
                        label="Reference Seconds",
                    )
                    f0_shift_sl = gr.Slider(
                        minimum=-12, maximum=12, step=1, value=0,
                        label="F0 Shift (semitones)",
                    )
                    inpaint_dd = gr.Dropdown(
                        choices=["none", "soft", "hard"],
                        value="none",
                        label="Inpaint Mode (none recommended)",
                    )

                    convert_btn = gr.Button("Convert Audio", variant="primary")

                    gr.Markdown("### Output")
                    audio_out = gr.Audio(label="Converted Audio", type="filepath")
                    log_out = gr.Textbox(label="Log", lines=16, interactive=False)

            def _refresh():
                # Re-scan checkpoints and speakers; return updated dropdowns
                # in the same order as the outputs list below.
                s1 = _stage1_checkpoints()
                s2 = _stage2_checkpoints()
                spk = _speaker_choices()
                return (
                    gr.Dropdown(choices=s1, value=s1[-1]),
                    gr.Dropdown(choices=s2, value=s2[0]),
                    gr.Dropdown(choices=spk, value=spk[0]),
                )

            refresh_btn.click(
                fn=_refresh,
                outputs=[stage1_dd, stage2_dd, speaker_dd],
            )
            # Input order must match run_inference's parameter order.
            convert_btn.click(
                fn=run_inference,
                inputs=[
                    audio_in, speaker_dd, custom_spk, ref_audio_in,
                    stage1_dd, stage2_dd,
                    solver_dd, steps_sl, ref_sec_sl, f0_shift_sl, inpaint_dd,
                ],
                outputs=[audio_out, log_out],
            )

        # ─── Tab 2: Preprocess Speaker ────────────────────────────────────────
        # Streams preprocess_speaker's generator output into the log textbox.
        with gr.TabItem("2. Preprocess Speaker"):
            gr.Markdown(
                "Extract PPG / HuBERT / F0 / speaker embeddings for a new speaker.\n\n"
                "**Place wav files at** `data_svc/waves-32k/<speaker_id>/` before running."
            )
            with gr.Row():
                audio_dir_tb = gr.Textbox(
                    value="./data_svc/waves-32k/obama",
                    label="Audio directory (data_svc/waves-32k/<speaker_id>/)",
                )
                speaker_id_tb = gr.Textbox(
                    value="obama",
                    label="Speaker ID",
                )
            prep_btn = gr.Button("Run Preprocessing", variant="primary")
            prep_log = gr.Textbox(label="Log", lines=18, interactive=False)

            # queue=True so the generator can stream progress live.
            prep_btn.click(
                fn=preprocess_speaker,
                inputs=[audio_dir_tb, speaker_id_tb],
                outputs=[prep_log],
                queue=True,
            )

        # ─── Tab 3: Stage 2 Training ──────────────────────────────────────────
        # Streams start_stage2's subprocess output into the training log.
        with gr.TabItem("3. Stage 2 Training"):
            gr.Markdown(
                "### Per-Speaker Adaptation\n"
                "Fine-tunes `spk_proj` + stacked LoRA A₂B₂ (rank-4) on a target speaker's "
                "speech clips. Requires a Stage 1 checkpoint and preprocessed speaker features."
            )
            with gr.Row():
                s2_stage1_dd = gr.Dropdown(
                    choices=_stage1_checkpoints(),
                    value=_stage1_checkpoints()[-1],
                    label="Base Stage 1 Checkpoint",
                )
                s2_spk_tb = gr.Textbox(value="obama", label="Speaker ID")
                s2_audio_tb = gr.Textbox(
                    value="./data_svc/waves-32k/obama",
                    label="Speaker Audio Directory",
                )
            with gr.Row():
                s2_epoch_sl = gr.Slider(10, 200, step=5, value=50, label="Epochs")
                s2_lr_num = gr.Number(value=5e-5, label="Learning Rate")
                s2_rank_sl = gr.Slider(1, 16, step=1, value=4, label="Stage 2 LoRA Rank")

            s2_btn = gr.Button("Start Stage 2 Training", variant="primary")
            s2_log = gr.Textbox(label="Training Log", lines=20, interactive=False)

            s2_btn.click(
                fn=start_stage2,
                inputs=[s2_stage1_dd, s2_spk_tb, s2_audio_tb, s2_epoch_sl, s2_lr_num, s2_rank_sl],
                outputs=[s2_log],
                queue=True,
            )


if __name__ == "__main__":
    print("Starting F5-SVC Web Interface...")
    # Port 7861 so this UI can run alongside the v1 UI on 7860.
    app.queue().launch(server_name="0.0.0.0", server_port=7861, share=False)
@torch.no_grad()
def infer_v2(wave_path, checkpoint_path, output_path="output_v2.wav", steps=50):
    """v2 SVC inference: Stable Audio Open latents + LoRA, Euler ODE sampling.

    Args:
        wave_path: source audio file; pre-extracted features are expected
            next to it (<name>.ppg.npy / <name>.hubert.npy / <name>.f0.npy,
            plus an optional speaker.npy in the same directory).
        checkpoint_path: v2 checkpoint containing "svc_adapter" and
            "lora_layers" state dicts (and optionally "ema_params").
        output_path: where the mono result is written.
        steps: number of Euler ODE integration steps.

    Returns:
        output_path.

    Raises:
        FileNotFoundError: when a required feature file is missing.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # ---- 1. Load Stable Audio ----
    # NOTE: generate_diffusion_cond is intentionally not imported — sampling
    # is done with the manual Euler loop below.
    print("Loading pretrained Stable Audio Open...")
    from stable_audio_tools import get_pretrained_model

    stable_audio_model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
    stable_audio_model = stable_audio_model.to(device).eval()

    cross_attn_dim = model_config.get("model", {}).get("cross_attn_dim", 768)
    # Use the model's own sample rate everywhere instead of hard-coding 44100
    # (identical for stable-audio-open-1.0, but correct for other configs).
    sample_rate = model_config.get("sample_rate", 44100)

    # ---- 2. Build SVC model ----
    svc_adapter = SVCCondAdapter(cross_attn_dim=cross_attn_dim).to(device)

    model = StableAudioSVC(
        stable_audio_model=stable_audio_model,
        svc_adapter=svc_adapter,
        lora_config={"rank": 16, "alpha": 32, "target_modules": ["to_q", "to_k", "to_v"]},
    )

    # ---- 3. Load trained weights ----
    print(f"Loading v2 checkpoint from {checkpoint_path}...")
    ckpt = torch.load(checkpoint_path, map_location=device, weights_only=False)
    model.svc_adapter.load_state_dict(ckpt["svc_adapter"])
    model.lora_layers.load_state_dict(ckpt["lora_layers"])

    # Optionally overwrite trainable params with their EMA shadow copies.
    if "ema_params" in ckpt:
        print("Using EMA weights for inference")
        for name, param in model.named_parameters():
            if name in ckpt["ema_params"]:
                param.data.copy_(ckpt["ema_params"][name])

    model.eval()

    # ---- 4. Load pre-extracted conditioning features ----
    # (The previously loaded-but-unused 16 kHz waveform was dead code and has
    # been removed; on-the-fly extraction is not implemented — features must
    # exist on disk.)
    print(f"Extracting features from {wave_path}...")

    def _load_feature(suffix: str, label: str) -> torch.Tensor:
        # One feature file per stream, stored next to the source wav.
        path = wave_path.replace(".wav", suffix)
        if not os.path.exists(path):
            raise FileNotFoundError(f"{label} not found at {path}")
        return torch.from_numpy(np.load(path)).float()

    ppg = _load_feature(".ppg.npy", "PPG").unsqueeze(0).to(device)
    hubert = _load_feature(".hubert.npy", "HuBERT").unsqueeze(0).to(device)
    f0 = _load_feature(".f0.npy", "F0").unsqueeze(0).unsqueeze(-1).to(device)

    # Speaker embedding (zero vector fallback keeps inference runnable).
    spk_path = os.path.join(os.path.dirname(wave_path), "speaker.npy")
    if os.path.exists(spk_path):
        spk = torch.from_numpy(np.load(spk_path)).float().unsqueeze(0).to(device)
    else:
        spk = torch.zeros(1, 256).to(device)
        print("WARNING: No speaker embedding found, using zero vector")

    # ---- 5. Determine latent length ----
    # The source is encoded only to obtain the target latent shape.
    waveform_44k, sr = torchaudio.load(wave_path)
    if sr != sample_rate:
        waveform_44k = torchaudio.transforms.Resample(sr, sample_rate)(waveform_44k)
    if waveform_44k.shape[0] == 1:
        waveform_44k = waveform_44k.repeat(2, 1)  # VAE expects stereo

    waveform_44k = waveform_44k.unsqueeze(0).to(device)
    z_source = model.encode_audio(waveform_44k)
    T_latent = z_source.shape[2]
    latent_dim = z_source.shape[1]

    print(f"Source audio: {waveform_44k.shape}, latent: ({latent_dim}, {T_latent})")

    # ---- 6. Get SVC conditioning ----
    c = model.get_conditioning(ppg, hubert, f0, spk, T_latent)

    # ---- 7. ODE sampling (Euler method) ----
    # Integrate dz/dt = v(z, t) from t=0 (noise) to t=1 with fixed step dt.
    print(f"Sampling with {steps} ODE steps...")
    z = torch.randn(1, latent_dim, T_latent, device=device)
    dt = 1.0 / steps
    for i in range(steps):
        t = torch.full((1,), i / steps, device=device)

        v_pred = model.dit(
            z, t,
            cross_attn_cond=c["cross_attn_cond"],
            cross_attn_mask=c["cross_attn_mask"],
        )

        z = z + v_pred * dt  # Euler step

        if (i + 1) % 10 == 0:
            print(f"  Step {i+1}/{steps}")

    # ---- 8. Decode to audio ----
    print("Decoding latent to audio...")
    audio_out = model.decode_latent(z)  # (1, 2, T_samples)

    audio_out = audio_out.squeeze(0).mean(0).cpu()  # stereo -> mono
    torchaudio.save(output_path, audio_out.unsqueeze(0), sample_rate)

    print(f"Saved output to {output_path}")
    return output_path
class StableAudioSVC(nn.Module):
    """Pretrained Stable Audio Open wrapped for singing voice conversion.

    Components:
      - svc_adapter: SVCCondAdapter (trained) — replaces the T5 text encoder
      - dit: Stable Audio's DiT with LoRA (LoRA trained, base frozen)
      - pretransform: Stable Audio's VAE (fully frozen)
    """

    def __init__(self, stable_audio_model, svc_adapter, lora_config=None):
        """
        Args:
            stable_audio_model: pretrained model exposing ``.model`` (DiT),
                ``.pretransform`` (VAE) and ``.conditioner`` (T5 conditioner).
            svc_adapter: module mapping SVC features to cross-attn conditioning.
            lora_config: optional dict with "rank", "alpha", "target_modules".
                When None, only the adapter is trainable.
        """
        super().__init__()

        self.svc_adapter = svc_adapter

        # Components lifted out of the pretrained Stable Audio model.
        self.dit = stable_audio_model.model                  # diffusion transformer
        self.pretransform = stable_audio_model.pretransform  # VAE encoder/decoder
        self.conditioner = stable_audio_model.conditioner    # kept for reference, unused

        # Freeze every pretrained weight; only adapter + LoRA train.
        for component in (self.dit, self.pretransform, self.conditioner):
            for p in component.parameters():
                p.requires_grad_(False)

        # FIX: always create the container so trainable_parameters() and
        # trainable_param_count() work even when LoRA is disabled
        # (previously an AttributeError when lora_config was None).
        self.lora_layers = nn.ModuleList()
        if lora_config is not None:
            self._apply_lora(lora_config)

        # Diffusion/sampling hyper-params from the pretrained model, if present.
        self.diffusion_config = getattr(stable_audio_model, 'diffusion', None)

    def _apply_lora(self, config):
        """Wrap matching nn.Linear layers of the DiT with LoRA adapters.

        Manual LoRA implementation to avoid a PEFT dependency.
        """
        rank = config.get("rank", 16)
        alpha = config.get("alpha", 32)
        target_modules = config.get("target_modules", ["to_q", "to_k", "to_v", "to_out"])

        # FIX: snapshot the matches before mutating the module tree —
        # calling setattr while iterating the live named_modules() generator
        # mutates the registry being walked.
        targets = [
            (name, module)
            for name, module in self.dit.named_modules()
            if isinstance(module, nn.Linear)
            and any(target in name for target in target_modules)
        ]

        for name, module in targets:
            lora = LoRALayer(module, rank=rank, alpha=alpha)
            self.lora_layers.append(lora)
            # Replace the original linear inside its parent module.
            *parent_path, child_name = name.split(".")
            parent = self.dit
            for part in parent_path:
                if part:
                    parent = getattr(parent, part)
            setattr(parent, child_name, lora)

        n_lora_params = sum(p.numel() for p in self.lora_layers.parameters())
        print(f"Applied LoRA to {len(self.lora_layers)} layers, {n_lora_params:,} trainable params")

    @torch.no_grad()
    def encode_audio(self, audio: torch.Tensor) -> torch.Tensor:
        """Encode raw audio to VAE latent space.

        Args:
            audio: (B, C, T_samples) raw waveform at 44.1kHz

        Returns:
            z: (B, latent_dim, T_latent) latent representation
        """
        return self.pretransform.encode(audio)

    @torch.no_grad()
    def decode_latent(self, z: torch.Tensor) -> torch.Tensor:
        """Decode VAE latent back to audio.

        Args:
            z: (B, latent_dim, T_latent) latent representation

        Returns:
            audio: (B, C, T_samples) reconstructed waveform
        """
        return self.pretransform.decode(z)

    def get_conditioning(self, ppg, hubert, f0, spk, target_seq_len):
        """Build DiT cross-attention conditioning from SVC features.

        Returns a dict with 'cross_attn_cond' (B, T, cross_attn_dim) and
        'cross_attn_mask' (None: SVC frames are never masked).
        NOTE(review): the exact dict layout Stable Audio's DiT expects
        depends on its internals — confirm against stable-audio-tools.
        """
        c = self.svc_adapter(ppg, hubert, f0, spk, target_seq_len)
        return {
            "cross_attn_cond": c,
            "cross_attn_mask": None,
        }

    def trainable_parameters(self):
        """Parameters that should be optimized: adapter + LoRA A/B matrices."""
        params = list(self.svc_adapter.parameters())
        for layer in self.lora_layers:
            params.extend(layer.trainable_parameters())
        return params

    def trainable_param_count(self):
        """Number of trainable scalars (adapter + LoRA)."""
        return sum(p.numel() for p in self.trainable_parameters())


class LoRALayer(nn.Module):
    """Low-Rank Adaptation layer wrapping an existing nn.Linear.

    Implements: output = original_linear(x) + (x @ A^T @ B^T) * (alpha / rank)
    Only A and B are trainable. Original weights are frozen.
    """

    def __init__(self, original_linear: nn.Linear, rank: int = 16, alpha: float = 32):
        super().__init__()
        self.original = original_linear
        self.rank = rank
        self.alpha = alpha
        self.scaling = alpha / rank

        in_features = original_linear.in_features
        out_features = original_linear.out_features

        # LoRA decomposition: W' = W + (alpha/r) * B @ A
        self.lora_A = nn.Linear(in_features, rank, bias=False)
        self.lora_B = nn.Linear(rank, out_features, bias=False)

        # A: Kaiming init; B: zeros, so the adapter starts as a no-op.
        nn.init.kaiming_uniform_(self.lora_A.weight)
        nn.init.zeros_(self.lora_B.weight)

        # Freeze the wrapped linear's weights.
        for p in self.original.parameters():
            p.requires_grad_(False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Frozen base output plus trained low-rank residual.
        return self.original(x) + self.lora_B(self.lora_A(x)) * self.scaling

    def trainable_parameters(self):
        return list(self.lora_A.parameters()) + list(self.lora_B.parameters())
+ """ + + def __init__(self, original_linear: nn.Linear, rank: int = 16, alpha: float = 32): + super().__init__() + self.original = original_linear + self.rank = rank + self.alpha = alpha + self.scaling = alpha / rank + + in_features = original_linear.in_features + out_features = original_linear.out_features + + # LoRA decomposition: W' = W + (alpha/r) * B @ A + self.lora_A = nn.Linear(in_features, rank, bias=False) + self.lora_B = nn.Linear(rank, out_features, bias=False) + + # Initialize A with Kaiming, B with zeros (so LoRA starts as identity) + nn.init.kaiming_uniform_(self.lora_A.weight) + nn.init.zeros_(self.lora_B.weight) + + # Freeze original weights + for p in self.original.parameters(): + p.requires_grad_(False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Original forward (frozen) + original_out = self.original(x) + # LoRA residual (trained) + lora_out = self.lora_B(self.lora_A(x)) * self.scaling + return original_out + lora_out + + def trainable_parameters(self): + return list(self.lora_A.parameters()) + list(self.lora_B.parameters()) diff --git a/v2/models/svc_adapter.py b/v2/models/svc_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..e0449300955c254ba708a685e020783503843e88 --- /dev/null +++ b/v2/models/svc_adapter.py @@ -0,0 +1,135 @@ +""" +SVCCondAdapter: Maps PPG/HuBERT/F0/Speaker features to Stable Audio's +cross-attention conditioning space, replacing the T5 text encoder. +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SVCCondAdapter(nn.Module): + """ + Conditioning adapter that fuses SVC features (PPG, HuBERT, F0, speaker) + into a sequence suitable for Stable Audio's cross-attention mechanism. + + Replaces T5 text embeddings with frame-level SVC conditioning. 
+ """ + + def __init__( + self, + ppg_dim: int = 1280, + hubert_dim: int = 256, + f0_dim: int = 1, + spk_dim: int = 256, + cross_attn_dim: int = 768, # Stable Audio's cross-attention dimension + hidden_dim: int = 512, + num_heads: int = 8, + num_layers: int = 2, + dropout: float = 0.1, + ): + super().__init__() + + self.cross_attn_dim = cross_attn_dim + + # Individual feature projections + self.ppg_proj = nn.Linear(ppg_dim, hidden_dim) + self.hubert_proj = nn.Linear(hubert_dim, hidden_dim) + self.spk_proj = nn.Linear(spk_dim, hidden_dim) + + # F0: discretize into 256 bins + embedding (from SoVITS) + self.f0_bins = 256 + self.f0_embedding = nn.Embedding(self.f0_bins, hidden_dim) + + # Fusion: concat all features → project to cross_attn_dim + self.fusion = nn.Sequential( + nn.Linear(hidden_dim * 4, cross_attn_dim), + nn.GELU(), + nn.LayerNorm(cross_attn_dim), + nn.Dropout(dropout), + ) + + # Optional: lightweight transformer to capture local dependencies + if num_layers > 0: + encoder_layer = nn.TransformerEncoderLayer( + d_model=cross_attn_dim, + nhead=num_heads, + dim_feedforward=cross_attn_dim * 2, + dropout=dropout, + activation="gelu", + batch_first=True, + norm_first=True, + ) + self.context_encoder = nn.TransformerEncoder( + encoder_layer, num_layers=num_layers + ) + else: + self.context_encoder = nn.Identity() + + def f0_to_coarse(self, f0: torch.Tensor) -> torch.Tensor: + """Convert continuous F0 (Hz) to discrete bins (0-255). 
+ + Uses log-scale binning: + - bin 0: unvoiced (f0 <= 0) + - bins 1-255: log-spaced from ~50Hz to ~1100Hz + """ + f0_mel = 1127 * torch.log(1 + f0 / 700) # Hz → mel scale + f0_mel_min = 1127 * torch.log(torch.tensor(1 + 50.0 / 700)) + f0_mel_max = 1127 * torch.log(torch.tensor(1 + 1100.0 / 700)) + + # Normalize to [0, 254] and shift to [1, 255] + f0_coarse = (f0_mel - f0_mel_min) / (f0_mel_max - f0_mel_min) * 254 + 1 + f0_coarse = f0_coarse.clamp(0, 255).long() + + # Unvoiced frames → bin 0 + f0_coarse[f0.squeeze(-1) <= 0] = 0 + + return f0_coarse + + def forward( + self, + ppg: torch.Tensor, # (B, T_ppg, 1280) + hubert: torch.Tensor, # (B, T_hubert, 256) + f0: torch.Tensor, # (B, T_f0, 1) + spk: torch.Tensor, # (B, 256) + target_seq_len: int, # target frame count (VAE latent length) + ) -> torch.Tensor: + """ + Returns: + c: (B, target_seq_len, cross_attn_dim) — drop-in replacement for T5 embeddings + """ + # 1. Project each feature to hidden_dim + ppg_h = self.ppg_proj(ppg) # (B, T_ppg, hidden) + hubert_h = self.hubert_proj(hubert) # (B, T_hubert, hidden) + + # F0: continuous → discrete bins → embedding + f0_coarse = self.f0_to_coarse(f0) # (B, T_f0) or (B, T_f0, 1) + if f0_coarse.dim() == 3: + f0_coarse = f0_coarse.squeeze(-1) + f0_h = self.f0_embedding(f0_coarse) # (B, T_f0, hidden) + + # Speaker: broadcast to sequence + spk_h = self.spk_proj(spk) # (B, hidden) + + # 2. Resample all to target_seq_len via linear interpolation + def resample(x, target_len): + """(B, T, D) → (B, target_len, D)""" + if x.shape[1] == target_len: + return x + x_t = x.transpose(1, 2) # (B, D, T) + x_r = F.interpolate(x_t, size=target_len, mode="linear", align_corners=False) + return x_r.transpose(1, 2) # (B, target_len, D) + + ppg_r = resample(ppg_h, target_seq_len) + hubert_r = resample(hubert_h, target_seq_len) + f0_r = resample(f0_h, target_seq_len) + spk_r = spk_h.unsqueeze(1).expand(-1, target_seq_len, -1) + + # 3. 
Concatenate and fuse + stacked = torch.cat([ppg_r, hubert_r, f0_r, spk_r], dim=-1) # (B, T, 4*hidden) + c = self.fusion(stacked) # (B, T, cross_attn_dim) + + # 4. Optional context encoding for local dependencies + c = self.context_encoder(c) + + return c diff --git a/v2/train_v2.py b/v2/train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..55bd4526160075ed804d2499818b3b03f3b1a200 --- /dev/null +++ b/v2/train_v2.py @@ -0,0 +1,442 @@ +""" +v2 Training Script: Fine-tune Stable Audio Open for SVC with LoRA. + +Usage: + python v2/train_v2.py \ + --data_dir ./data_svc/codec_targets \ + --epochs 50 \ + --batch_size 8 \ + --lr 1e-4 \ + --lora_rank 16 +""" + +import argparse +import os +import sys +import glob +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import DataLoader, Dataset + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from v2.models.svc_adapter import SVCCondAdapter +from v2.models.stable_audio_svc import StableAudioSVC + + +# ============================================================ +# Dataset: Reuses v1's preprocessed features, encodes targets +# through Stable Audio's VAE on first run +# ============================================================ + +class SVCDataset(Dataset): + """Dataset for SVC training with Stable Audio VAE targets. 
class SVCDataset(Dataset):
    """SVC training samples with pre-encoded Stable Audio VAE targets.

    Each .pt file holds the conditioning features from the v1 pipeline
    (ppg, hubert, f0, spk) plus 'z_target', the VAE latent produced by
    the `encode` pre-pass.
    """

    def __init__(self, data_dir, max_seq_len=500):
        """
        Args:
            data_dir: directory of preprocessed .pt sample files.
            max_seq_len: maximum training length in VAE latent frames;
                longer samples are randomly cropped.
        """
        self.max_seq_len = max_seq_len
        self.files = sorted(glob.glob(os.path.join(data_dir, "*.pt")))
        if not self.files:
            raise RuntimeError(f"No .pt files found in {data_dir}")
        print(f"SVCDataset: {len(self.files)} samples from {data_dir}")

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        data = torch.load(self.files[idx], weights_only=True, map_location="cpu")

        ppg = data["ppg"]            # (T_ppg, ppg_dim)
        hubert = data["hubert"]      # (T_hubert, hubert_dim)
        f0 = data["f0"]              # (T_f0, 1)
        spk = data["spk"]            # (spk_dim,)
        z_target = data["z_target"]  # (latent_dim, T_latent), VAE-encoded

        T_latent = z_target.shape[1]
        if T_latent > self.max_seq_len:
            # Random temporal crop in latent frames ...
            start = torch.randint(0, T_latent - self.max_seq_len, (1,)).item()
            z_target = z_target[:, start:start + self.max_seq_len]

            # ... plus the proportional crop of each conditioning stream,
            # which runs at its own frame rate.
            def _crop(seq):
                ratio = seq.shape[0] / T_latent
                lo = int(start * ratio)
                hi = int((start + self.max_seq_len) * ratio)
                return seq[lo:hi]

            ppg = _crop(ppg)
            hubert = _crop(hubert)
            f0 = _crop(f0)

        return {
            "ppg": ppg,
            "hubert": hubert,
            "f0": f0,
            "spk": spk,
            "z_target": z_target,
        }


def collate_fn(batch):
    """Zero-pad a list of samples to common lengths.

    Returns padded (B, T, D) feature tensors, stacked speaker vectors,
    padded (B, latent_dim, T) targets and a (B, T) validity mask.
    FIX: feature widths are inferred from the tensors themselves instead of
    being hard-coded (1280/256/256), so the collate works with any
    extractor configuration — matching SVCCondAdapter's configurable dims.
    """
    B = len(batch)
    max_ppg = max(b["ppg"].shape[0] for b in batch)
    max_hubert = max(b["hubert"].shape[0] for b in batch)
    max_f0 = max(b["f0"].shape[0] for b in batch)
    max_latent = max(b["z_target"].shape[1] for b in batch)

    # Widths taken from the first sample (all samples share extractor dims).
    ppg_dim = batch[0]["ppg"].shape[1]
    hubert_dim = batch[0]["hubert"].shape[1]
    f0_dim = batch[0]["f0"].shape[1]
    spk_dim = batch[0]["spk"].shape[0]
    latent_dim = batch[0]["z_target"].shape[0]

    ppg_padded = torch.zeros(B, max_ppg, ppg_dim)
    hubert_padded = torch.zeros(B, max_hubert, hubert_dim)
    f0_padded = torch.zeros(B, max_f0, f0_dim)
    spk_stacked = torch.zeros(B, spk_dim)
    z_target_padded = torch.zeros(B, latent_dim, max_latent)
    mask = torch.zeros(B, max_latent)

    for i, b in enumerate(batch):
        ppg_padded[i, :b["ppg"].shape[0]] = b["ppg"]
        hubert_padded[i, :b["hubert"].shape[0]] = b["hubert"]
        f0_padded[i, :b["f0"].shape[0]] = b["f0"]
        spk_stacked[i] = b["spk"]
        T_lat = b["z_target"].shape[1]
        z_target_padded[i, :, :T_lat] = b["z_target"]
        mask[i, :T_lat] = 1.0

    return {
        "ppg": ppg_padded,
        "hubert": hubert_padded,
        "f0": f0_padded,
        "spk": spk_stacked,
        "z_target": z_target_padded,
        "mask": mask,
    }
+ max_f0 = max(b["f0"].shape[0] for b in batch) + max_latent = max(b["z_target"].shape[1] for b in batch) + latent_dim = batch[0]["z_target"].shape[0] + + B = len(batch) + + ppg_padded = torch.zeros(B, max_ppg, 1280) + hubert_padded = torch.zeros(B, max_hubert, 256) + f0_padded = torch.zeros(B, max_f0, 1) + spk_stacked = torch.zeros(B, 256) + z_target_padded = torch.zeros(B, latent_dim, max_latent) + mask = torch.zeros(B, max_latent) + + for i, b in enumerate(batch): + T_ppg = b["ppg"].shape[0] + T_hub = b["hubert"].shape[0] + T_f0 = b["f0"].shape[0] + T_lat = b["z_target"].shape[1] + + ppg_padded[i, :T_ppg] = b["ppg"] + hubert_padded[i, :T_hub] = b["hubert"] + f0_padded[i, :T_f0] = b["f0"] + spk_stacked[i] = b["spk"] + z_target_padded[i, :, :T_lat] = b["z_target"] + mask[i, :T_lat] = 1.0 + + return { + "ppg": ppg_padded, + "hubert": hubert_padded, + "f0": f0_padded, + "spk": spk_stacked, + "z_target": z_target_padded, + "mask": mask, + } + + +# ============================================================ +# Training Loop +# ============================================================ + +def train(args): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Using device: {device}") + + # ---- 1. Load Pretrained Stable Audio ---- + print("Loading pretrained Stable Audio Open model...") + try: + from stable_audio_tools import get_pretrained_model + stable_audio_model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0") + stable_audio_model = stable_audio_model.to(device) + except ImportError: + print("ERROR: stable-audio-tools not installed. Run:") + print(" pip install stable-audio-tools") + sys.exit(1) + + # Determine cross-attention dimension from the model + # This depends on Stable Audio's architecture + cross_attn_dim = model_config.get("model", {}).get("cross_attn_dim", 768) + print(f"Cross-attention dimension: {cross_attn_dim}") + + # ---- 2. 
Build SVC Adapter ---- + print("Building SVC Conditioning Adapter...") + svc_adapter = SVCCondAdapter( + ppg_dim=1280, + hubert_dim=256, + f0_dim=1, + spk_dim=256, + cross_attn_dim=cross_attn_dim, + hidden_dim=512, + num_heads=8, + num_layers=2, + ).to(device) + + # ---- 3. Build SVC Model with LoRA ---- + print("Building StableAudioSVC with LoRA...") + lora_config = { + "rank": args.lora_rank, + "alpha": args.lora_rank * 2, + "target_modules": ["to_q", "to_k", "to_v"], + } + + model = StableAudioSVC( + stable_audio_model=stable_audio_model, + svc_adapter=svc_adapter, + lora_config=lora_config, + ) + + trainable_count = model.trainable_param_count() + total_count = sum(p.numel() for p in model.parameters()) + print(f"Total parameters: {total_count:,}") + print(f"Trainable parameters: {trainable_count:,} ({100*trainable_count/total_count:.1f}%)") + + # ---- 4. Dataset & DataLoader ---- + print("Loading dataset...") + dataset = SVCDataset(args.data_dir, max_seq_len=args.max_seq_len) + dataloader = DataLoader( + dataset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.num_workers, + collate_fn=collate_fn, + pin_memory=True, + drop_last=True, + ) + + # ---- 5. Optimizer (only trainable params) ---- + trainable_params = model.trainable_parameters() + optimizer = optim.AdamW(trainable_params, lr=args.lr, eps=1e-6) + + total_steps = args.epochs * len(dataloader) + warmup_steps = max(1, int(total_steps * 0.05)) + + def lr_lambda(step): + if step < warmup_steps: + return step / warmup_steps + progress = (step - warmup_steps) / max(1, total_steps - warmup_steps) + return 0.5 * (1 + torch.cos(torch.tensor(progress * 3.14159)).item()) + + scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) + + # ---- 6. EMA ---- + ema_params = {} + for name, param in zip( + [n for n, _ in model.named_parameters() if _.requires_grad], + trainable_params + ): + ema_params[name] = param.data.clone() + + # ---- 7. 
Training Loop ---- + print(f"\nStarting v2 training: {args.epochs} epochs, {len(dataloader)} steps/epoch") + print(f" LR: {args.lr}, LoRA rank: {args.lora_rank}") + print(f" Batch size: {args.batch_size}") + print("=" * 60) + + os.makedirs("chkpt_v2", exist_ok=True) + + for epoch in range(args.epochs): + model.svc_adapter.train() + for layer in model.lora_layers: + layer.train() + + epoch_loss = 0.0 + n_steps = 0 + + for step, batch in enumerate(dataloader): + ppg = batch["ppg"].to(device) + hubert = batch["hubert"].to(device) + f0 = batch["f0"].to(device) + spk = batch["spk"].to(device) + z_target = batch["z_target"].to(device) # (B, latent_dim, T_latent) + mask = batch["mask"].to(device) + + T_latent = z_target.shape[2] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + # Get SVC conditioning + c = model.get_conditioning(ppg, hubert, f0, spk, T_latent) + + # Flow matching loss (same as v1, but in Stable Audio's latent space) + B, D, T = z_target.shape + z_noise = torch.randn_like(z_target) + t = torch.sigmoid(torch.randn((B,), device=device)) # logit-normal + + # Expand t for broadcasting: (B,) → (B, 1, 1) + t_expand = t.view(B, 1, 1).expand_as(z_target) + + # Rectified flow interpolation + z_t = (1 - t_expand) * z_noise + t_expand * z_target + v_target = z_target - z_noise + + # Predict velocity using DiT with SVC conditioning + # NOTE: The exact API depends on Stable Audio's DiT interface + # This may need adjustment based on the model's forward() signature + v_pred = model.dit( + z_t, + t, + cross_attn_cond=c["cross_attn_cond"], + cross_attn_mask=c["cross_attn_mask"], + ) + + # Masked MSE loss + loss_mask = mask.unsqueeze(1) # (B, 1, T) + valid_frames = loss_mask.sum() * D + if valid_frames == 0: + valid_frames = 1.0 + + flow_loss = ((v_pred - v_target) ** 2 * loss_mask).sum() / valid_frames + + # Backward + optimizer.zero_grad() + flow_loss.backward() + torch.nn.utils.clip_grad_norm_(trainable_params, 1.0) + optimizer.step() + 
scheduler.step() + + # EMA update + with torch.no_grad(): + for name, param in zip( + [n for n, p in model.named_parameters() if p.requires_grad], + trainable_params + ): + if name in ema_params: + ema_params[name].mul_(0.9999).add_(param.data, alpha=0.0001) + + epoch_loss += flow_loss.item() + n_steps += 1 + + if (step + 1) % args.log_interval == 0: + avg_loss = epoch_loss / n_steps + lr = scheduler.get_last_lr()[0] + print(f"Epoch {epoch}, Step {step+1}/{len(dataloader)}, " + f"Loss: {flow_loss.item():.4f}, Avg: {avg_loss:.4f}, LR: {lr:.2e}") + + # End of epoch + avg_loss = epoch_loss / max(n_steps, 1) + print(f"--- Epoch {epoch} complete. Avg loss: {avg_loss:.4f} ---") + + # Save checkpoint + if (epoch + 1) % args.save_interval == 0: + ckpt = { + "epoch": epoch, + "svc_adapter": model.svc_adapter.state_dict(), + "lora_layers": model.lora_layers.state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + "ema_params": ema_params, + "avg_loss": avg_loss, + } + path = f"chkpt_v2/v2_epoch_{epoch+1}.pt" + torch.save(ckpt, path) + print(f"Saved checkpoint to {path}") + + print("Training complete!") + + +# ============================================================ +# Encode Dataset: Pre-encode training audio through Stable Audio's VAE +# ============================================================ + +def encode_dataset(args): + """Pre-encode training audio through Stable Audio's VAE. + + This creates new .pt files with added 'z_target' field containing + the VAE latent representation alongside existing SVC features. 
+ """ + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + print("Loading Stable Audio model for encoding...") + from stable_audio_tools import get_pretrained_model + model, config = get_pretrained_model("stabilityai/stable-audio-open-1.0") + model = model.to(device).eval() + + import torchaudio + + input_dir = args.data_dir + output_dir = args.output_dir or input_dir + "_v2" + os.makedirs(output_dir, exist_ok=True) + + files = sorted(glob.glob(os.path.join(input_dir, "*.pt"))) + print(f"Encoding {len(files)} files...") + + for i, f in enumerate(files): + data = torch.load(f, weights_only=True, map_location="cpu") + + # Load corresponding audio file + # Assumes audio path can be derived from the .pt file + audio_path = data.get("audio_path", None) + if audio_path is None: + # Try to find corresponding wav + base = os.path.splitext(os.path.basename(f))[0] + # Search in common audio directories + for audio_dir in ["data_svc/waves", "data_svc/waves-16k"]: + candidate = os.path.join(audio_dir, base + ".wav") + if os.path.exists(candidate): + audio_path = candidate + break + + if audio_path and os.path.exists(audio_path): + # Load and resample to 44.1kHz if needed + waveform, sr = torchaudio.load(audio_path) + if sr != 44100: + resampler = torchaudio.transforms.Resample(sr, 44100) + waveform = resampler(waveform) + + # Ensure stereo (Stable Audio expects stereo) + if waveform.shape[0] == 1: + waveform = waveform.repeat(2, 1) + + # Encode through VAE + with torch.no_grad(): + waveform = waveform.unsqueeze(0).to(device) # (1, 2, T) + z = model.pretransform.encode(waveform) # (1, latent_dim, T_latent) + z = z.squeeze(0).cpu() # (latent_dim, T_latent) + + # Add to data dict + data["z_target"] = z + else: + print(f" WARNING: No audio found for {f}, skipping") + continue + + # Save to output dir + out_path = os.path.join(output_dir, os.path.basename(f)) + torch.save(data, out_path) + + if (i + 1) % 100 == 0: + print(f" Encoded {i+1}/{len(files)}") + + 
print(f"Done! Encoded dataset saved to {output_dir}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="CFM-SVC v2: LoRA on Stable Audio Open") + subparsers = parser.add_subparsers(dest="command") + + # Train command + train_parser = subparsers.add_parser("train", help="Train SVC adapter + LoRA") + train_parser.add_argument("--data_dir", type=str, required=True) + train_parser.add_argument("--epochs", type=int, default=50) + train_parser.add_argument("--batch_size", type=int, default=8) + train_parser.add_argument("--lr", type=float, default=1e-4) + train_parser.add_argument("--lora_rank", type=int, default=16) + train_parser.add_argument("--max_seq_len", type=int, default=500) + train_parser.add_argument("--num_workers", type=int, default=4) + train_parser.add_argument("--log_interval", type=int, default=10) + train_parser.add_argument("--save_interval", type=int, default=5) + + # Encode command + encode_parser = subparsers.add_parser("encode", help="Pre-encode dataset through VAE") + encode_parser.add_argument("--data_dir", type=str, required=True) + encode_parser.add_argument("--output_dir", type=str, default=None) + + args = parser.parse_args() + + if args.command == "train": + train(args) + elif args.command == "encode": + encode_dataset(args) + else: + parser.print_help() diff --git a/vits/LICENSE b/vits/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6a6c3181fcdc4e20901a6ecbee5a406b78a5b560 --- /dev/null +++ b/vits/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Jaehyeon Kim + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the 
following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vits/__init__.py b/vits/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vits/attentions.py b/vits/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..26624519b01497cc402dea5f860cf5022d1e7c89 --- /dev/null +++ b/vits/attentions.py @@ -0,0 +1,416 @@ +import copy +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from vits import commons +from vits.modules import LayerNorm + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=4, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + 
self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class Decoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + proximal_init=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append( + MultiHeadAttention( + hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) 
            )
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        Run the decoder stack.

        x: decoder input [b, d, t_dec]
        h: encoder output [b, d, t_enc]
        x_mask, h_mask: 1/0 validity masks with time on the last axis
        """
        # Causal mask: each decoder step attends only to itself and the past.
        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
            device=x.device, dtype=x.dtype
        )
        # Cross-attention mask over valid (decoder, encoder) position pairs.
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for i in range(self.n_layers):
            # Post-norm residual blocks: self-attn -> cross-attn -> FFN.
            y = self.self_attn_layers[i](x, x, self_attn_mask)
            y = self.drop(y)
            x = self.norm_layers_0[i](x + y)

            y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
            y = self.drop(y)
            x = self.norm_layers_1[i](x + y)

            y = self.ffn_layers[i](x, x_mask)
            y = self.drop(y)
            x = self.norm_layers_2[i](x + y)
        x = x * x_mask
        return x


class MultiHeadAttention(nn.Module):
    """Multi-head attention over [b, channels, t] tensors (1x1 convs as
    projections), with optional relative-position embeddings (window_size),
    local block masking (block_length) and a proximal self-attention bias."""

    def __init__(
        self,
        channels,
        out_channels,
        n_heads,
        p_dropout=0.0,
        window_size=None,
        heads_share=True,
        block_length=None,
        proximal_bias=False,
        proximal_init=False,
    ):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None  # last attention map, kept for inspection

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # Learned relative-position embeddings over a [-window, +window] span.
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )
            self.emb_rel_v = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
                * rel_stddev
            )

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            # Start with identical query/key projections so early attention
            # concentrates near the diagonal.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """x: query source [b, d, t_t]; c: key/value source [b, d, t_s]."""
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        # Scaled dot-product scores [b, n_h, t_t, t_s].
        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert (
                t_s == t_t
            ), "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(
                query / math.sqrt(self.k_channels), key_relative_embeddings
            )
            # Skew relative logits into absolute-position layout and add.
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(
                device=scores.device, dtype=scores.dtype
            )
        if mask is not None:
            # -1e4 instead of -inf keeps fp16 softmax finite.
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert (
                    t_s == t_t
                ), "Local attention is only available for self-attention."
                block_mask = (
                    torch.ones_like(scores)
                    .triu(-self.block_length)
                    .tril(self.block_length)
                )
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            # Add relative-position value contributions.
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(
                self.emb_rel_v, t_s
            )
            output = output + self._matmul_with_relative_values(
                relative_weights, value_relative_embeddings
            )
        output = (
            output.transpose(2, 3).contiguous().view(b, d, t_t)
        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Select (padding if needed) the 2*length-1 relative embeddings
        actually used for a sequence of `length` frames."""
        max_relative_position = 2 * self.window_size + 1
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
            )
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[
            :, slice_start_position:slice_end_position
        ]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        Skewing trick: pad + reshape converts relative indexing to absolute.
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(
            x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
        )

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
            :, :, :length, length - 1 :
        ]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        Inverse skewing trick.
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        x = F.pad(
            x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
        )
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)


class FFN(nn.Module):
    """Position-wise feed-forward block built from two 1D convolutions,
    with optional causal padding."""

    def __init__(
        self,
        in_channels,
        out_channels,
        filter_channels,
        kernel_size,
        p_dropout=0.0,
        activation=None,
        causal=False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Choose the padding strategy once at construction time.
        if causal:
            self.padding = self._causal_padding
        else:
            self.padding = self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        x = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            # Fast GELU approximation: x * sigmoid(1.702 * x).
            x = x * torch.sigmoid(1.702 * x)
        else:
            x = torch.relu(x)
        x = self.drop(x)
        x = self.conv_2(self.padding(x * x_mask))
        return x * x_mask

    def _causal_padding(self, x):
        # Pad only on the left so no future frames leak in.
        if self.kernel_size == 1:
            return x
        pad_l = self.kernel_size - 1
        pad_r = 0
        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
        x = F.pad(x, commons.convert_pad_shape(padding))
        return x

    def _same_padding(self, x):
        # Symmetric padding keeps the output length equal to the input length.
        if self.kernel_size == 1:
            return x
        pad_l = (self.kernel_size - 1) // 2
        pad_r = self.kernel_size // 2
        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
        x = F.pad(x, commons.convert_pad_shape(padding))
        return x


import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
def slice_pitch_segments(x, ids_str, segment_size=4):
    """Slice per-sample windows from a batch of pitch tracks.

    x: [b, t]; ids_str: [b] start frame per sample. Returns [b, segment_size].
    """
    ret = torch.zeros_like(x[:, :segment_size])
    for i in range(x.size(0)):
        idx_str = ids_str[i]
        idx_end = idx_str + segment_size
        ret[i] = x[i, idx_str:idx_end]
    return ret


def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4):
    """Randomly slice aligned windows from features x [b, d, t] and pitch [b, t].

    Returns (x_slice, pitch_slice, start_ids).
    """
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size + 1
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    ret_pitch = slice_pitch_segments(pitch, ids_str, segment_size)
    return ret, ret_pitch, ids_str


def rand_spec_segments(x, x_lengths=None, segment_size=4):
    """Randomly slice windows from spectrograms x [b, d, t].

    Returns (slice, start_ids). NOTE(review): unlike rand_slice_segments this
    uses `x_lengths - segment_size` (no +1); kept as-is to preserve behavior.
    """
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str


def init_weights(m, mean=0.0, std=0.01):
    """Normal-initialize conv weights; intended for Module.apply()."""
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    """Padding that keeps the time length unchanged for a dilated conv."""
    return int((kernel_size * dilation - dilation) / 2)


def convert_pad_shape(pad_shape):
    """Convert [[l0, r0], [l1, r1], ...] (outer-to-inner dims) into the flat
    last-dim-first list that F.pad expects."""
    reversed_shape = pad_shape[::-1]
    return [item for sublist in reversed_shape for item in sublist]


def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q) between diagonal Gaussians given means and log-stds."""
    kl = (logs_q - logs_p) - 0.5
    kl += (
        0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
    )
    return kl


def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
    return -torch.log(-torch.log(uniform_samples))


def rand_gumbel_like(x):
    """Gumbel samples with the same shape/dtype/device as x."""
    return rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)


def slice_segments(x, ids_str, segment_size=4):
    """Slice per-sample windows from features x [b, d, t] at starts ids_str [b]."""
    ret = torch.zeros_like(x[:, :, :segment_size])
    for i in range(x.size(0)):
        idx_str = ids_str[i]
        idx_end = idx_str + segment_size
        ret[i] = x[i, :, idx_str:idx_end]
    return ret


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Randomly slice windows from x [b, d, t]; returns (slice, start_ids)."""
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size + 1
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str


def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Transformer sinusoidal positional signal of shape [1, channels, length]."""
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
        num_timescales - 1
    )
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    signal = F.pad(signal, [0, 0, 0, channels % 2])  # pad one row if channels is odd
    signal = signal.view(1, channels, length)
    return signal


def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add the sinusoidal timing signal to x [b, channels, length]."""
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return x + signal.to(dtype=x.dtype, device=x.device)


def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate the sinusoidal timing signal to x along `axis`."""
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)


def subsequent_mask(length):
    """Lower-triangular causal mask of shape [1, 1, length, length]."""
    return torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    # WaveNet gate: first n_channels -> tanh, remaining channels -> sigmoid.
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


# NOTE: a second, identical definition of convert_pad_shape previously lived
# here; the duplicate was removed.


def shift_1d(x):
    """Shift x [b, d, t] right by one frame, zero-padding the front."""
    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
    return x


def sequence_mask(length, max_length=None):
    """Boolean mask [b, max_length], True where position < length."""
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def generate_path(duration, mask):
    """Expand durations into a monotonic alignment path.

    duration: [b, 1, t_x]; mask: [b, 1, t_y, t_x]. Returns [b, 1, t_y, t_x].
    """
    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    cum_duration_flat = cum_duration.view(b * t_x)
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    # Difference of shifted cumulative masks leaves 1s only inside each segment.
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path


def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients elementwise to [-clip_value, clip_value] and return the
    pre-clipping total gradient norm (mirrors torch.nn.utils.clip_grad_*)."""
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total_norm = 0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type)
        total_norm += param_norm.item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    total_norm = total_norm ** (1.0 / norm_type)
    return total_norm


import os
import numpy as np
import random
import torch
import torch.utils.data
def load_filepaths(filename, split="|"):
    """Read a metadata file and split each stripped line on `split`."""
    with open(filename, encoding='utf-8') as f:
        filepaths = [line.strip().split(split) for line in f]
    return filepaths


class TextAudioSpeakerSet(torch.utils.data.Dataset):
    """Dataset of (spec, wav, ppg, vec, pitch, speaker) tuples for SVC training.

    Each metadata line lists: wav path | spec path | pitch path | vec path |
    ppg path | speaker-embedding path.
    """

    def __init__(self, filename, hparams):
        self.items = load_filepaths(filename)
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.segment_size = hparams.segment_size
        self.hop_length = hparams.hop_length
        self._filter()
        print(f'----------{len(self.items)}----------')

    def _filter(self):
        """Drop items with missing files or too-short pitch tracks; cache a
        usable frame count (capped) per item for length-based bucketing."""
        lengths = []
        items_new = []
        items_min = int(self.segment_size / self.hop_length * 4)  # 1 S
        items_max = int(self.segment_size / self.hop_length * 16)  # 4 S
        for wavpath, spec, pitch, vec, ppg, spk in self.items:
            if not os.path.isfile(wavpath):
                continue
            if not os.path.isfile(spec):
                continue
            if not os.path.isfile(pitch):
                continue
            if not os.path.isfile(vec):
                continue
            if not os.path.isfile(ppg):
                continue
            if not os.path.isfile(spk):
                continue
            temp = np.load(pitch)
            usel = int(temp.shape[0] - 1)  # useful length
            if usel < items_min:
                continue
            if usel >= items_max:
                usel = items_max
            items_new.append([wavpath, spec, pitch, vec, ppg, spk, usel])
            lengths.append(usel)
        self.items = items_new
        self.lengths = lengths

    def read_wav(self, filename):
        """Load a wav, check its sample rate, and normalize to [-1, 1]."""
        audio, sampling_rate = load_wav_to_torch(filename)
        # BUGFIX: the message previously printed the literal "(unknown)"
        # instead of the offending file path.
        assert sampling_rate == self.sampling_rate, \
            f"error: sample rate of {filename} is {sampling_rate}, expected {self.sampling_rate}"
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        return audio_norm

    def __getitem__(self, index):
        return self.my_getitem(index)

    def __len__(self):
        return len(self.items)

    def my_getitem(self, idx):
        """Load one item, align all streams to a common frame count, and
        randomly crop to the cached segment length."""
        item = self.items[idx]
        wav = item[0]
        spe = item[1]
        pit = item[2]
        vec = item[3]
        ppg = item[4]
        spk = item[5]
        use = item[6]

        wav = self.read_wav(wav)
        spe = torch.load(spe)

        pit = np.load(pit)
        vec = np.load(vec)
        vec = np.repeat(vec, 2, 0)  # 320 PPG -> 160 * 2
        ppg = np.load(ppg)
        ppg = np.repeat(ppg, 2, 0)  # 320 PPG -> 160 * 2
        spk = np.load(spk)

        pit = torch.FloatTensor(pit)
        vec = torch.FloatTensor(vec)
        ppg = torch.FloatTensor(ppg)
        spk = torch.FloatTensor(spk)

        len_pit = pit.size()[0]
        len_vec = vec.size()[0] - 2  # for safe
        len_ppg = ppg.size()[0] - 2  # for safe
        len_min = min(len_pit, len_vec, len_ppg)
        len_wav = len_min * self.hop_length

        # Truncate every stream to the common frame count.
        pit = pit[:len_min]
        vec = vec[:len_min, :]
        ppg = ppg[:len_min, :]
        spe = spe[:, :len_min]
        wav = wav[:, :len_wav]
        if len_min > use:
            # Random crop of `use` frames plus the matching audio samples.
            max_frame_start = ppg.size(0) - use - 1
            frame_start = random.randint(0, max_frame_start)
            frame_end = frame_start + use

            pit = pit[frame_start:frame_end]
            vec = vec[frame_start:frame_end, :]
            ppg = ppg[frame_start:frame_end, :]
            spe = spe[:, frame_start:frame_end]

            wav_start = frame_start * self.hop_length
            wav_end = frame_end * self.hop_length
            wav = wav[:, wav_start:wav_end]
        return spe, wav, ppg, vec, pit, spk
class TextAudioSpeakerCollate:
    """Zero-pads model inputs and targets, sorted by spectrogram length."""

    def __call__(self, batch):
        # Per-item shapes:
        #   mel/spec: [freq, length]   wav: [1, length]
        #   ppg/vec:  [len, dim]       pit: [len]       spk: [spk_dim]
        _, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
        )

        max_spe_len = max([x[0].size(1) for x in batch])
        max_wav_len = max([x[1].size(1) for x in batch])
        spe_lengths = torch.LongTensor(len(batch))
        wav_lengths = torch.LongTensor(len(batch))
        spe_padded = torch.FloatTensor(
            len(batch), batch[0][0].size(0), max_spe_len)
        wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
        spe_padded.zero_()
        wav_padded.zero_()

        max_ppg_len = max([x[2].size(0) for x in batch])
        # BUGFIX: was FloatTensor; lengths are integer counts, so use
        # LongTensor for consistency with spe_lengths / wav_lengths.
        ppg_lengths = torch.LongTensor(len(batch))
        ppg_padded = torch.FloatTensor(
            len(batch), max_ppg_len, batch[0][2].size(1))
        vec_padded = torch.FloatTensor(
            len(batch), max_ppg_len, batch[0][3].size(1))
        pit_padded = torch.FloatTensor(len(batch), max_ppg_len)
        ppg_padded.zero_()
        vec_padded.zero_()
        pit_padded.zero_()
        spk = torch.FloatTensor(len(batch), batch[0][5].size(0))

        for i in range(len(ids_sorted_decreasing)):
            row = batch[ids_sorted_decreasing[i]]

            spe = row[0]
            spe_padded[i, :, : spe.size(1)] = spe
            spe_lengths[i] = spe.size(1)

            wav = row[1]
            wav_padded[i, :, : wav.size(1)] = wav
            wav_lengths[i] = wav.size(1)

            ppg = row[2]
            ppg_padded[i, : ppg.size(0), :] = ppg
            ppg_lengths[i] = ppg.size(0)

            vec = row[3]
            vec_padded[i, : vec.size(0), :] = vec

            pit = row[4]
            pit_padded[i, : pit.size(0)] = pit

            spk[i] = row[5]
        return (
            ppg_padded,
            ppg_lengths,
            vec_padded,
            pit_padded,
            spk,
            spe_padded,
            spe_lengths,
            wav_padded,
            wav_lengths,
        )


class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
    """
    Maintain similar input lengths in a batch.
    Length groups are specified by boundaries.
    Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
    It removes samples which are not included in the boundaries.
    Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
    """

    def __init__(
        self,
        dataset,
        batch_size,
        boundaries,
        num_replicas=None,
        rank=None,
        shuffle=True,
    ):
        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
        self.lengths = dataset.lengths
        self.batch_size = batch_size
        self.boundaries = boundaries

        self.buckets, self.num_samples_per_bucket = self._create_buckets()
        self.total_size = sum(self.num_samples_per_bucket)
        self.num_samples = self.total_size // self.num_replicas

    def _create_buckets(self):
        """Assign each sample index to its length bucket and pad bucket sizes
        so every bucket splits evenly across replicas and batches."""
        buckets = [[] for _ in range(len(self.boundaries) - 1)]
        for i in range(len(self.lengths)):
            length = self.lengths[i]
            idx_bucket = self._bisect(length)
            if idx_bucket != -1:
                buckets[idx_bucket].append(i)

        # Drop empty buckets (and their upper boundaries), scanning backwards
        # so removals do not shift unvisited indices.
        for i in range(len(buckets) - 1, 0, -1):
            if len(buckets[i]) == 0:
                buckets.pop(i)
                self.boundaries.pop(i + 1)

        num_samples_per_bucket = []
        for i in range(len(buckets)):
            len_bucket = len(buckets[i])
            total_batch_size = self.num_replicas * self.batch_size
            rem = (
                total_batch_size - (len_bucket % total_batch_size)
            ) % total_batch_size
            num_samples_per_bucket.append(len_bucket + rem)
        return buckets, num_samples_per_bucket

    def __iter__(self):
        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)

        indices = []
        if self.shuffle:
            for bucket in self.buckets:
                indices.append(torch.randperm(
                    len(bucket), generator=g).tolist())
        else:
            for bucket in self.buckets:
                indices.append(list(range(len(bucket))))

        batches = []
        for i in range(len(self.buckets)):
            bucket = self.buckets[i]
            len_bucket = len(bucket)
            if len_bucket == 0:
                continue
            ids_bucket = indices[i]
            num_samples_bucket = self.num_samples_per_bucket[i]

            # add extra samples to make it evenly divisible
            rem = num_samples_bucket - len_bucket
            ids_bucket = (
                ids_bucket
                + ids_bucket * (rem // len_bucket)
                + ids_bucket[: (rem % len_bucket)]
            )

            # subsample: every num_replicas-th id, offset by this rank
            ids_bucket = ids_bucket[self.rank:: self.num_replicas]

            # batching
            for j in range(len(ids_bucket) // self.batch_size):
                batch = [
                    bucket[idx]
                    for idx in ids_bucket[
                        j * self.batch_size: (j + 1) * self.batch_size
                    ]
                ]
                batches.append(batch)

        if self.shuffle:
            batch_ids = torch.randperm(len(batches), generator=g).tolist()
            batches = [batches[i] for i in batch_ids]
        self.batches = batches

        assert len(self.batches) * self.batch_size == self.num_samples
        return iter(self.batches)

    def _bisect(self, x, lo=0, hi=None):
        """Binary search: bucket index i with boundaries[i] < x <= boundaries[i+1],
        or -1 if x falls outside all buckets."""
        if hi is None:
            hi = len(self.boundaries) - 1

        if hi > lo:
            mid = (hi + lo) // 2
            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
                return mid
            elif x <= self.boundaries[mid]:
                return self._bisect(x, lo, mid)
            else:
                return self._bisect(x, mid + 1, hi)
        else:
            return -1

    def __len__(self):
        return self.num_samples // self.batch_size


import torch


def feature_loss(fmap_r, fmap_g):
    """L1 feature-matching loss between real and generated discriminator
    feature maps (real features detached), scaled by 2."""
    loss = 0
    for dr, dg in zip(fmap_r, fmap_g):
        for rl, gl in zip(dr, dg):
            rl = rl.float().detach()
            gl = gl.float()
            loss += torch.mean(torch.abs(rl - gl))

    return loss * 2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """LSGAN discriminator loss: real -> 1, generated -> 0.

    Returns (total_loss, per-discriminator real losses, generated losses).
    """
    loss = 0
    r_losses = []
    g_losses = []
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        dr = dr.float()
        dg = dg.float()
        r_loss = torch.mean((1 - dr) ** 2)
        g_loss = torch.mean(dg**2)
        loss += r_loss + g_loss
        r_losses.append(r_loss.item())
        g_losses.append(g_loss.item())

    return loss, r_losses, g_losses


def generator_loss(disc_outputs):
    """LSGAN generator loss: push discriminator outputs toward 1.

    Returns (total_loss, per-discriminator losses as tensors).
    """
    loss = 0
    gen_losses = []
    for dg in disc_outputs:
        dg = dg.float()
        loss_k = torch.mean((1 - dg) ** 2)
        gen_losses.append(loss_k)
        loss += loss_k

    return loss, gen_losses
def kl_loss(z_p, logs_q, m_p, logs_p, total_logdet, z_mask):
    """
    KL between posterior q and flow-mapped prior p, including the flow's
    log-determinant term (negative log-likelihood form).

    z_p, logs_q: [b, h, t_t]
    m_p, logs_p: [b, h, t_t]
    total_logdet: [b] - total_logdet summed over each batch
    """
    z_p = z_p.float()
    logs_q = logs_q.float()
    m_p = m_p.float()
    logs_p = logs_p.float()
    z_mask = z_mask.float()

    kl = logs_p - logs_q - 0.5
    kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
    kl = torch.sum(kl * z_mask)
    # add total_logdet (Negative LL)
    kl -= torch.sum(total_logdet)
    return kl / torch.sum(z_mask)


def kl_loss_back(z_p, logs_q, m_p, logs_p, z_mask):
    """
    KL between posterior q and prior p without a flow logdet term.

    z_p, logs_q: [b, h, t_t]
    m_p, logs_p: [b, h, t_t]
    """
    z_p = z_p.float()
    logs_q = logs_q.float()
    m_p = m_p.float()
    logs_p = logs_p.float()
    z_mask = z_mask.float()

    kl = logs_p - logs_q - 0.5
    kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
    kl = torch.sum(kl * z_mask)
    return kl / torch.sum(z_mask)


import torch
from torch import nn
from torch.nn import functional as F


class TextEncoder(nn.Module):
    """Prior encoder: PPG + content-vec + coarse-F0 embedding -> attention
    encoder -> Gaussian prior (m, logs) and a sampled latent z."""

    def __init__(self,
                 in_channels,
                 vec_channels,
                 out_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout):
        super().__init__()
        self.out_channels = out_channels
        self.pre = nn.Conv1d(in_channels, hidden_channels, kernel_size=5, padding=2)
        self.hub = nn.Conv1d(vec_channels, hidden_channels, kernel_size=5, padding=2)
        # 256 buckets: presumably matches f0_to_coarse quantization — TODO confirm.
        self.pit = nn.Embedding(256, hidden_channels)
        self.enc = attentions.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, v, f0):
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.pre(x) * x_mask
        v = torch.transpose(v, 1, -1)  # [b, h, t]
        v = self.hub(v) * x_mask
        # Sum the two content streams with the pitch embedding.
        x = x + v + self.pit(f0).transpose(1, 2)
        x = self.enc(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        # Reparameterized sample from the prior.
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask, x


class ResidualCouplingBlock(nn.Module):
    """Stack of affine coupling flows (with channel flips) used as the
    normalizing flow between posterior and prior latents."""

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        # BUGFIX: n_flows was never stored, so remove_weight_norm() raised
        # AttributeError on self.n_flows.
        self.n_flows = n_flows
        self.flows = nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(
                modules.ResidualCouplingLayer(
                    channels,
                    hidden_channels,
                    kernel_size,
                    dilation_rate,
                    n_layers,
                    gin_channels=gin_channels,
                    mean_only=True,
                )
            )
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flow (or its inverse); returns (x, total log-determinant)."""
        if not reverse:
            total_logdet = 0
            for flow in self.flows:
                x, log_det = flow(x, x_mask, g=g, reverse=reverse)
                total_logdet += log_det
            return x, total_logdet
        else:
            total_logdet = 0
            for flow in reversed(self.flows):
                x, log_det = flow(x, x_mask, g=g, reverse=reverse)
                total_logdet += log_det
            return x, total_logdet

    def remove_weight_norm(self):
        # Only the coupling layers (even indices) carry weight norm; the
        # interleaved Flip modules do not.
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()
class PosteriorEncoder(nn.Module):
    """WaveNet posterior encoder: spectrogram -> latent Gaussian q(z|x)."""

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """x: [b, in_channels, t]; g: optional conditioning [b, gin, 1]."""
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        # Reparameterized sample from q(z|x).
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        self.enc.remove_weight_norm()


class SynthesizerTrn(nn.Module):
    """Training-time VITS synthesizer: prior/posterior encoders, coupling
    flow, decoder (vocoder) and a speaker classifier on encoder features."""

    def __init__(
        self,
        spec_channels,
        segment_size,
        hp
    ):
        super().__init__()
        self.segment_size = segment_size
        self.emb_g = nn.Linear(hp.vits.spk_dim, hp.vits.gin_channels)
        self.enc_p = TextEncoder(
            hp.vits.ppg_dim,
            hp.vits.vec_dim,
            hp.vits.inter_channels,
            hp.vits.hidden_channels,
            hp.vits.filter_channels,
            2,
            6,
            3,
            0.1,
        )
        # Adversarial speaker classifier over prior-encoder features
        # (presumably trained via gradient reversal; see modules_grl).
        self.speaker_classifier = SpeakerClassifier(
            hp.vits.hidden_channels,
            hp.vits.spk_dim,
        )
        self.enc_q = PosteriorEncoder(
            spec_channels,
            hp.vits.inter_channels,
            hp.vits.hidden_channels,
            5,
            1,
            16,
            gin_channels=hp.vits.gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            hp.vits.inter_channels,
            hp.vits.hidden_channels,
            5,
            1,
            4,
            gin_channels=hp.vits.spk_dim
        )
        self.dec = Generator(hp=hp)

    def forward(self, ppg, vec, pit, spec, spk, ppg_l, spec_l):
        """Training pass; returns (audio_slice, slice_ids, spec_mask,
        flow/prior/posterior stats tuple, speaker predictions)."""
        # Content perturbation: additive noise discourages speaker leakage.
        ppg = ppg + torch.randn_like(ppg) * 1  # Perturbation
        vec = vec + torch.randn_like(vec) * 2  # Perturbation
        g = self.emb_g(F.normalize(spk)).unsqueeze(-1)
        z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
            ppg, ppg_l, vec, f0=f0_to_coarse(pit))
        z_q, m_q, logs_q, spec_mask = self.enc_q(spec, spec_l, g=g)

        # Decode only a random slice (plus matching pitch) for GAN training.
        z_slice, pit_slice, ids_slice = commons.rand_slice_segments_with_pitch(
            z_q, pit, spec_l, self.segment_size)
        audio = self.dec(spk, z_slice, pit_slice)

        # Flow in both directions: posterior -> prior space and back.
        z_f, logdet_f = self.flow(z_q, spec_mask, g=spk)
        z_r, logdet_r = self.flow(z_p, spec_mask, g=spk, reverse=True)
        # speaker
        spk_preds = self.speaker_classifier(x)
        return audio, ids_slice, spec_mask, (z_f, z_r, z_p, m_p, logs_p, z_q, m_q, logs_q, logdet_f, logdet_r), spk_preds

    def infer(self, ppg, vec, pit, spk, ppg_l):
        """Inference without posterior: prior sample -> inverse flow -> decoder."""
        ppg = ppg + torch.randn_like(ppg) * 0.0001  # Perturbation
        z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
            ppg, ppg_l, vec, f0=f0_to_coarse(pit))
        z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
        # NOTE(review): forward() calls self.dec positionally while this
        # passes f0= as a keyword — confirm Generator.forward accepts both.
        o = self.dec(spk, z * ppg_mask, f0=pit)
        return o


class SynthesizerInfer(nn.Module):
    """Inference-only synthesizer: prior encoder + inverse flow + decoder,
    without the posterior encoder or speaker classifier."""

    def __init__(
        self,
        spec_channels,
        segment_size,
        hp
    ):
        super().__init__()
        self.segment_size = segment_size
        self.enc_p = TextEncoder(
            hp.vits.ppg_dim,
            hp.vits.vec_dim,
            hp.vits.inter_channels,
            hp.vits.hidden_channels,
            hp.vits.filter_channels,
            2,
            6,
            3,
            0.1,
        )
        self.flow = ResidualCouplingBlock(
            hp.vits.inter_channels,
            hp.vits.hidden_channels,
            5,
            1,
            4,
            gin_channels=hp.vits.spk_dim
        )
        self.dec = Generator(hp=hp)

    def remove_weight_norm(self):
        # Strip weight norm for faster, deterministic inference.
        self.flow.remove_weight_norm()
        self.dec.remove_weight_norm()

    def pitch2source(self, f0):
        # Delegates harmonic-source generation to the decoder.
        return self.dec.pitch2source(f0)

    def source2wav(self, source):
        return self.dec.source2wav(source)

    def inference(self, ppg, vec, pit, spk, ppg_l, source):
        """Full conversion: content features + target speaker + source excitation."""
        z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
            ppg, ppg_l, vec, f0=f0_to_coarse(pit))
        z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
        o = self.dec.inference(spk, z * ppg_mask, source)
        return o


import torch
from torch import nn
from torch.nn import functional as F
from vits import commons


LRELU_SLOPE = 0.1
nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." + + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + 
class WN(torch.nn.Module):
    """Non-causal WaveNet-style residual stack (VITS flavour).

    Each of the ``n_layers`` blocks applies a dilated weight-normed Conv1d,
    a fused tanh/sigmoid gate (optionally conditioned on ``g``), dropout, and
    a 1x1 conv split into a residual path and a skip path. The sum of all
    skip outputs is returned.

    Args:
        hidden_channels: channel width of the stack.
        kernel_size: dilated conv kernel width; must be odd for symmetric padding.
        dilation_rate: dilation grows as ``dilation_rate ** i`` per layer.
        n_layers: number of gated residual blocks.
        gin_channels: global-conditioning channels; 0 disables conditioning.
        p_dropout: dropout probability on the gated activations.
    """

    def __init__(
        self,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
        p_dropout=0,
    ):
        super(WN, self).__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        # BUG FIX: a stray trailing comma (`(kernel_size,)`) stored a 1-tuple
        # instead of the int. The attribute is informational, but a tuple here
        # misleads any external reader of the hyper-parameters.
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One wide 1x1 conv produces the (tanh, sigmoid) conditioning for
            # every layer at once; slices are taken per layer in forward().
            cond_layer = torch.nn.Conv1d(
                gin_channels, 2 * hidden_channels * n_layers, 1
            )
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")

        for i in range(n_layers):
            dilation = dilation_rate**i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(
                hidden_channels,
                2 * hidden_channels,
                kernel_size,
                dilation=dilation,
                padding=padding,
            )
            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
            self.in_layers.append(in_layer)

            # The last layer only needs the skip half; earlier layers emit
            # residual + skip stacked along channels.
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        """x: (B, hidden, T); x_mask: 0/1 frame mask; g: optional (B, gin, T).

        Returns the masked sum of all skip connections, shape (B, hidden, T).
        """
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                # First half feeds the residual path, second half the skip sum.
                res_acts = res_skip_acts[:, : self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Strip weight_norm reparametrization for inference/export."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(l)
class ResidualCouplingLayer(nn.Module):
    """Affine coupling flow layer with SNAC speaker normalization.

    The input is split channel-wise into (x0, x1). x0 is speaker-normalized
    and passed through a WN stack to predict an affine transform (m, logs)
    applied to the (also speaker-normalized) x1 half. The speaker statistics
    (speaker_m, speaker_v) come from a 1x1 conv over the speaker embedding g,
    so the flow is "speaker-normalized" rather than globally conditioned.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        p_dropout=0,
        gin_channels=0,
        mean_only=False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        # no use gin_channels
        self.enc = WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
        )
        # Zero-init so the coupling starts as (almost) an identity flow.
        self.post = nn.Conv1d(
            hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()
        # SNAC Speaker-normalized Affine Coupling Layer
        self.snac = nn.Conv1d(gin_channels, 2 * self.half_channels, 1)

    def forward(self, x, x_mask, g=None, reverse=False):
        # g is a per-utterance speaker embedding (B, gin_channels); unsqueeze
        # adds the time axis expected by Conv1d. NOTE(review): g=None would
        # raise here despite the default — callers appear to always pass g.
        speaker = self.snac(g.unsqueeze(-1))
        speaker_m, speaker_v = speaker.chunk(2, dim=1)  # (B, half_channels, 1)
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        # x0 norm
        x0_norm = (x0 - speaker_m) * torch.exp(-speaker_v) * x_mask
        h = self.pre(x0_norm) * x_mask
        # don't use global condition
        h = self.enc(h, x_mask)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            # mean-only coupling: unit scale, logdet contribution from logs is 0.
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            # x1 norm before affine xform
            x1_norm = (x1 - speaker_m) * torch.exp(-speaker_v) * x_mask
            x1 = (m + x1_norm * torch.exp(logs)) * x_mask
            x = torch.cat([x0, x1], 1)
            # speaker var to logdet: the speaker normalization scales x1 by
            # exp(-speaker_v) per frame, so its log-det is subtracted from the
            # affine term (summed over channels and masked time steps).
            logdet = torch.sum(logs * x_mask, [1, 2]) - torch.sum(
                speaker_v.expand(-1, -1, logs.size(-1)) * x_mask, [1, 2])
            return x, logdet
        else:
            # Inverse affine, then undo the speaker normalization.
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            # x1 denorm before output
            x1 = (speaker_m + x1 * torch.exp(speaker_v)) * x_mask
            x = torch.cat([x0, x1], 1)
            # speaker var to logdet — NOTE(review): returning a logdet in the
            # reverse pass is unusual for coupling flows; presumably callers
            # ignore it during inference — confirm before relying on it.
            logdet = torch.sum(logs * x_mask, [1, 2]) + torch.sum(
                speaker_v.expand(-1, -1, logs.size(-1)) * x_mask, [1, 2])
            return x, logdet

    def remove_weight_norm(self):
        # Only the inner WN stack carries weight_norm parametrizations.
        self.enc.remove_weight_norm()
class SpeakerClassifier(nn.Module):
    """Adversarial speaker classifier behind a gradient-reversal layer.

    Forward pass classifies speakers from frame features; in the backward
    pass the reversed gradient pushes the upstream encoder to *remove*
    speaker information (domain-adversarial training).
    """

    def __init__(self, embed_dim, spk_dim):
        super(SpeakerClassifier, self).__init__()
        stages = [
            GradientReversal(lambda_reversal=1),
            weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)),
            nn.ReLU(),
            weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)),
            nn.ReLU(),
            weight_norm(nn.Conv1d(embed_dim, spk_dim, kernel_size=5, padding=2)),
        ]
        self.classifier = nn.Sequential(*stages)

    def forward(self, x):
        """x: (B, embed_dim, len) -> per-utterance speaker logits (B, spk_dim)."""
        frame_logits = self.classifier(x)   # (B, spk_dim, len)
        # Average over time to get one prediction per utterance.
        return frame_logits.mean(dim=-1)
", torch.min(y)) + if torch.max(y) > 1.0: + print("max value is ", torch.max(y)) + + global hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( + dtype=y.dtype, device=y.device + ) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + global mel_basis + dtype_device = str(spec.dtype) + "_" + str(spec.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( + dtype=spec.dtype, device=spec.device + ) + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + return spec + + +def mel_spectrogram_torch( + y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False +): + if torch.min(y) < -1.0: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.0: + print("max value is ", torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = 
def load_wav_to_torch(full_path):
    """Read a wav file and return (float32 sample tensor, sampling_rate).

    Samples keep the raw scale produced by scipy (e.g. int16 range for PCM
    wavs); no normalization is applied here.
    """
    sampling_rate, data = read(full_path)
    return torch.FloatTensor(data.astype(np.float32)), sampling_rate


# F0 quantization setup: 256 bins over a mel-scaled 50–1100 Hz range.
# Bin 0 is reserved; valid coarse values are 1..255 (1 = unvoiced/low).
f0_bin = 256
f0_max = 1100.0
f0_min = 50.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)


def f0_to_coarse(f0):
    """Quantize F0 (Hz) to integer bins in [1, 255] on a mel scale.

    Accepts either a torch.Tensor or a numpy array and returns the matching
    type. Unvoiced frames (f0 == 0) map to bin 1.
    """
    is_torch = isinstance(f0, torch.Tensor)
    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * \
        np.log(1 + f0 / 700)
    # Linearly rescale voiced frames into (1, f0_bin - 1].
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * \
        (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1

    # Clamp: everything at/below the floor becomes bin 1, cap at 255.
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
    # BUG FIX: `np.int` was deprecated in NumPy 1.20 and removed in 1.24,
    # making the numpy branch raise AttributeError; use the explicit int64.
    f0_coarse = (
        f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int64)
    assert f0_coarse.max() <= 255 and f0_coarse.min(
    ) >= 1, (f0_coarse.max(), f0_coarse.min())
    return f0_coarse
0000000000000000000000000000000000000000..e9663595cc28938f88d6299acd3ba791542e4c0c --- /dev/null +++ b/vits_decoder/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 NVIDIA CORPORATION. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/vits_decoder/__init__.py b/vits_decoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..986a0cfe522626f45f6c2d4dede44374c86bbe71 --- /dev/null +++ b/vits_decoder/__init__.py @@ -0,0 +1 @@ +from .alias.act import SnakeAlias \ No newline at end of file diff --git a/vits_decoder/alias/LICENSE-alias.txt b/vits_decoder/alias/LICENSE-alias.txt new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/vits_decoder/alias/LICENSE-alias.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. 
Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/vits_decoder/alias/LICENSE-snake.txt b/vits_decoder/alias/LICENSE-snake.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c28182ace9ed5b2d9c8ee4b0e003d1f6f10c757 --- /dev/null +++ b/vits_decoder/alias/LICENSE-snake.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Edward Dixon + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vits_decoder/alias/__init__.py b/vits_decoder/alias/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a2318b63198250856809c0cb46210a4147b829bc --- /dev/null +++ b/vits_decoder/alias/__init__.py @@ -0,0 +1,6 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. 
class SnakeBeta(nn.Module):
    '''
    Snake activation with independent frequency and magnitude parameters.

    Computes ``x + (1 / (beta + eps)) * sin(alpha * x)^2`` per channel, where
    ``alpha`` controls the frequency and ``beta`` the magnitude of the
    periodic component (Ziyin, Hartwig, Ueda — https://arxiv.org/abs/2006.08195).

    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha, beta: per-channel trainable parameters; when
          ``alpha_logscale`` is True they are stored in log space (so an
          initial value of zero corresponds to an effective value of one).
    Examples:
        >>> a1 = snakebeta(256)
        >>> x = torch.randn(256)
        >>> x = a1(x)
    '''

    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
        '''
        in_features: channel count of the input.
        alpha: initial scale for both parameters (higher = higher frequency
            and magnitude); trained with the model unless alpha_trainable=False.
        '''
        super(SnakeBeta, self).__init__()
        self.in_features = in_features
        self.alpha_logscale = alpha_logscale
        # Log-scale parameters start at zero (exp -> 1); linear-scale at one.
        base = torch.zeros(in_features) if alpha_logscale else torch.ones(in_features)
        self.alpha = Parameter(base * alpha)
        self.beta = Parameter(base * alpha)
        self.alpha.requires_grad = alpha_trainable
        self.beta.requires_grad = alpha_trainable
        # Guards the division when beta underflows toward zero.
        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        '''
        Elementwise SnakeBeta: x + 1/b * sin^2(x * a), broadcast over (B, C, T).
        '''
        freq = self.alpha.unsqueeze(0).unsqueeze(-1)   # (1, C, 1) to match x
        mag = self.beta.unsqueeze(0).unsqueeze(-1)
        if self.alpha_logscale:
            freq = torch.exp(freq)
            mag = torch.exp(mag)
        return x + (1.0 / (mag + self.no_div_by_zero)) * pow(sin(x * freq), 2)
+ """ + + def __init__(self): + super().__init__() + + def forward(self, x): + return x * torch.tanh(F.softplus(x)) + + +class SnakeAlias(nn.Module): + def __init__(self, + channels, + up_ratio: int = 2, + down_ratio: int = 2, + up_kernel_size: int = 12, + down_kernel_size: int = 12): + super().__init__() + self.up_ratio = up_ratio + self.down_ratio = down_ratio + self.act = SnakeBeta(channels, alpha_logscale=True) + self.upsample = UpSample1d(up_ratio, up_kernel_size) + self.downsample = DownSample1d(down_ratio, down_kernel_size) + + # x: [B,C,T] + def forward(self, x): + x = self.upsample(x) + x = self.act(x) + x = self.downsample(x) + + return x \ No newline at end of file diff --git a/vits_decoder/alias/filter.py b/vits_decoder/alias/filter.py new file mode 100644 index 0000000000000000000000000000000000000000..7ad6ea87c1f10ddd94c544037791d7a4634d5ae1 --- /dev/null +++ b/vits_decoder/alias/filter.py @@ -0,0 +1,95 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + +if 'sinc' in dir(torch): + sinc = torch.sinc +else: + # This code is adopted from adefossez's julius.core.sinc under the MIT License + # https://adefossez.github.io/julius/julius/core.html + # LICENSE is in incl_licenses directory. + def sinc(x: torch.Tensor): + """ + Implementation of sinc, i.e. sin(pi * x) / (pi * x) + __Warning__: Different to julius.sinc, the input is multiplied by `pi`! + """ + return torch.where(x == 0, + torch.tensor(1., device=x.device, dtype=x.dtype), + torch.sin(math.pi * x) / math.pi / x) + + +# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License +# https://adefossez.github.io/julius/julius/lowpass.html +# LICENSE is in incl_licenses directory. 
def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):
    """Design a Kaiser-windowed sinc low-pass filter.

    Args:
        cutoff: normalized cutoff frequency in [0, 0.5] (Nyquist = 0.5).
        half_width: transition-band half width, used to pick the Kaiser beta.
        kernel_size: number of taps (even or odd both supported).

    Returns:
        Filter taps of shape [1, 1, kernel_size], normalized to unit DC gain
        (all-zero taps when cutoff == 0).
    """
    even = (kernel_size % 2 == 0)
    half_size = kernel_size // 2

    # Kaiser window design: choose beta from the stop-band attenuation A.
    delta_f = 4 * half_width
    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
    if A > 50.:
        beta = 0.1102 * (A - 8.7)
    elif A >= 21.:
        beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.)
    else:
        beta = 0.
    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)

    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
    if even:
        time = (torch.arange(-half_size, half_size) + 0.5)
    else:
        time = torch.arange(kernel_size) - half_size

    if cutoff == 0:
        # Degenerate all-reject filter.
        # BUG FIX: the old code still divided by filter_.sum() (== 0) in this
        # branch, producing a NaN filter for even kernels and an in-place
        # integer-division error for odd kernels (zeros_like of an int64
        # `time`). Return a clean float zero filter instead.
        taps = torch.zeros(kernel_size)
    else:
        taps = 2 * cutoff * window * sinc(2 * cutoff * time)
        # Normalize filter to have sum = 1, otherwise we will have a small
        # leakage of the constant component in the input signal.
        taps = taps / taps.sum()

    return taps.view(1, 1, kernel_size)
+ super().__init__() + if cutoff < -0.: + raise ValueError("Minimum cutoff must be larger than zero.") + if cutoff > 0.5: + raise ValueError("A cutoff above 0.5 does not make sense.") + self.kernel_size = kernel_size + self.even = (kernel_size % 2 == 0) + self.pad_left = kernel_size // 2 - int(self.even) + self.pad_right = kernel_size // 2 + self.stride = stride + self.padding = padding + self.padding_mode = padding_mode + filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) + self.register_buffer("filter", filter) + + #input [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + if self.padding: + x = F.pad(x, (self.pad_left, self.pad_right), + mode=self.padding_mode) + out = F.conv1d(x, self.filter.expand(C, -1, -1), + stride=self.stride, groups=C) + + return out \ No newline at end of file diff --git a/vits_decoder/alias/resample.py b/vits_decoder/alias/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..750e6c3402cc5ac939c4b9d075246562e0e1d1a7 --- /dev/null +++ b/vits_decoder/alias/resample.py @@ -0,0 +1,49 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. 
class UpSample1d(nn.Module):
    """Anti-aliased upsampling by `ratio` with a Kaiser-sinc kernel."""

    def __init__(self, ratio=2, kernel_size=None):
        super().__init__()
        self.ratio = ratio
        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
        self.stride = ratio
        self.pad = self.kernel_size // ratio - 1
        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
        self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
        self.register_buffer(
            "filter",
            kaiser_sinc_filter1d(cutoff=0.5 / ratio,
                                 half_width=0.6 / ratio,
                                 kernel_size=self.kernel_size))

    def forward(self, x):
        """x: [B, C, T] -> [B, C, T * ratio]."""
        _, C, _ = x.shape
        x = F.pad(x, (self.pad, self.pad), mode='replicate')
        # Multiply by ratio to keep unity passband gain after zero-stuffing.
        x = self.ratio * F.conv_transpose1d(
            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
        return x[..., self.pad_left:-self.pad_right]


class DownSample1d(nn.Module):
    """Anti-aliased downsampling by `ratio`: low-pass, then stride."""

    def __init__(self, ratio=2, kernel_size=None):
        super().__init__()
        self.ratio = ratio
        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
        self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio,
                                       half_width=0.6 / ratio,
                                       stride=ratio,
                                       kernel_size=self.kernel_size)

    def forward(self, x):
        return self.lowpass(x)


def init_weights(m, mean=0.0, std=0.01):
    """Normal-initialize the weights of any Conv* module (matched by class name)."""
    if m.__class__.__name__.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    """'same' padding for an odd kernel at the given dilation."""
    return int((kernel_size * dilation - dilation) / 2)


class AMPBlock(torch.nn.Module):
    """BigVGAN residual block: anti-aliased Snake activations around
    dilated + plain conv pairs."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(AMPBlock, self).__init__()
        # Three dilated convolutions.
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=d,
                               padding=get_padding(kernel_size, d)))
            for d in (dilation[0], dilation[1], dilation[2])
        ])
        self.convs1.apply(init_weights)

        # Three matching dilation-1 convolutions.
        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
            for _ in range(3)
        ])
        self.convs2.apply(init_weights)

        self.num_layers = len(self.convs1) + len(self.convs2)
        # One anti-aliased periodic (SnakeBeta) activation per conv layer.
        self.activations = nn.ModuleList(
            SnakeAlias(channels) for _ in range(self.num_layers))

    def forward(self, x):
        acts1, acts2 = self.activations[::2], self.activations[1::2]
        for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
            residual = x
            x = c2(a2(c1(a1(x))))
            x = x + residual
        return x

    def remove_weight_norm(self):
        for layer in self.convs1:
            remove_weight_norm(layer)
        for layer in self.convs2:
            remove_weight_norm(layer)
class Discriminator(nn.Module):
    """Ensemble of multi-resolution, multi-period and scale discriminators."""

    def __init__(self, hp):
        super(Discriminator, self).__init__()
        self.MRD = MultiResolutionDiscriminator(hp)
        self.MPD = MultiPeriodDiscriminator(hp)
        self.MSD = ScaleDiscriminator()

    def forward(self, x):
        # Each sub-discriminator yields a list of (feature_maps, score) pairs.
        return self.MRD(x) + self.MPD(x) + self.MSD(x)


if __name__ == '__main__':
    hp = OmegaConf.load('../config/base.yaml')
    model = Discriminator(hp)
    dummy = torch.randn(3, 1, 16384)
    print(dummy.shape)
    for features, score in model(dummy):
        for feat in features:
            print(feat.shape)
        print(score.shape)
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(trainable)


class SpeakerAdapter(nn.Module):
    """Conditional layer normalization: normalize features per frame, then
    apply a scale/shift predicted from a speaker embedding."""

    def __init__(self,
                 speaker_dim,
                 adapter_dim,
                 epsilon=1e-5):
        super(SpeakerAdapter, self).__init__()
        self.speaker_dim = speaker_dim
        self.adapter_dim = adapter_dim
        self.epsilon = epsilon
        self.W_scale = nn.Linear(self.speaker_dim, self.adapter_dim)
        self.W_bias = nn.Linear(self.speaker_dim, self.adapter_dim)
        self.reset_parameters()

    def reset_parameters(self):
        # Start as the identity transform: scale == 1, bias == 0 for any embedding.
        torch.nn.init.constant_(self.W_scale.weight, 0.0)
        torch.nn.init.constant_(self.W_scale.bias, 1.0)
        torch.nn.init.constant_(self.W_bias.weight, 0.0)
        torch.nn.init.constant_(self.W_bias.bias, 0.0)

    def forward(self, x, speaker_embedding):
        """x: [B, C, T]; speaker_embedding: [B, speaker_dim] -> [B, C, T]."""
        x = x.transpose(1, -1)  # [B, T, C]
        mu = x.mean(dim=-1, keepdim=True)
        var = ((x - mu) ** 2).mean(dim=-1, keepdim=True)
        normed = (x - mu) / (var + self.epsilon).sqrt()
        normed = normed * self.W_scale(speaker_embedding).unsqueeze(1)
        normed = normed + self.W_bias(speaker_embedding).unsqueeze(1)
        return normed.transpose(1, -1)


class Generator(torch.nn.Module):
    """BigVGAN-style NSF waveform generator with anti-aliased periodic
    activations in the residual blocks."""

    def __init__(self, hp):
        super(Generator, self).__init__()
        self.hp = hp
        self.num_kernels = len(hp.gen.resblock_kernel_sizes)
        self.num_upsamples = len(hp.gen.upsample_rates)
        # Speaker-conditioned normalization of the input features
        # (spk_dim depends on which speaker encoder is used).
        self.adapter = SpeakerAdapter(hp.vits.spk_dim, hp.gen.upsample_input)
        self.conv_pre = Conv1d(hp.gen.upsample_input,
                               hp.gen.upsample_initial_channel, 7, 1, padding=3)
        # NSF source: F0 upsampled to the waveform rate drives a harmonic
        # excitation signal.
        self.f0_upsamp = torch.nn.Upsample(
            scale_factor=np.prod(hp.gen.upsample_rates))
        self.m_source = SourceModuleHnNSF(sampling_rate=hp.data.sampling_rate)
        self.noise_convs = nn.ModuleList()
        # Transposed-conv upsamplers (no anti-aliasing applied here).
        self.ups = nn.ModuleList()
        for idx, (rate, kernel) in enumerate(zip(hp.gen.upsample_rates,
                                                 hp.gen.upsample_kernel_sizes)):
            in_ch = hp.gen.upsample_initial_channel // (2 ** idx)
            out_ch = hp.gen.upsample_initial_channel // (2 ** (idx + 1))
            self.ups.append(weight_norm(
                ConvTranspose1d(in_ch, out_ch, kernel, rate,
                                padding=(kernel - rate) // 2)))
            # The excitation is injected at every scale; strided convs bring
            # it down to the intermediate rates.
            if idx + 1 < len(hp.gen.upsample_rates):
                stride_f0 = int(np.prod(hp.gen.upsample_rates[idx + 1:]))
                self.noise_convs.append(
                    Conv1d(1, out_ch,
                           kernel_size=stride_f0 * 2,
                           stride=stride_f0,
                           padding=stride_f0 // 2))
            else:
                self.noise_convs.append(Conv1d(1, out_ch, kernel_size=1))

        # Anti-aliased multi-periodicity (AMP) residual blocks.
        self.resblocks = nn.ModuleList()
        for idx in range(len(self.ups)):
            ch = hp.gen.upsample_initial_channel // (2 ** (idx + 1))
            for kernel, dil in zip(hp.gen.resblock_kernel_sizes,
                                   hp.gen.resblock_dilation_sizes):
                self.resblocks.append(AMPBlock(ch, kernel, dil))

        self.activation_post = SnakeAlias(ch)
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

    def forward(self, spk, x, f0):
        """Training forward: perturb the input, then run the shared path."""
        x = x + torch.randn_like(x)
        return self.inference(spk, x, self.pitch2source(f0))

    def remove_weight_norm(self):
        for layer in self.ups:
            remove_weight_norm(layer)
        for block in self.resblocks:
            block.remove_weight_norm()

    def eval(self, inference=False):
        super(Generator, self).eval()
        # Only strip weight norm for real inference, not for validation
        # during training.
        if inference:
            self.remove_weight_norm()

    def pitch2source(self, f0):
        """F0 [B, len] -> harmonic excitation [B, 1, len * prod(rates)]."""
        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # [B, len', 1]
        return self.m_source(f0).transpose(1, 2)          # [B, 1, len']

    def source2wav(self, audio):
        """Scale a [-1, 1] source to int16 PCM (numpy)."""
        MAX_WAV_VALUE = 32768.0
        wav = audio.squeeze() * MAX_WAV_VALUE
        wav = wav.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1).short()
        return wav.cpu().detach().numpy()

    def inference(self, spk, x, har_source):
        """Shared synthesis path given a precomputed excitation."""
        x = self.adapter(x, spk)
        x = self.conv_pre(x)
        x = x * torch.tanh(F.softplus(x))  # Mish

        for idx in range(self.num_upsamples):
            x = self.ups[idx](x)
            x = x + self.noise_convs[idx](har_source)
            # Average the num_kernels parallel AMP stacks at this scale.
            acc = None
            for j in range(self.num_kernels):
                out = self.resblocks[idx * self.num_kernels + j](x)
                acc = out if acc is None else acc + out
            x = acc / self.num_kernels

        x = self.activation_post(x)
        x = self.conv_post(x)
        return torch.tanh(x)
class MelspecDiscriminator(torch.nn.Module):
    """Discriminator operating on log-mel spectrograms (frequency domain)."""

    def __init__(self) -> None:
        super().__init__()
        self.SAMPLE_RATE = 48000
        # Mel filterbank transform (magnitude, power=1).
        self._melspec = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.SAMPLE_RATE,
            n_fft=2048,
            win_length=int(0.025 * self.SAMPLE_RATE),
            hop_length=int(0.010 * self.SAMPLE_RATE),
            n_mels=128,
            power=1,
        )

        # Time-frequency 2D conv stack; GLU halves 64 channels to 32.
        kernel_sizes = [(7, 7), (4, 4), (4, 4), (4, 4)]
        strides = [(1, 2), (1, 2), (1, 2), (1, 2)]
        self._convs = torch.nn.ModuleList([
            torch.nn.Sequential(
                torch.nn.Conv2d(
                    in_channels=1 if i == 0 else 32,
                    out_channels=64,
                    kernel_size=k,
                    stride=s,
                    padding=(1, 2),
                    bias=False,
                ),
                torch.nn.BatchNorm2d(num_features=64),
                torch.nn.GLU(dim=1),
            )
            for i, (k, s) in enumerate(zip(kernel_sizes, strides))
        ])

        # Adversarial output projection.
        self._postnet = torch.nn.Conv2d(
            in_channels=32,
            out_channels=1,
            kernel_size=(15, 3),
            stride=(1, 2),
        )

    def forward(self, x: torch.Tensor) -> T.Tuple[torch.Tensor, T.List[torch.Tensor]]:
        # Log-scale mel spectrogram input.
        x = torch.log(self._melspec(x) + 1e-5)

        feature_maps = []
        for conv in self._convs:
            x = conv(x)
            feature_maps.append(x)

        # Projection followed by global average pooling over (freq, time).
        score = self._postnet(x).mean(dim=[-2, -1])
        return [(feature_maps, score)]


class DiscriminatorP(nn.Module):
    """Period discriminator: folds the waveform into [T/period, period]
    and applies 2D convs along the time axis."""

    def __init__(self, hp, period):
        super(DiscriminatorP, self).__init__()
        self.LRELU_SLOPE = hp.mpd.lReLU_slope
        self.period = period
        k = hp.mpd.kernel_size
        s = hp.mpd.stride
        norm_f = spectral_norm if hp.mpd.use_spectral_norm else weight_norm

        channel_plan = [(1, 64, s), (64, 128, s), (128, 256, s),
                        (256, 512, s), (512, 1024, 1)]
        self.convs = nn.ModuleList([
            norm_f(nn.Conv2d(c_in, c_out, (k, 1), (stride, 1),
                             padding=(k // 2, 0)))
            for c_in, c_out, stride in channel_plan
        ])
        self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []
        b, c, t = x.shape
        # Right-pad so t divides the period, then fold 1D -> 2D.
        if t % self.period != 0:
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), self.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return fmap, torch.flatten(x, 1, -1)


class MultiPeriodDiscriminator(nn.Module):
    """Bank of DiscriminatorP instances, one per configured period."""

    def __init__(self, hp):
        super(MultiPeriodDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList(
            DiscriminatorP(hp, period) for period in hp.mpd.periods)

    def forward(self, x):
        # One (feature_maps, score) pair per period.
        return [disc(x) for disc in self.discriminators]
class DiscriminatorR(torch.nn.Module):
    """Single-resolution spectrogram discriminator.

    resolution is a (n_fft, hop_length, win_length) triple.
    NOTE: the LeakyReLU slope is read from hp.mpd (not hp.mrd), matching
    the original configuration layout.
    """

    def __init__(self, hp, resolution):
        super(DiscriminatorR, self).__init__()
        self.resolution = resolution
        self.LRELU_SLOPE = hp.mpd.lReLU_slope

        norm_f = weight_norm if hp.mrd.use_spectral_norm == False else spectral_norm

        self.convs = nn.ModuleList([
            norm_f(nn.Conv2d(1, 32, (3, 9), padding=(1, 4))),
            norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))),
            norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))),
            norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))),
            norm_f(nn.Conv2d(32, 32, (3, 3), padding=(1, 1))),
        ])
        self.conv_post = norm_f(nn.Conv2d(32, 1, (3, 3), padding=(1, 1)))

    def forward(self, x):
        """x: [B, 1, T] -> (feature_maps, flattened score)."""
        fmap = []
        x = self.spectrogram(x)
        x = x.unsqueeze(1)
        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, self.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)
        return fmap, x

    def spectrogram(self, x):
        """Magnitude STFT at this discriminator's resolution -> [B, F, TT]."""
        n_fft, hop_length, win_length = self.resolution
        pad = int((n_fft - hop_length) / 2)
        x = F.pad(x, (pad, pad), mode='reflect')
        x = x.squeeze(1)
        # return_complex=True + .abs() replaces the deprecated
        # return_complex=False + torch.norm(..., dim=-1); numerically identical.
        spec = torch.stft(x, n_fft=n_fft, hop_length=hop_length,
                          win_length=win_length, center=False,
                          return_complex=True)
        return spec.abs()


class MultiResolutionDiscriminator(torch.nn.Module):
    """Bank of DiscriminatorR instances, one per STFT resolution."""

    def __init__(self, hp):
        super(MultiResolutionDiscriminator, self).__init__()
        import ast
        # literal_eval instead of eval(): parses the
        # "[(n_fft, hop, win), ...]" config string without the arbitrary
        # code execution risk of eval on config data.
        self.resolutions = ast.literal_eval(hp.mrd.resolutions)
        self.discriminators = nn.ModuleList(
            [DiscriminatorR(hp, resolution) for resolution in self.resolutions])

    def forward(self, x):
        # One (feature_maps, score) pair per resolution.
        return [disc(x) for disc in self.discriminators]
class ScaleDiscriminator(torch.nn.Module):
    """Raw-waveform discriminator with progressively strided grouped convs."""

    def __init__(self):
        super(ScaleDiscriminator, self).__init__()
        # (in_ch, out_ch, kernel, stride, groups, padding)
        conv_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList([
            weight_norm(nn.Conv1d(ci, co, k, s, groups=g, padding=p))
            for ci, co, k, s, g, p in conv_specs
        ])
        self.conv_post = weight_norm(nn.Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """x: [B, 1, T] -> [(feature_maps, flattened score)]."""
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), 0.1)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return [(fmap, torch.flatten(x, 1, -1))]
class PulseGen(torch.nn.Module):
    """Pulse-train generator built on top of SineGen.

    NOTE(review): this forward unpacks three values from self.l_sinegen,
    but SineGen.forward in this file returns only the sine tensor --
    PulseGen looks stale/unused here; confirm before relying on it.
    """

    def __init__(self, samp_rate, pulse_amp=0.1, noise_std=0.003, voiced_threshold=0):
        super(PulseGen, self).__init__()
        self.pulse_amp = pulse_amp
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold
        self.noise_std = noise_std
        self.l_sinegen = SineGen(
            self.sampling_rate,
            harmonic_num=0,
            sine_amp=self.pulse_amp,
            noise_std=0,
            voiced_threshold=self.voiced_threshold,
            flag_for_pulse=True,
        )

    def forward(self, f0):
        """f0: (B=1, length, 1), zero at unvoiced steps.

        Returns (pulse_train, sine_wav, uv, pulse_noise). The sine
        generator does not pin the initial phase of a voiced segment to
        np.pi, so the first pulse of a segment may not land on its first
        time step.
        """
        with torch.no_grad():
            sine_wav, uv, noise = self.l_sinegen(f0)
            pure_sine = sine_wav - noise  # sine without additive noise

            # A pulse sits where the sine is a local maximum inside a voiced
            # run (sine[t] > sine[t-1] and sine[t] > sine[t+1], all voiced),
            # or at the first voiced step after an unvoiced run. torch.roll
            # simulates the t-1 / t+1 neighbors.
            prev_sine = torch.roll(pure_sine, shifts=1, dims=1)
            prev_uv = torch.roll(uv, shifts=1, dims=1)
            prev_uv[:, 0, :] = 0
            next_sine = torch.roll(pure_sine, shifts=-1, dims=1)
            next_uv = torch.roll(uv, shifts=-1, dims=1)
            next_uv[:, -1, :] = 0

            loc = (pure_sine > prev_sine) * (pure_sine > next_sine) \
                * (prev_uv > 0) * (next_uv > 0) * (uv > 0) \
                + (prev_uv < 1) * (uv > 0)

            pulse_train = pure_sine * loc

            # Fresh noise for the pulses (SineGen's noise is zero in voiced
            # regions), applied on the pulses and in unvoiced regions.
            pulse_noise = torch.randn_like(pure_sine) * self.noise_std
            pulse_train += pulse_noise * loc + pulse_noise * (1 - uv)
        return pulse_train, sine_wav, uv, pulse_noise


class SignalsConv1d(torch.nn.Module):
    """Causal convolution of a batched signal with a shared impulse response.

    Unlike a fixed-weight FIR layer, this convolves two runtime signals,
    implemented via torch.nn.functional.conv1d with one group per feature dim.
    """

    def __init__(self):
        super(SignalsConv1d, self).__init__()

    def forward(self, signal, system_ir):
        """signal: (B, length1, dim); system_ir: (length2, dim) -> (B, length1, dim)."""
        if signal.shape[-1] != system_ir.shape[-1]:
            print("Error: SignalsConv1d expects shape:")
            print("signal (batchsize, length1, dim)")
            print("system_id (batchsize, length2, dim)")
            print("But received signal: {:s}".format(str(signal.shape)))
            print(" system_ir: {:s}".format(str(system_ir.shape)))
            sys.exit(1)
        padding_length = system_ir.shape[0] - 1
        groups = signal.shape[-1]

        # Left-pad for causality.
        signal_pad = torch_nn_func.pad(signal.permute(0, 2, 1),
                                       (padding_length, 0))
        # Impulse response as (dim, 1, length2), time-reversed so conv1d's
        # cross-correlation realizes a true convolution.
        ir = torch.flip(system_ir.unsqueeze(1).permute(2, 1, 0), dims=[2])
        out = torch_nn_func.conv1d(signal_pad, ir, groups=groups)
        return out.permute(0, 2, 1)
class CyclicNoiseGen_v1(torch.nn.Module):
    """Cyclic noise generator with a single decay parameter beta.

    This v1 implementation assumes the decay factor is fixed per utterance
    (it uses the mean F0 of the voiced frames).
    NOTE(review): depends on PulseGen, whose SineGen call signature looks
    stale in this file -- confirm before using this module.
    """

    def __init__(self, samp_rate, noise_std=0.003, voiced_threshold=0):
        super(CyclicNoiseGen_v1, self).__init__()
        self.samp_rate = samp_rate
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold

        self.l_pulse = PulseGen(
            samp_rate,
            pulse_amp=1.0,
            noise_std=noise_std,
            voiced_threshold=voiced_threshold,
        )
        self.l_conv = SignalsConv1d()

    def noise_decay(self, beta, f0mean):
        """Exponentially decayed noise: n[t] * exp(-t * f_mean / beta / samp_rate).

        beta: (dim=1) or (B=1, 1, dim=1); f0mean: (B=1, 1, dim=1).
        Returns (B=1, length, dim=1).
        """
        with torch.no_grad():
            # Truncate where the envelope has decayed by ~-40 dB:
            # exp(-n/T) < 0.01  =>  n > -log(0.01) * T = 4.6 * T.
            length = 4.6 * self.samp_rate / f0mean
            length = length.int()
            time_idx = torch.arange(0, length, device=beta.device)
            time_idx = time_idx.unsqueeze(0).unsqueeze(2)
            time_idx = time_idx.repeat(beta.shape[0], 1, beta.shape[2])

            noise = torch.randn(time_idx.shape, device=beta.device)

            # Mean F0 stands in for the per-frame F0 in this v1 variant.
            decay = torch.exp(-time_idx * f0mean / beta / self.samp_rate)
        return noise * self.noise_std * decay

    def forward(self, f0s, beta):
        """Produce cyclic noise; returns (cyc_noise, pulse_train, sine_wav, uv, noise)."""
        pulse_train, sine_wav, uv, noise = self.l_pulse(f0s)
        pure_pulse = pulse_train - noise

        if (uv < 1).all():
            # Entirely unvoiced input: no cyclic component.
            cyc_noise = torch.zeros_like(sine_wav)
        else:
            f0mean = f0s[uv > 0].mean()
            decayed_noise = self.noise_decay(beta, f0mean)[0, :, :]
            # Convolve the pulse train with the decayed-noise kernel.
            cyc_noise = self.l_conv(pure_pulse, decayed_noise)

        # Plain noise in unvoiced segments.
        cyc_noise = cyc_noise + noise * (1.0 - uv)
        return cyc_noise, pulse_train, sine_wav, uv, noise


class SineGen(torch.nn.Module):
    """Sine-wave generator.

    Constructor args:
        samp_rate: sampling rate in Hz.
        harmonic_num: number of harmonic overtones (default 0).
        sine_amp: amplitude of the sine waveform (default 0.1).
        noise_std: std of additive Gaussian noise (default 0.003).
        voiced_threshold: F0 threshold for voiced/unvoiced decision (default 0).
        flag_for_pulse: set True when driven by PulseGen; each voiced
            segment then starts at sin(np.pi) / cos(0) (default False).
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.dim = self.harmonic_num + 1  # fundamental + overtones
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse

    def _f02uv(self, f0):
        # Voiced/unvoiced mask: 1.0 where f0 exceeds the threshold.
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def _f02sine(self, f0_values):
        """f0_values: (B, length, dim) -> sines of the same shape.

        dim indexes the fundamental tone and its overtones.
        """
        # Per-step phase increment in revolutions; the integer part is
        # irrelevant since 2*pi*n leaves the phase unchanged.
        rad_values = (f0_values / self.sampling_rate) % 1

        # Random initial phase per harmonic; the fundamental keeps phase 0.
        rand_ini = torch.rand(
            f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
        rand_ini[:, 0] = 0
        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini

        if not self.flag_for_pulse:
            # Normal case. Subtract 1 wherever the wrapped cumulative phase
            # crosses 1, so torch.cumsum does not overflow numerically on
            # long sequences; (x - 1) * 2*pi keeps the sine unchanged.
            tmp_over_one = torch.cumsum(rad_values, 1) % 1
            tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0

            sines = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
        else:
            # Pulse mode: restart the accumulated phase at every voiced
            # segment so each segment begins at cos(0).
            uv = self._f02uv(f0_values)
            uv_1 = torch.roll(uv, shifts=-1, dims=1)
            uv_1[:, -1, :] = 1
            # Last time step of each unvoiced run.
            u_loc = (uv < 1) * (uv_1 > 0)

            tmp_cumsum = torch.cumsum(rad_values, dim=1)
            # Each batch item is processed separately: store, at every
            # unvoiced-run end, the phase accumulated within that run.
            for idx in range(f0_values.shape[0]):
                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
                tmp_cumsum[idx, :, :] = 0
                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum

            # cumsum(rad - correction) removes the phase accumulated before
            # the current voiced segment.
            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
            sines = torch.cos(i_phase * 2 * np.pi)
        return sines

    def forward(self, f0):
        """f0: (B=1, length, 1), zero at unvoiced steps -> (B=1, length, dim) sines."""
        with torch.no_grad():
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
                                 device=f0.device)
            # Fundamental plus integer harmonics.
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in np.arange(self.harmonic_num):
                # idx + 2 == frequency multiple of the (idx+1)-th overtone.
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)

            sine_waves = self._f02sine(f0_buf) * self.sine_amp
            uv = self._f02uv(f0)

            # Unvoiced noise floor ~ sine_amp/3; voiced regions use noise_std.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)

            # Zero the sine where unvoiced, then add the noise.
            sine_waves = sine_waves * uv + noise
        return sine_waves


class SourceModuleCycNoise_v1(torch.nn.Module):
    """Cyclic-noise source module (v1).

    cyc, noise, uv = module(F0_upsampled, beta) with
    F0_upsampled: (B, length, 1); beta: (1);
    cyc/noise/uv: (B, length, 1).
    """

    def __init__(self, sampling_rate, noise_std=0.003, voiced_threshod=0):
        super(SourceModuleCycNoise_v1, self).__init__()
        self.sampling_rate = sampling_rate
        self.noise_std = noise_std
        self.l_cyc_gen = CyclicNoiseGen_v1(sampling_rate, noise_std, voiced_threshod)

    def forward(self, f0_upsamped, beta):
        """Return (cyclic source, noise-branch source, uv mask)."""
        cyc, pulse, sine, uv, add_noi = self.l_cyc_gen(f0_upsamped, beta)
        # Noise branch shares uv's shape.
        noise = torch.randn_like(uv) * self.noise_std / 3
        return cyc, noise, uv
single excitation + self.l_tanh = torch.nn.Tanh() + self.register_buffer('merge_w', torch.FloatTensor([[ + 0.2942, -0.2243, 0.0033, -0.0056, -0.0020, -0.0046, + 0.0221, -0.0083, -0.0241, -0.0036, -0.0581]])) + self.register_buffer('merge_b', torch.FloatTensor([0.0008])) + + def forward(self, x): + """ + Sine_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + """ + # source for harmonic branch + sine_wavs = self.l_sin_gen(x) + sine_wavs = torch_nn_func.linear( + sine_wavs, self.merge_w) + self.merge_b + sine_merge = self.l_tanh(sine_wavs) + return sine_merge diff --git a/vits_extend/__init__.py b/vits_extend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vits_extend/dataloader.py b/vits_extend/dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..5f26fe0e15f719b6594110799f3863e720377150 --- /dev/null +++ b/vits_extend/dataloader.py @@ -0,0 +1,38 @@ +from torch.utils.data import DataLoader +from vits.data_utils import DistributedBucketSampler +from vits.data_utils import TextAudioSpeakerCollate +from vits.data_utils import TextAudioSpeakerSet + + +def create_dataloader_train(hps, n_gpus, rank): + collate_fn = TextAudioSpeakerCollate() + train_dataset = TextAudioSpeakerSet(hps.data.training_files, hps.data) + train_sampler = DistributedBucketSampler( + train_dataset, + hps.train.batch_size, + [150, 300, 450], + num_replicas=n_gpus, + rank=rank, + shuffle=True) + train_loader = DataLoader( + train_dataset, + num_workers=4, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler) + return train_loader + + +def create_dataloader_eval(hps): + collate_fn = TextAudioSpeakerCollate() + eval_dataset = TextAudioSpeakerSet(hps.data.validation_files, hps.data) + eval_loader = DataLoader( + eval_dataset, + num_workers=2, + shuffle=False, + 
def plot_waveform_to_numpy(waveform):
    """Render a 1-D waveform to an RGB image array for TensorBoard.

    Args:
        waveform: 1-D array-like of samples, expected in [-1, 1].

    Returns:
        np.uint8 array of shape (3, H, W) (channel-first RGB).
    """
    fig, ax = plt.subplots(figsize=(12, 4))
    # Fix: the original code had a stray no-argument ax.plot() call here,
    # which silently added an empty line artist to the axes.
    ax.plot(range(len(waveform)), waveform,
            linewidth=0.1, alpha=0.7, color='blue')

    plt.xlabel("Samples")
    plt.ylabel("Amplitude")
    plt.ylim(-1, 1)
    plt.tight_layout()

    fig.canvas.draw()
    data = save_figure_to_numpy(fig)
    plt.close()

    return data
    def __init__(self, filter_length=512, hop_length=160, win_length=512,
                 n_mel_channels=80, sampling_rate=16000, mel_fmin=0.0,
                 mel_fmax=None, center=False, device='cpu'):
        """Mel/linear spectrogram front-end.

        filter_length / hop_length / win_length -- STFT parameters, in samples
        n_mel_channels -- number of mel filterbank bins
        sampling_rate  -- audio sampling rate in Hz
        mel_fmin / mel_fmax -- mel filterbank frequency range
                               (mel_fmax=None lets librosa default to sr/2)
        center -- passed through to torch.stft in the spectrogram methods
        device -- device on which the filterbank and window are first built
        """
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.n_fft = filter_length
        self.hop_size = hop_length
        self.win_size = win_length
        self.fmin = mel_fmin
        self.fmax = mel_fmax
        self.center = center

        # Mel filterbank from librosa; shape (n_mels, n_fft // 2 + 1).
        mel = librosa_mel_fn(
            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax)

        mel_basis = torch.from_numpy(mel).float().to(device)
        hann_window = torch.hann_window(win_length).to(device)

        # Registered as buffers so module.to()/state_dict() manage them; the
        # explicit .to(device) above is then redundant but harmless.
        self.register_buffer('mel_basis', mel_basis)
        self.register_buffer('hann_window', hann_window)
    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        # Inputs must already be normalized waveforms in [-1, 1].
        assert(torch.min(y.data) >= -1)
        assert(torch.max(y.data) <= 1)

        # Reflect-pad by (n_fft - hop) / 2 on each side so that, with
        # center=False below, the frame count lines up with hop-aligned
        # frame-level features.
        y = torch.nn.functional.pad(y.unsqueeze(1),
                                    (int((self.n_fft - self.hop_size) / 2), int((self.n_fft - self.hop_size) / 2)),
                                    mode='reflect')
        y = y.squeeze(1)

        # NOTE(review): return_complex=False is deprecated in recent PyTorch;
        # it returns a real view with a trailing (real, imag) dimension that
        # is consumed immediately below.
        spec = torch.stft(y, self.n_fft, hop_length=self.hop_size, win_length=self.win_size, window=self.hann_window,
                          center=self.center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)

        # Magnitude; the 1e-9 epsilon keeps sqrt() differentiable at zero.
        spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

        # Project linear-frequency bins onto the mel filterbank, then
        # log-compress (dynamic range compression).
        spec = torch.matmul(self.mel_basis, spec)
        spec = self.spectral_normalize_torch(spec)

        return spec
+ +"""STFT-based Loss modules.""" + +import torch +import torch.nn.functional as F + + +def stft(x, fft_size, hop_size, win_length, window): + """Perform STFT and convert to magnitude spectrogram. + Args: + x (Tensor): Input signal tensor (B, T). + fft_size (int): FFT size. + hop_size (int): Hop size. + win_length (int): Window length. + window (str): Window function type. + Returns: + Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). + """ + x_stft = torch.stft(x, fft_size, hop_size, win_length, window, return_complex=False) + real = x_stft[..., 0] + imag = x_stft[..., 1] + + # NOTE(kan-bayashi): clamp is needed to avoid nan or inf + return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) + + +class SpectralConvergengeLoss(torch.nn.Module): + """Spectral convergence loss module.""" + + def __init__(self): + """Initilize spectral convergence loss module.""" + super(SpectralConvergengeLoss, self).__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Spectral convergence loss value. + """ + return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") + + +class LogSTFTMagnitudeLoss(torch.nn.Module): + """Log STFT magnitude loss module.""" + + def __init__(self): + """Initilize los STFT magnitude loss module.""" + super(LogSTFTMagnitudeLoss, self).__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Log STFT magnitude loss value. 
class STFTLoss(torch.nn.Module):
    """Single-resolution STFT loss (spectral convergence + log magnitude)."""

    def __init__(self, device, fft_size=1024, shift_size=120, win_length=600, window="hann_window"):
        """Initialize STFT loss module."""
        super().__init__()
        self.fft_size = fft_size
        self.shift_size = shift_size
        self.win_length = win_length
        # Resolve the window factory by name, e.g. torch.hann_window.
        self.window = getattr(torch, window)(win_length).to(device)
        self.spectral_convergenge_loss = SpectralConvergengeLoss()
        self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()

    def forward(self, x, y):
        """Calculate forward propagation.
        Args:
            x (Tensor): Predicted signal (B, T).
            y (Tensor): Groundtruth signal (B, T).
        Returns:
            Tensor: Spectral convergence loss value.
            Tensor: Log STFT magnitude loss value.
        """
        stft_args = (self.fft_size, self.shift_size, self.win_length, self.window)
        x_mag = stft(x, *stft_args)
        y_mag = stft(y, *stft_args)
        return (self.spectral_convergenge_loss(x_mag, y_mag),
                self.log_stft_magnitude_loss(x_mag, y_mag))
def load_part(model, saved_state_dict):
    """Load a checkpoint into `model`, keeping selected weights untouched.

    Keys whose names start with the prefix 'TODO' keep the model's freshly
    initialized values; every other key is taken from `saved_state_dict`.
    Transparently handles DDP-wrapped models via the .module attribute.

    NOTE(review): 'TODO' looks like a placeholder prefix that was never
    filled in, so as written every parameter is loaded from the checkpoint,
    and a key missing from `saved_state_dict` raises KeyError.
    """
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith('TODO'):
            new_state_dict[k] = v
        else:
            new_state_dict[k] = saved_state_dict[k]
    if hasattr(model, 'module'):
        model.module.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(new_state_dict)
    return model
def train(rank, args, chkpt_path, hp, hp_str):
    """Main GAN training loop for the VITS-based SVC generator/discriminator.

    Arguments:
        rank       -- process / GPU index (rank 0 does all logging and saving)
        args       -- CLI args; uses args.num_gpus and args.name
        chkpt_path -- optional checkpoint path to resume from (None = fresh run)
        hp         -- hyper-parameter namespace parsed from the config file
        hp_str     -- raw config text, stored into checkpoints for comparison
    """

    # Multi-GPU: join the process group before building models on the device.
    if args.num_gpus > 1:
        init_process_group(backend=hp.dist_config.dist_backend, init_method=hp.dist_config.dist_url,
                           world_size=hp.dist_config.world_size * args.num_gpus, rank=rank)

    # Device selection: CUDA > MPS > CPU.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(hp.train.seed)
        device = torch.device('cuda:{:d}'.format(rank))
    elif torch.backends.mps.is_available():
        device = torch.device('mps')
    else:
        device = torch.device('cpu')

    model_g = SynthesizerTrn(
        hp.data.filter_length // 2 + 1,
        hp.data.segment_size // hp.data.hop_length,
        hp).to(device)
    model_d = Discriminator(hp).to(device)

    optim_g = torch.optim.AdamW(model_g.parameters(),
                                lr=hp.train.learning_rate, betas=hp.train.betas, eps=hp.train.eps)
    optim_d = torch.optim.AdamW(model_d.parameters(),
                                lr=hp.train.learning_rate, betas=hp.train.betas, eps=hp.train.eps)

    init_epoch = 1
    step = 0

    # Mel front-end used for the reconstruction (mel L1) loss.
    stft = TacotronSTFT(filter_length=hp.data.filter_length,
                        hop_length=hp.data.hop_length,
                        win_length=hp.data.win_length,
                        n_mel_channels=hp.data.mel_channels,
                        sampling_rate=hp.data.sampling_rate,
                        mel_fmin=hp.data.mel_fmin,
                        mel_fmax=hp.data.mel_fmax,
                        center=False,
                        device=device)
    # define logger, writer, valloader, stft at rank_zero
    if rank == 0:
        pth_dir = os.path.join(hp.log.pth_dir, args.name)
        log_dir = os.path.join(hp.log.log_dir, args.name)
        os.makedirs(pth_dir, exist_ok=True)
        os.makedirs(log_dir, exist_ok=True)

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(os.path.join(log_dir, '%s-%d.log' % (args.name, time.time()))),
                logging.StreamHandler()
            ]
        )
        logger = logging.getLogger()
        writer = MyWriter(hp, log_dir)
        valloader = create_dataloader_eval(hp)

    # Optional warm start from a pretrained model (weights only, no optimizer).
    if os.path.isfile(hp.train.pretrain):
        if rank == 0:
            logger.info("Start from 32k pretrain model: %s" % hp.train.pretrain)
        checkpoint = torch.load(hp.train.pretrain, map_location='cpu')
        load_model(model_g, checkpoint['model_g'])
        load_model(model_d, checkpoint['model_d'])

    # Full resume: weights + optimizer state + epoch/step counters.
    if chkpt_path is not None:
        if rank == 0:
            logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path, map_location='cpu')
        load_model(model_g, checkpoint['model_g'])
        load_model(model_d, checkpoint['model_d'])
        optim_g.load_state_dict(checkpoint['optim_g'])
        optim_d.load_state_dict(checkpoint['optim_d'])
        init_epoch = checkpoint['epoch']
        step = checkpoint['step']

        if rank == 0:
            if hp_str != checkpoint['hp_str']:
                logger.warning("New hparams is different from checkpoint. Will use new.")
    else:
        if rank == 0:
            logger.info("Starting new training run.")

    if args.num_gpus > 1:
        model_g = DistributedDataParallel(model_g, device_ids=[rank])
        model_d = DistributedDataParallel(model_d, device_ids=[rank])

    # this accelerates training when the size of minibatch is always consistent.
    # if not consistent, it'll horribly slow down.
    torch.backends.cudnn.benchmark = True

    # last_epoch = init_epoch - 2 so the first scheduler.step() lands on the
    # resumed epoch's learning rate.
    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hp.train.lr_decay, last_epoch=init_epoch-2)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hp.train.lr_decay, last_epoch=init_epoch-2)

    # NOTE(review): eval() on a config string — fine for trusted local
    # configs, unsafe if the config can come from an untrusted source.
    stft_criterion = MultiResolutionSTFTLoss(device, eval(hp.mrd.resolutions))
    spkc_criterion = nn.CosineEmbeddingLoss()

    trainloader = create_dataloader_train(hp, args.num_gpus, rank)

    for epoch in range(init_epoch, hp.train.epochs+1):

        # Re-seed the bucket sampler so shuffling differs per epoch.
        trainloader.batch_sampler.set_epoch(epoch)

        if rank == 0 and epoch % hp.log.eval_interval == 0:
            with torch.no_grad():
                validate(hp, args, model_g, model_d, valloader, stft, writer, step, device)

        if rank == 0:
            loader = tqdm.tqdm(trainloader, desc='Loading train data')
        else:
            loader = trainloader

        model_g.train()
        model_d.train()

        for ppg, ppg_l, vec, pit, spk, spec, spec_l, audio, audio_l in loader:

            ppg = ppg.to(device)
            vec = vec.to(device)
            pit = pit.to(device)
            spk = spk.to(device)
            spec = spec.to(device)
            audio = audio.to(device)
            ppg_l = ppg_l.to(device)
            spec_l = spec_l.to(device)
            audio_l = audio_l.to(device)

            # generator
            optim_g.zero_grad()

            fake_audio, ids_slice, z_mask, \
                (z_f, z_r, z_p, m_p, logs_p, z_q, m_q, logs_q, logdet_f, logdet_r), spk_preds = model_g(
                    ppg, vec, pit, spec, spk, ppg_l, spec_l)

            # Align the reference audio with the randomly sliced segment the
            # generator produced.
            audio = commons.slice_segments(
                audio, ids_slice * hp.data.hop_length, hp.data.segment_size)  # slice
            # Spk Loss: target +1 pulls spk_preds towards the reference
            # speaker embedding (cosine similarity).
            spk_loss = spkc_criterion(spk, spk_preds, torch.Tensor(spk_preds.size(0))
                                      .to(device).fill_(1.0))
            # Mel Loss
            mel_fake = stft.mel_spectrogram(fake_audio.squeeze(1))
            mel_real = stft.mel_spectrogram(audio.squeeze(1))
            mel_loss = F.l1_loss(mel_fake, mel_real) * hp.train.c_mel

            # Multi-Resolution STFT Loss
            sc_loss, mag_loss = stft_criterion(fake_audio.squeeze(1), audio.squeeze(1))
            stft_loss = (sc_loss + mag_loss) * hp.train.c_stft

            # Generator Loss (LSGAN: fake scores pushed towards 1)
            disc_fake = model_d(fake_audio)
            score_loss = 0.0
            for (_, score_fake) in disc_fake:
                score_loss += torch.mean(torch.pow(score_fake - 1.0, 2))
            score_loss = score_loss / len(disc_fake)

            # Feature Loss (feature matching over discriminator activations)
            disc_real = model_d(audio)
            feat_loss = 0.0
            for (feat_fake, _), (feat_real, _) in zip(disc_fake, disc_real):
                for fake, real in zip(feat_fake, feat_real):
                    feat_loss += torch.mean(torch.abs(fake - real))
            feat_loss = feat_loss / len(disc_fake)
            feat_loss = feat_loss * 2

            # Kl Loss (forward and reverse KL from the flow)
            loss_kl_f = kl_loss(z_f, logs_q, m_p, logs_p, logdet_f, z_mask) * hp.train.c_kl
            loss_kl_r = kl_loss(z_r, logs_p, m_q, logs_q, logdet_r, z_mask) * hp.train.c_kl

            # Loss
            loss_g = score_loss + feat_loss + mel_loss + stft_loss + loss_kl_f + loss_kl_r * 0.5 + spk_loss * 2
            loss_g.backward()
            clip_grad_value_(model_g.parameters(), None)
            optim_g.step()

            # discriminator (generator output detached)
            optim_d.zero_grad()
            disc_fake = model_d(fake_audio.detach())
            disc_real = model_d(audio)

            loss_d = 0.0
            for (_, score_fake), (_, score_real) in zip(disc_fake, disc_real):
                loss_d += torch.mean(torch.pow(score_real - 1.0, 2))
                loss_d += torch.mean(torch.pow(score_fake, 2))
            loss_d = loss_d / len(disc_fake)

            loss_d.backward()
            clip_grad_value_(model_d.parameters(), None)
            optim_d.step()

            step += 1
            # logging
            # NOTE(review): loss_g / loss_d are rebound from tensors to plain
            # floats here — intentional (for logging) but easy to misread.
            loss_g = loss_g.item()
            loss_d = loss_d.item()
            loss_s = stft_loss.item()
            loss_m = mel_loss.item()
            loss_k = loss_kl_f.item()
            loss_r = loss_kl_r.item()
            loss_i = spk_loss.item()

            if rank == 0 and step % hp.log.info_interval == 0:
                writer.log_training(
                    loss_g, loss_d, loss_m, loss_s, loss_k, loss_r, score_loss.item(), step)
                logger.info("epoch %d | g %.04f m %.04f s %.04f d %.04f k %.04f r %.04f i %.04f | step %d" % (
                    epoch, loss_g, loss_m, loss_s, loss_d, loss_k, loss_r, loss_i, step))

        if rank == 0 and epoch % hp.log.save_interval == 0:
            save_path = os.path.join(pth_dir, '%s_%04d.pt'
                                     % (args.name, epoch))
            torch.save({
                'model_g': (model_g.module if args.num_gpus > 1 else model_g).state_dict(),
                'model_d': (model_d.module if args.num_gpus > 1 else model_d).state_dict(),
                'optim_g': optim_g.state_dict(),
                'optim_d': optim_d.state_dict(),
                'step': step,
                'epoch': epoch,
                'hp_str': hp_str,
            }, save_path)
            logger.info("Saved checkpoint to: %s" % save_path)

            # Delete old checkpoints to free disk space.
            def clean_checkpoints(path_to_models=f'{pth_dir}', n_ckpts_to_keep=hp.log.keep_ckpts, sort_by_time=True):
                """Freeing up space by deleting saved ckpts
                Arguments:
                path_to_models    --  Path to the model directory
                n_ckpts_to_keep   --  Number of ckpts to keep, excluding sovits5.0_0.pth
                                      If n_ckpts_to_keep == 0, do not delete any ckpts
                sort_by_time      --  True -> chronologically delete ckpts
                                      False -> lexicographically delete ckpts
                """
                assert isinstance(n_ckpts_to_keep, int) and n_ckpts_to_keep >= 0
                ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))]
                # NOTE(review): non-raw f-string with \d and \. — emits a
                # DeprecationWarning on modern Python; a raw pattern would be safer.
                name_key = (lambda _f: int(re.compile(f'{args.name}_(\d+)\.pt').match(_f).group(1)))
                time_key = (lambda _f: os.path.getmtime(os.path.join(path_to_models, _f)))
                sort_key = time_key if sort_by_time else name_key
                x_sorted = lambda _x: sorted(
                    [f for f in ckpts_files if f.startswith(_x) and not f.endswith('sovits5.0_0.pth')], key=sort_key)
                if n_ckpts_to_keep == 0:
                    to_del = []
                else:
                    to_del = [os.path.join(path_to_models, fn) for fn in x_sorted(f'{args.name}')[:-n_ckpts_to_keep]]
                del_info = lambda fn: logger.info(f"Free up space by deleting ckpt {fn}")
                del_routine = lambda x: [os.remove(x), del_info(x)]
                rs = [del_routine(fn) for fn in to_del]

            clean_checkpoints()

        if rank == 0:
            os.makedirs(f'{pth_dir}', exist_ok=True)
            keep_ckpts = getattr(hp.log, 'keep_ckpts', 0)
            # NOTE(review): clean_checkpoints is only defined inside the
            # save-interval block above; if keep_ckpts > 0 and no checkpoint
            # has been saved yet this run, this call raises NameError.
            if keep_ckpts > 0:
                clean_checkpoints(path_to_models=f'{pth_dir}', n_ckpts_to_keep=hp.log.keep_ckpts, sort_by_time=True)

        scheduler_g.step()
        scheduler_d.step()
def validate(hp, args, generator, discriminator, valloader, stft, writer, step, device):
    """Run one validation pass: accumulate mel L1 loss and log sample audio.

    Puts both models in eval mode and disables cudnn.benchmark for the
    variable-length validation batches; the caller is expected to wrap this
    call in torch.no_grad() (the training loop does).
    """
    generator.eval()
    discriminator.eval()
    torch.backends.cudnn.benchmark = False

    loader = tqdm.tqdm(valloader, desc='Validation loop')
    mel_loss = 0.0
    for idx, (ppg, ppg_l, vec, pit, spk, spec, spec_l, audio, audio_l) in enumerate(loader):
        ppg = ppg.to(device)
        vec = vec.to(device)
        pit = pit.to(device)
        spk = spk.to(device)
        ppg_l = ppg_l.to(device)
        audio = audio.to(device)

        # Unwrap DDP before calling the custom infer() entry point; trim the
        # generated waveform to the reference audio's length.
        if hasattr(generator, 'module'):
            fake_audio = generator.module.infer(ppg, vec, pit, spk, ppg_l)[
                :, :, :audio.size(2)]
        else:
            fake_audio = generator.infer(ppg, vec, pit, spk, ppg_l)[
                :, :, :audio.size(2)]

        mel_fake = stft.mel_spectrogram(fake_audio.squeeze(1))
        mel_real = stft.mel_spectrogram(audio.squeeze(1))

        mel_loss += F.l1_loss(mel_fake, mel_real).item()

        # Log spectrogram figures and audio for the first few items only.
        if idx < hp.log.num_audio:
            spec_fake = stft.linear_spectrogram(fake_audio.squeeze(1))
            spec_real = stft.linear_spectrogram(audio.squeeze(1))

            audio = audio[0][0].cpu().detach().numpy()
            fake_audio = fake_audio[0][0].cpu().detach().numpy()
            spec_fake = spec_fake[0].cpu().detach().numpy()
            spec_real = spec_real[0].cpu().detach().numpy()
            writer.log_fig_audio(
                audio, fake_audio, spec_fake, spec_real, idx, step)

    # NOTE(review): per-batch *mean* losses are summed, then divided by the
    # number of samples — consistent only when the eval batch size is 1;
    # verify against create_dataloader_eval's batch_size.
    mel_loss = mel_loss / len(valloader.dataset)

    writer.log_validation(mel_loss, generator, discriminator, step)

    torch.backends.cudnn.benchmark = True
class MyWriter(SummaryWriter):
    """TensorBoard writer with SVC-specific convenience logging methods."""

    def __init__(self, hp, logdir):
        super().__init__(logdir)
        # Sampling rate used when logging audio clips.
        self.sample_rate = hp.data.sampling_rate

    def log_training(self, g_loss, d_loss, mel_loss, stft_loss, k_loss, r_loss, score_loss, step):
        """Log all per-step training scalars under the train/ namespace."""
        for tag, value in (
                ('train/g_loss', g_loss),
                ('train/d_loss', d_loss),
                ('train/score_loss', score_loss),
                ('train/stft_loss', stft_loss),
                ('train/mel_loss', mel_loss),
                ('train/kl_f_loss', k_loss),
                ('train/kl_r_loss', r_loss)):
            self.add_scalar(tag, value, step)

    def log_validation(self, mel_loss, generator, discriminator, step):
        """Log the aggregate validation mel loss."""
        self.add_scalar('validation/mel_loss', mel_loss, step)

    def log_fig_audio(self, real, fake, spec_fake, spec_real, idx, step):
        """For the first item (idx == 0) only: log waveform/spectrogram images
        and the real/fake audio clips."""
        if idx != 0:
            return
        spec_fake = librosa.amplitude_to_db(spec_fake, ref=np.max, top_db=80.)
        spec_real = librosa.amplitude_to_db(spec_real, ref=np.max, top_db=80.)
        self.add_image(f'spec_fake/{step}', plot_spectrogram_to_numpy(spec_fake), step)
        self.add_image(f'wave_fake/{step}', plot_waveform_to_numpy(fake), step)
        self.add_image(f'spec_real/{step}', plot_spectrogram_to_numpy(spec_real), step)
        self.add_image(f'wave_real/{step}', plot_waveform_to_numpy(real), step)

        self.add_audio(f'fake/{step}', fake, step, self.sample_rate)
        self.add_audio(f'real/{step}', real, step, self.sample_rate)

    def log_histogram(self, model, step):
        """Log a histogram of every named parameter of `model`."""
        for name, param in model.named_parameters():
            self.add_histogram(name.replace('.', '/'), param.cpu().detach().numpy(), step)
diff --git a/whisper/README.md b/whisper/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9ea3a38e58aa56be82a79e31461849083917babb --- /dev/null +++ b/whisper/README.md @@ -0,0 +1,147 @@ +# Whisper + +[[Blog]](https://openai.com/blog/whisper) +[[Paper]](https://arxiv.org/abs/2212.04356) +[[Model card]](https://github.com/openai/whisper/blob/main/model-card.md) +[[Colab example]](https://colab.research.google.com/github/openai/whisper/blob/master/notebooks/LibriSpeech.ipynb) + +Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. + + +## Approach + +![Approach](https://raw.githubusercontent.com/openai/whisper/main/approach.png) + +A Transformer sequence-to-sequence model is trained on various speech processing tasks, including multilingual speech recognition, speech translation, spoken language identification, and voice activity detection. These tasks are jointly represented as a sequence of tokens to be predicted by the decoder, allowing a single model to replace many stages of a traditional speech-processing pipeline. The multitask training format uses a set of special tokens that serve as task specifiers or classification targets. + + +## Setup + +We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.10 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) for their fast tokenizer implementation and [ffmpeg-python](https://github.com/kkroening/ffmpeg-python) for reading audio files. 
You can download and install (or update to) the latest release of Whisper with the following command: + + pip install -U openai-whisper + +Alternatively, the following command will pull and install the latest commit from this repository, along with its Python dependencies: + + pip install git+https://github.com/openai/whisper.git + +To update the package to the latest version of this repository, please run: + + pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git + +It also requires the command-line tool [`ffmpeg`](https://ffmpeg.org/) to be installed on your system, which is available from most package managers: + +```bash +# on Ubuntu or Debian +sudo apt update && sudo apt install ffmpeg + +# on Arch Linux +sudo pacman -S ffmpeg + +# on MacOS using Homebrew (https://brew.sh/) +brew install ffmpeg + +# on Windows using Chocolatey (https://chocolatey.org/) +choco install ffmpeg + +# on Windows using Scoop (https://scoop.sh/) +scoop install ffmpeg +``` + +You may need [`rust`](http://rust-lang.org) installed as well, in case [tokenizers](https://pypi.org/project/tokenizers/) does not provide a pre-built wheel for your platform. If you see installation errors during the `pip install` command above, please follow the [Getting started page](https://www.rust-lang.org/learn/get-started) to install Rust development environment. Additionally, you may need to configure the `PATH` environment variable, e.g. `export PATH="$HOME/.cargo/bin:$PATH"`. If the installation fails with `No module named 'setuptools_rust'`, you need to install `setuptools_rust`, e.g. by running: + +```bash +pip install setuptools-rust +``` + + +## Available models and languages + +There are five model sizes, four with English-only versions, offering speed and accuracy tradeoffs. Below are the names of the available models and their approximate memory requirements and relative speed. 
+ + +| Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed | +|:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:| +| tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~32x | +| base | 74 M | `base.en` | `base` | ~1 GB | ~16x | +| small | 244 M | `small.en` | `small` | ~2 GB | ~6x | +| medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x | +| large | 1550 M | N/A | `large` | ~10 GB | 1x | + +The `.en` models for English-only applications tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models. + +Whisper's performance varies widely depending on the language. The figure below shows a WER (Word Error Rate) breakdown by languages of the Fleurs dataset using the `large-v2` model. More WER and BLEU scores corresponding to the other models and datasets can be found in Appendix D in [the paper](https://arxiv.org/abs/2212.04356). The smaller, the better. + +![WER breakdown by language](https://raw.githubusercontent.com/openai/whisper/main/language-breakdown.svg) + + + +## Command-line usage + +The following command will transcribe speech in audio files, using the `medium` model: + + whisper audio.flac audio.mp3 audio.wav --model medium + +The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option: + + whisper japanese.wav --language Japanese + +Adding `--task translate` will translate the speech into English: + + whisper japanese.wav --language Japanese --task translate + +Run the following to view all available options: + + whisper --help + +See [tokenizer.py](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py) for the list of all available languages. 
+ + +## Python usage + +Transcription can also be performed within Python: + +```python +import whisper + +model = whisper.load_model("base") +result = model.transcribe("audio.mp3") +print(result["text"]) +``` + +Internally, the `transcribe()` method reads the entire file and processes the audio with a sliding 30-second window, performing autoregressive sequence-to-sequence predictions on each window. + +Below is an example usage of `whisper.detect_language()` and `whisper.decode()` which provide lower-level access to the model. + +```python +import whisper + +model = whisper.load_model("base") + +# load audio and pad/trim it to fit 30 seconds +audio = whisper.load_audio("audio.mp3") +audio = whisper.pad_or_trim(audio) + +# make log-Mel spectrogram and move to the same device as the model +mel = whisper.log_mel_spectrogram(audio).to(model.device) + +# detect the spoken language +_, probs = model.detect_language(mel) +print(f"Detected language: {max(probs, key=probs.get)}") + +# decode the audio +options = whisper.DecodingOptions() +result = whisper.decode(model, mel, options) + +# print the recognized text +print(result.text) +``` + +## More examples + +Please use the [🙌 Show and tell](https://github.com/openai/whisper/discussions/categories/show-and-tell) category in Discussions for sharing more example usages of Whisper and third-party extensions such as web demos, integrations with other tools, ports for different platforms, etc. + + +## License + +Whisper's code and model weights are released under the MIT License. See [LICENSE](https://github.com/openai/whisper/blob/main/LICENSE) for further details. 
\ No newline at end of file diff --git a/whisper/__init__.py b/whisper/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/whisper/audio.py b/whisper/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..2dfe105adda10dfe78179edb5e39cc6d3bde39f9 --- /dev/null +++ b/whisper/audio.py @@ -0,0 +1,100 @@ +import os +from functools import lru_cache +from typing import Union + +import librosa +import numpy as np +import torch +import torch.nn.functional as F + +from .utils import exact_div + +from librosa.filters import mel as librosa_mel_fn + +# hard-coded audio hyperparameters +SAMPLE_RATE = 16000 +N_FFT = 400 +N_MELS = 80 +HOP_LENGTH = 160 +CHUNK_LENGTH = 30 +N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000: number of samples in a chunk +N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000: number of frames in a mel spectrogram input + + +def load_audio(file: str, sr: int = SAMPLE_RATE): + x, sr = librosa.load(file, sr=sr) + return x + + +def pad_or_trim(array, length_max: int = N_SAMPLES, length_min: int = N_SAMPLES // 2, *, axis: int = -1): + """ + Pad or trim the audio array to N_SAMPLES, as expected by the encoder. 
+
+    NOTE(review): unlike upstream Whisper, this variant trims anything longer
+    than `length_max` samples but only pads inputs shorter than `length_min`
+    samples, so the returned length may be anywhere in [length_min, length_max].
+    """
+    if torch.is_tensor(array):
+        if array.shape[axis] > length_max:
+            array = array.index_select(dim=axis, index=torch.arange(length_max, device=array.device))
+
+        if array.shape[axis] < length_min:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length_min - array.shape[axis])
+            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
+    else:
+        if array.shape[axis] > length_max:
+            array = array.take(indices=range(length_max), axis=axis)
+
+        if array.shape[axis] < length_min:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length_min - array.shape[axis])
+            array = np.pad(array, pad_widths)
+
+    return array
+
+
+@lru_cache(maxsize=None)
+def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
+    """
+    Load the mel filterbank matrix for projecting an STFT onto Mel bins.
+
+    NOTE(review): upstream Whisper loads this matrix from a bundled
+    `mel_filters.npz` (saved via np.savez_compressed with
+    librosa.filters.mel(sr=16000, n_fft=400, n_mels=80)) to decouple the
+    librosa dependency; this variant computes it directly with
+    `librosa.filters.mel` instead, cached per (device, n_mels) by lru_cache,
+    so librosa is a hard dependency here.
+    """
+    assert n_mels == 80, f"Unsupported n_mels: {n_mels}"
+    return torch.from_numpy(librosa_mel_fn(sr=SAMPLE_RATE,n_fft=N_FFT,n_mels=n_mels)).to(device)
+
+
+def log_mel_spectrogram(audio: Union[str, np.ndarray, torch.Tensor], n_mels: int = N_MELS):
+    """
+    Compute the log-Mel spectrogram of the given audio.
+
+    Parameters
+    ----------
+    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
+        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
+
+    n_mels: int
+        The number of Mel-frequency filters, only 80 is supported
+
+    Returns
+    -------
+    torch.Tensor, shape = (80, n_frames)
+        A Tensor that contains the Mel spectrogram
+    """
+    if not torch.is_tensor(audio):
+        if isinstance(audio, str):
+            audio = load_audio(audio)
+        audio = torch.from_numpy(audio)
+
+    window = torch.hann_window(N_FFT).to(audio.device)
+    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
+    # torch.stft with the default center=True yields one extra frame; the last
+    # frame is dropped before taking the power spectrum (upstream convention)
+    magnitudes = stft[..., :-1].abs() ** 2
+
+    filters = 
mel_filters(audio.device, n_mels) + mel_spec = filters @ magnitudes + + log_spec = torch.clamp(mel_spec, min=1e-10).log10() + log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) + log_spec = (log_spec + 4.0) / 4.0 + return log_spec diff --git a/whisper/decoding.py b/whisper/decoding.py new file mode 100644 index 0000000000000000000000000000000000000000..603546d4c9ff67514d2567576935b974fe373bef --- /dev/null +++ b/whisper/decoding.py @@ -0,0 +1,712 @@ +from dataclasses import dataclass, field +from typing import Dict, List, Tuple, Iterable, Optional, Sequence, Union, TYPE_CHECKING + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor +from torch.distributions import Categorical + +from .audio import CHUNK_LENGTH +from .tokenizer import Tokenizer, get_tokenizer +from .utils import compression_ratio + +if TYPE_CHECKING: + from .model import Whisper + + +@torch.no_grad() +def detect_language(model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None) -> Tuple[Tensor, List[dict]]: + """ + Detect the spoken language in the audio, and return them as list of strings, along with the ids + of the most probable language tokens and the probability distribution over all language tokens. + This is performed outside the main decode loop in order to not interfere with kv-caching. + + Returns + ------- + language_tokens : Tensor, shape = (n_audio,) + ids of the most probable language tokens, which appears after the startoftranscript token. + language_probs : List[Dict[str, float]], length = n_audio + list of dictionaries containing the probability distribution over all languages. 
+ """ + if tokenizer is None: + tokenizer = get_tokenizer(model.is_multilingual) + if tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence: + raise ValueError(f"This model doesn't have language tokens so it can't perform lang id") + + single = mel.ndim == 2 + if single: + mel = mel.unsqueeze(0) + + # skip encoder forward pass if already-encoded audio features were given + if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state): + mel = model.encoder(mel) + + # forward pass using a single token, startoftranscript + n_audio = mel.shape[0] + x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device) # [n_audio, 1] + logits = model.logits(x, mel)[:, 0] + + # collect detected languages; suppress all non-language tokens + mask = torch.ones(logits.shape[-1], dtype=torch.bool) + mask[list(tokenizer.all_language_tokens)] = False + logits[:, mask] = -np.inf + language_tokens = logits.argmax(dim=-1) + language_token_probs = logits.softmax(dim=-1).cpu() + language_probs = [ + { + c: language_token_probs[i, j].item() + for j, c in zip(tokenizer.all_language_tokens, tokenizer.all_language_codes) + } + for i in range(n_audio) + ] + + if single: + language_tokens = language_tokens[0] + language_probs = language_probs[0] + + return language_tokens, language_probs + + +@dataclass(frozen=True) +class DecodingOptions: + task: str = "transcribe" # whether to perform X->X "transcribe" or X->English "translate" + language: Optional[str] = None # language that the audio is in; uses detected language if None + + # sampling-related options + temperature: float = 0.0 + sample_len: Optional[int] = None # maximum number of tokens to sample + best_of: Optional[int] = None # number of independent samples to collect, when t > 0 + beam_size: Optional[int] = None # number of beams in beam search, when t == 0 + patience: Optional[float] = None # patience in beam search (https://arxiv.org/abs/2204.05424) + + # options for ranking generations (either 
beams or best-of-N samples) + length_penalty: Optional[float] = None # "alpha" in Google NMT, None defaults to length norm + + # prompt, prefix, and token suppression + prompt: Optional[Union[str, List[int]]] = None # text or tokens for the previous context + prefix: Optional[Union[str, List[int]]] = None # text or tokens to prefix the current context + suppress_blank: bool = True # this will suppress blank outputs + + # list of tokens ids (or comma-separated token ids) to suppress + # "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()` + suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1" + + # timestamp sampling options + without_timestamps: bool = False # use <|notimestamps|> to sample text tokens only + max_initial_timestamp: Optional[float] = 1.0 # the initial timestamp cannot be later than this + + # implementation details + fp16: bool = True # use fp16 for most of the calculation + + +@dataclass(frozen=True) +class DecodingResult: + audio_features: Tensor + language: str + language_probs: Optional[Dict[str, float]] = None + tokens: List[int] = field(default_factory=list) + text: str = "" + avg_logprob: float = np.nan + no_speech_prob: float = np.nan + temperature: float = np.nan + compression_ratio: float = np.nan + + +class Inference: + def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor: + """Perform a forward pass on the decoder and return per-token logits""" + raise NotImplementedError + + def rearrange_kv_cache(self, source_indices) -> None: + """Update the key-value cache according to the updated beams""" + raise NotImplementedError + + def cleanup_caching(self) -> None: + """Clean up any resources or hooks after decoding is finished""" + pass + + +class PyTorchInference(Inference): + def __init__(self, model: "Whisper", initial_token_length: int): + self.model: "Whisper" = model + self.initial_token_length = initial_token_length + self.kv_cache = {} + self.hooks = [] + + def logits(self, tokens: 
Tensor, audio_features: Tensor) -> Tensor: + if not self.kv_cache: + self.kv_cache, self.hooks = self.model.install_kv_cache_hooks() + + if tokens.shape[-1] > self.initial_token_length: + # only need to use the last token except in the first forward pass + tokens = tokens[:, -1:] + + return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache) + + def cleanup_caching(self): + for hook in self.hooks: + hook.remove() + + self.kv_cache = {} + self.hooks = [] + + def rearrange_kv_cache(self, source_indices): + for module, tensor in self.kv_cache.items(): + # update the key/value cache to contain the selected sequences + self.kv_cache[module] = tensor[source_indices].detach() + + +class SequenceRanker: + def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]) -> List[int]: + """ + Given a list of groups of samples and their cumulative log probabilities, + return the indices of the samples in each group to select as the final result + """ + raise NotImplementedError + + +class MaximumLikelihoodRanker(SequenceRanker): + """ + Select the sample with the highest log probabilities, penalized using either + a simple length normalization or Google NMT paper's length penalty + """ + + def __init__(self, length_penalty: Optional[float]): + self.length_penalty = length_penalty + + def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]): + def scores(logprobs, lengths): + result = [] + for logprob, length in zip(logprobs, lengths): + if self.length_penalty is None: + penalty = length + else: + # from the Google NMT paper + penalty = ((5 + length) / 6) ** self.length_penalty + result.append(logprob / penalty) + return result + + # get the sequence with the highest score + lengths = [[len(t) for t in s] for s in tokens] + return [np.argmax(scores(p, l)) for p, l in zip(sum_logprobs, lengths)] + + +class TokenDecoder: + def reset(self): + """Initialize any stateful variables for decoding a new sequence""" + + def update(self, 
tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]: + """Specify how to select the next token, based on the current trace and logits + + Parameters + ---------- + tokens : Tensor, shape = (n_batch, current_sequence_length) + all tokens in the context so far, including the prefix and sot_sequence tokens + + logits : Tensor, shape = (n_batch, vocab_size) + per-token logits of the probability distribution at the current step + + sum_logprobs : Tensor, shape = (n_batch) + cumulative log probabilities for each sequence + + Returns + ------- + tokens : Tensor, shape = (n_batch, current_sequence_length + 1) + the tokens, appended with the selected next token + + completed : bool + True if all sequences has reached the end of text + + """ + raise NotImplementedError + + def finalize( + self, tokens: Tensor, sum_logprobs: Tensor + ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]: + """Finalize search and return the final candidate sequences + + Parameters + ---------- + tokens : Tensor, shape = (n_audio, n_group, current_sequence_length) + all tokens in the context so far, including the prefix and sot_sequence + + sum_logprobs : Tensor, shape = (n_audio, n_group) + cumulative log probabilities for each sequence + + Returns + ------- + tokens : Sequence[Sequence[Tensor]], length = n_audio + sequence of Tensors containing candidate token sequences, for each audio input + + sum_logprobs : List[List[float]], length = n_audio + sequence of cumulative log probabilities corresponding to the above + + """ + raise NotImplementedError + + +class GreedyDecoder(TokenDecoder): + def __init__(self, temperature: float, eot: int): + self.temperature = temperature + self.eot = eot + + def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]: + temperature = self.temperature + if temperature == 0: + next_tokens = logits.argmax(dim=-1) + else: + next_tokens = Categorical(logits=logits / temperature).sample() + + logprobs 
= F.log_softmax(logits.float(), dim=-1) + current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens] + sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot) + + next_tokens[tokens[:, -1] == self.eot] = self.eot + tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1) + + completed = (tokens[:, -1] == self.eot).all() + return tokens, completed + + def finalize(self, tokens: Tensor, sum_logprobs: Tensor): + # make sure each sequence has at least one EOT token at the end + tokens = F.pad(tokens, (0, 1), value=self.eot) + return tokens, sum_logprobs.tolist() + + +class BeamSearchDecoder(TokenDecoder): + def __init__(self, beam_size: int, eot: int, inference: Inference, patience: Optional[float] = None): + self.beam_size = beam_size + self.eot = eot + self.inference = inference + self.patience = patience or 1.0 + self.max_candidates: int = round(beam_size * self.patience) + self.finished_sequences = None + + assert self.max_candidates > 0, f"Invalid beam size ({beam_size}) or patience ({patience})" + + def reset(self): + self.finished_sequences = None + + def update(self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor) -> Tuple[Tensor, bool]: + if tokens.shape[0] % self.beam_size != 0: + raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0") + + n_audio = tokens.shape[0] // self.beam_size + if self.finished_sequences is None: # for the first update + self.finished_sequences = [{} for _ in range(n_audio)] + + logprobs = F.log_softmax(logits.float(), dim=-1) + next_tokens, source_indices, finished_sequences = [], [], [] + for i in range(n_audio): + scores, sources, finished = {}, {}, {} + + # STEP 1: calculate the cumulative log probabilities for possible candidates + for j in range(self.beam_size): + idx = i * self.beam_size + j + prefix = tokens[idx].tolist() + for logprob, token in zip(*logprobs[idx].topk(self.beam_size + 1)): + new_logprob = (sum_logprobs[idx] + logprob).item() + sequence = tuple(prefix + [token.item()]) + 
scores[sequence] = new_logprob + sources[sequence] = idx + + # STEP 2: rank the candidates and keep the top beam_size sequences for each audio + saved = 0 + for sequence in sorted(scores, key=scores.get, reverse=True): + if sequence[-1] == self.eot: + finished[sequence] = scores[sequence] + else: + sum_logprobs[len(next_tokens)] = scores[sequence] + next_tokens.append(sequence) + source_indices.append(sources[sequence]) + + saved += 1 + if saved == self.beam_size: + break + + finished_sequences.append(finished) + + tokens = torch.tensor(next_tokens, device=tokens.device) + self.inference.rearrange_kv_cache(source_indices) + + # add newly finished sequences to self.finished_sequences + assert len(self.finished_sequences) == len(finished_sequences) + for previously_finished, newly_finished in zip(self.finished_sequences, finished_sequences): + for seq in sorted(newly_finished, key=newly_finished.get, reverse=True): + if len(previously_finished) >= self.max_candidates: + break # the candidate list is full + previously_finished[seq] = newly_finished[seq] + + # mark as completed if all audio has enough number of samples + completed = all( + len(sequences) >= self.max_candidates for sequences in self.finished_sequences + ) + return tokens, completed + + def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor): + # collect all finished sequences, including patience, and add unfinished ones if not enough + sum_logprobs = sum_logprobs.cpu() + for i, sequences in enumerate(self.finished_sequences): + if len(sequences) < self.beam_size: # when not enough sequences are finished + for j in list(np.argsort(sum_logprobs[i]))[::-1]: + sequence = preceding_tokens[i, j].tolist() + [self.eot] + sequences[tuple(sequence)] = sum_logprobs[i][j].item() + if len(sequences) >= self.beam_size: + break + + tokens: List[List[Tensor]] = [ + [torch.tensor(seq) for seq in sequences.keys()] for sequences in self.finished_sequences + ] + sum_logprobs: List[List[float]] = [ + 
list(sequences.values()) for sequences in self.finished_sequences + ] + return tokens, sum_logprobs + + +class LogitFilter: + def apply(self, logits: Tensor, tokens: Tensor) -> None: + """Apply any filtering or masking to logits in-place + + Parameters + ---------- + logits : Tensor, shape = (n_batch, vocab_size) + per-token logits of the probability distribution at the current step + + tokens : Tensor, shape = (n_batch, current_sequence_length) + all tokens in the context so far, including the prefix and sot_sequence tokens + + """ + raise NotImplementedError + + +class SuppressBlank(LogitFilter): + def __init__(self, tokenizer: Tokenizer, sample_begin: int): + self.tokenizer = tokenizer + self.sample_begin = sample_begin + + def apply(self, logits: Tensor, tokens: Tensor): + if tokens.shape[1] == self.sample_begin: + logits[:, self.tokenizer.encode(" ") + [self.tokenizer.eot]] = -np.inf + + +class SuppressTokens(LogitFilter): + def __init__(self, suppress_tokens: Sequence[int]): + self.suppress_tokens = list(suppress_tokens) + + def apply(self, logits: Tensor, tokens: Tensor): + logits[:, self.suppress_tokens] = -np.inf + + +class ApplyTimestampRules(LogitFilter): + def __init__( + self, tokenizer: Tokenizer, sample_begin: int, max_initial_timestamp_index: Optional[int] + ): + self.tokenizer = tokenizer + self.sample_begin = sample_begin + self.max_initial_timestamp_index = max_initial_timestamp_index + + def apply(self, logits: Tensor, tokens: Tensor): + # suppress <|notimestamps|> which is handled by without_timestamps + if self.tokenizer.no_timestamps is not None: + logits[:, self.tokenizer.no_timestamps] = -np.inf + + # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly + for k in range(tokens.shape[0]): + seq = [t for t in tokens[k, self.sample_begin :].tolist()] + last_was_timestamp = len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin + penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= 
self.tokenizer.timestamp_begin + + if last_was_timestamp: + if penultimate_was_timestamp: # has to be non-timestamp + logits[k, self.tokenizer.timestamp_begin :] = -np.inf + else: # cannot be normal text tokens + logits[k, : self.tokenizer.eot] = -np.inf + + if tokens.shape[1] == self.sample_begin: + # suppress generating non-timestamp tokens at the beginning + logits[:, : self.tokenizer.timestamp_begin] = -np.inf + + # apply the `max_initial_timestamp` option + if self.max_initial_timestamp_index is not None: + last_allowed = self.tokenizer.timestamp_begin + self.max_initial_timestamp_index + logits[:, last_allowed + 1 :] = -np.inf + + # if sum of probability over timestamps is above any other token, sample timestamp + logprobs = F.log_softmax(logits.float(), dim=-1) + for k in range(tokens.shape[0]): + timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin :].logsumexp(dim=-1) + max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max() + if timestamp_logprob > max_text_token_logprob: + logits[k, : self.tokenizer.timestamp_begin] = -np.inf + + +class DecodingTask: + inference: Inference + sequence_ranker: SequenceRanker + decoder: TokenDecoder + logit_filters: List[LogitFilter] + + def __init__(self, model: "Whisper", options: DecodingOptions): + self.model = model + + language = options.language or "en" + tokenizer = get_tokenizer(model.is_multilingual, language=language, task=options.task) + self.tokenizer: Tokenizer = tokenizer + self.options: DecodingOptions = self._verify_options(options) + + self.n_group: int = options.beam_size or options.best_of or 1 + self.n_ctx: int = model.dims.n_text_ctx + self.sample_len: int = options.sample_len or model.dims.n_text_ctx // 2 + + self.sot_sequence: Tuple[int] = tokenizer.sot_sequence + if self.options.without_timestamps: + self.sot_sequence = tokenizer.sot_sequence_including_notimestamps + + self.initial_tokens: Tuple[int] = self._get_initial_tokens() + self.sample_begin: int = 
len(self.initial_tokens) + self.sot_index: int = self.initial_tokens.index(tokenizer.sot) + + # inference: implements the forward pass through the decoder, including kv caching + self.inference = PyTorchInference(model, len(self.initial_tokens)) + + # sequence ranker: implements how to rank a group of sampled sequences + self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty) + + # decoder: implements how to select the next tokens, given the autoregressive distribution + if options.beam_size is not None: + self.decoder = BeamSearchDecoder( + options.beam_size, tokenizer.eot, self.inference, options.patience + ) + else: + self.decoder = GreedyDecoder(options.temperature, tokenizer.eot) + + # logit filters: applies various rules to suppress or penalize certain tokens + self.logit_filters = [] + if self.options.suppress_blank: + self.logit_filters.append(SuppressBlank(self.tokenizer, self.sample_begin)) + if self.options.suppress_tokens: + self.logit_filters.append(SuppressTokens(self._get_suppress_tokens())) + if not options.without_timestamps: + precision = CHUNK_LENGTH / model.dims.n_audio_ctx # usually 0.02 seconds + max_initial_timestamp_index = None + if options.max_initial_timestamp: + max_initial_timestamp_index = round(self.options.max_initial_timestamp / precision) + self.logit_filters.append( + ApplyTimestampRules(tokenizer, self.sample_begin, max_initial_timestamp_index) + ) + + def _verify_options(self, options: DecodingOptions) -> DecodingOptions: + if options.beam_size is not None and options.best_of is not None: + raise ValueError("beam_size and best_of can't be given together") + if options.temperature == 0: + if options.best_of is not None: + raise ValueError("best_of with greedy sampling (T=0) is not compatible") + if options.patience is not None and options.beam_size is None: + raise ValueError("patience requires beam_size to be given") + if options.length_penalty is not None and not (0 <= options.length_penalty <= 1): + raise 
ValueError("length_penalty (alpha) should be a value between 0 and 1") + + return options + + def _get_initial_tokens(self) -> Tuple[int]: + tokens = list(self.sot_sequence) + prefix = self.options.prefix + prompt = self.options.prompt + + if prefix: + prefix_tokens = ( + self.tokenizer.encode(" " + prefix.strip()) if isinstance(prefix, str) else prefix + ) + if self.sample_len is not None: + max_prefix_len = self.n_ctx // 2 - self.sample_len + prefix_tokens = prefix_tokens[-max_prefix_len:] + tokens = tokens + prefix_tokens + + if prompt: + prompt_tokens = ( + self.tokenizer.encode(" " + prompt.strip()) if isinstance(prompt, str) else prompt + ) + tokens = [self.tokenizer.sot_prev] + prompt_tokens[-(self.n_ctx // 2 - 1) :] + tokens + + return tuple(tokens) + + def _get_suppress_tokens(self) -> Tuple[int]: + suppress_tokens = self.options.suppress_tokens + + if isinstance(suppress_tokens, str): + suppress_tokens = [int(t) for t in suppress_tokens.split(",")] + + if -1 in suppress_tokens: + suppress_tokens = [t for t in suppress_tokens if t >= 0] + suppress_tokens.extend(self.tokenizer.non_speech_tokens) + elif suppress_tokens is None or len(suppress_tokens) == 0: + suppress_tokens = [] # interpret empty string as an empty list + else: + assert isinstance(suppress_tokens, list), "suppress_tokens must be a list" + + suppress_tokens.extend( + [self.tokenizer.sot, self.tokenizer.sot_prev, self.tokenizer.sot_lm] + ) + if self.tokenizer.no_speech is not None: + # no-speech probability is collected separately + suppress_tokens.append(self.tokenizer.no_speech) + + return tuple(sorted(set(suppress_tokens))) + + def _get_audio_features(self, mel: Tensor): + if self.options.fp16: + mel = mel.half() + + if mel.shape[-2:] == (self.model.dims.n_audio_ctx, self.model.dims.n_audio_state): + # encoded audio features are given; skip audio encoding + print("encoded audio features are given; skip audio encoding") + audio_features = mel + else: + print(mel.shape) + 
print("===============================")
+            audio_features = self.model.encoder(mel)
+
+        # NOTE(review): BUG — the TypeError below is *returned*, not raised, so
+        # on a dtype mismatch callers receive an exception object in place of
+        # the audio features; this should almost certainly be `raise TypeError(...)`.
+        # The two print() calls above also look like leftover debug output.
+        if audio_features.dtype != (torch.float16 if self.options.fp16 else torch.float32):
+            return TypeError(f"audio_features has an incorrect dtype: {audio_features.dtype}")
+
+        return audio_features
+
+    def _detect_language(self, audio_features: Tensor, tokens: Tensor):
+        # Resolve one language per audio item: use the explicitly requested
+        # language when given; otherwise run the model's language detector and
+        # (if no language was requested) write the detected language token into
+        # the decoding context right after the SOT token.
+        languages = [self.options.language] * audio_features.shape[0]
+        lang_probs = None
+
+        if self.options.language is None or self.options.task == "lang_id":
+            lang_tokens, lang_probs = self.model.detect_language(audio_features, self.tokenizer)
+            languages = [max(probs, key=probs.get) for probs in lang_probs]
+            if self.options.language is None:
+                tokens[:, self.sot_index + 1] = lang_tokens  # write language tokens
+
+        return languages, lang_probs
+
+    def _main_loop(self, audio_features: Tensor, tokens: Tensor):
+        # Autoregressive sampling loop: repeatedly query the decoder, filter the
+        # logits, and let the token decoder extend each sequence until every
+        # sequence finishes or the text context length is exhausted. The kv-cache
+        # hooks are always torn down in the `finally` block below.
+        assert audio_features.shape[0] == tokens.shape[0]
+        n_batch = tokens.shape[0]
+        sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device)
+        no_speech_probs = [np.nan] * n_batch
+
+        try:
+            for i in range(self.sample_len):
+                logits = self.inference.logits(tokens, audio_features)
+
+                if i == 0 and self.tokenizer.no_speech is not None:  # save no_speech_probs
+                    probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
+                    no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()
+
+                # now we need to consider the logits at the last token only
+                logits = logits[:, -1]
+
+                # apply the logit filters, e.g. 
for suppressing or applying penalty to + for logit_filter in self.logit_filters: + logit_filter.apply(logits, tokens) + + # expand the tokens tensor with the selected next tokens + tokens, completed = self.decoder.update(tokens, logits, sum_logprobs) + + if completed or tokens.shape[-1] > self.n_ctx: + break + finally: + self.inference.cleanup_caching() + + return tokens, sum_logprobs, no_speech_probs + + @torch.no_grad() + def run(self, mel: Tensor) -> List[DecodingResult]: + self.decoder.reset() + tokenizer: Tokenizer = self.tokenizer + n_audio: int = mel.shape[0] + + audio_features: Tensor = self._get_audio_features(mel) # encoder forward pass + tokens: Tensor = torch.tensor([self.initial_tokens]).repeat(n_audio, 1) + + # detect language if requested, overwriting the language token + languages, language_probs = self._detect_language(audio_features, tokens) + if self.options.task == "lang_id": + return [ + DecodingResult(audio_features=features, language=language, language_probs=probs) + for features, language, probs in zip(audio_features, languages, language_probs) + ] + + # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling + audio_features = audio_features.repeat_interleave(self.n_group, dim=0) + tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device) + + # call the main sampling loop + tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens) + + # reshape the tensors to have (n_audio, n_group) as the first two dimensions + audio_features = audio_features[:: self.n_group] + no_speech_probs = no_speech_probs[:: self.n_group] + assert audio_features.shape[0] == len(no_speech_probs) == n_audio + + tokens = tokens.reshape(n_audio, self.n_group, -1) + sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group) + + # get the final candidates for each group, and slice between the first sampled token and EOT + tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs) + 
tokens: List[List[Tensor]] = [ + [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s] for s in tokens + ] + + # select the top-ranked sample in each group + selected = self.sequence_ranker.rank(tokens, sum_logprobs) + tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)] + texts: List[str] = [tokenizer.decode(t).strip() for t in tokens] + + sum_logprobs: List[float] = [lp[i] for i, lp in zip(selected, sum_logprobs)] + avg_logprobs: List[float] = [lp / (len(t) + 1) for t, lp in zip(tokens, sum_logprobs)] + + fields = (texts, languages, tokens, audio_features, avg_logprobs, no_speech_probs) + if len(set(map(len, fields))) != 1: + raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}") + + return [ + DecodingResult( + audio_features=features, + language=language, + tokens=tokens, + text=text, + avg_logprob=avg_logprob, + no_speech_prob=no_speech_prob, + temperature=self.options.temperature, + compression_ratio=compression_ratio(text), + ) + for text, language, tokens, features, avg_logprob, no_speech_prob in zip(*fields) + ] + + +@torch.no_grad() +def decode(model: "Whisper", mel: Tensor, options: DecodingOptions = DecodingOptions()) -> Union[DecodingResult, List[DecodingResult]]: + """ + Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s). 
+ + Parameters + ---------- + model: Whisper + the Whisper model instance + + mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000) + A tensor containing the Mel spectrogram(s) + + options: DecodingOptions + A dataclass that contains all necessary options for decoding 30-second segments + + Returns + ------- + result: Union[DecodingResult, List[DecodingResult]] + The result(s) of decoding contained in `DecodingResult` dataclass instance(s) + """ + single = mel.ndim == 2 + if single: + mel = mel.unsqueeze(0) + result = DecodingTask(model, options).run(mel) + + if single: + result = result[0] + + return result diff --git a/whisper/inference.py b/whisper/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..f600f46f9337243d9636aa2888dd30f0bdd12328 --- /dev/null +++ b/whisper/inference.py @@ -0,0 +1,72 @@ +import sys,os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import numpy as np +import argparse +import torch + +from whisper.model import Whisper, ModelDimensions +from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram + + +def load_model(path) -> Whisper: + device = "cuda" if torch.cuda.is_available() else "cpu" + checkpoint = torch.load(path, map_location="cpu") + dims = ModelDimensions(**checkpoint["dims"]) + # print(dims) + model = Whisper(dims) + del model.decoder + cut = len(model.encoder.blocks) // 4 + cut = -1 * cut + del model.encoder.blocks[cut:] + model.load_state_dict(checkpoint["model_state_dict"], strict=False) + model.eval() + model.half() + model.to(device) + # torch.save({ + # 'dims': checkpoint["dims"], + # 'model_state_dict': model.state_dict(), + # }, "large-v2.pt") + return model + + +def pred_ppg(whisper: Whisper, wavPath, ppgPath): + audio = load_audio(wavPath) + audln = audio.shape[0] + ppg_a = [] + idx_s = 0 + while (idx_s + 15 * 16000 < audln): + short = audio[idx_s:idx_s + 15 * 16000] + idx_s = idx_s + 15 * 16000 + ppgln = 15 * 16000 // 320 + # short = 
pad_or_trim(short) + mel = log_mel_spectrogram(short).half().to(whisper.device) + with torch.no_grad(): + ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy() + ppg = ppg[:ppgln,] # [length, dim=1024] + ppg_a.extend(ppg) + if (idx_s < audln): + short = audio[idx_s:audln] + ppgln = (audln - idx_s) // 320 + # short = pad_or_trim(short) + mel = log_mel_spectrogram(short).half().to(whisper.device) + with torch.no_grad(): + ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy() + ppg = ppg[:ppgln,] # [length, dim=1024] + ppg_a.extend(ppg) + np.save(ppgPath, ppg_a, allow_pickle=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.description = 'please enter embed parameter ...' + parser.add_argument("-w", "--wav", help="wav", dest="wav") + parser.add_argument("-p", "--ppg", help="ppg", dest="ppg") + args = parser.parse_args() + print(args.wav) + print(args.ppg) + + wavPath = args.wav + ppgPath = args.ppg + + whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt")) + pred_ppg(whisper, wavPath, ppgPath) diff --git a/whisper/model.py b/whisper/model.py new file mode 100644 index 0000000000000000000000000000000000000000..78d6d135ee8df21e4deb0fffa5d15e5631f960c5 --- /dev/null +++ b/whisper/model.py @@ -0,0 +1,270 @@ +from dataclasses import dataclass +from typing import Dict +from typing import Iterable, Optional + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor +from torch import nn + +from .decoding import detect_language as detect_language_function, decode as decode_function + + +@dataclass +class ModelDimensions: + n_mels: int + n_audio_ctx: int + n_audio_state: int + n_audio_head: int + n_audio_layer: int + n_vocab: int + n_text_ctx: int + n_text_state: int + n_text_head: int + n_text_layer: int + + +class LayerNorm(nn.LayerNorm): + def forward(self, x: Tensor) -> Tensor: + # return super().forward(x.float()).type(x.dtype) sovits5.0 + 
return super().forward(x).type(x.dtype) + + +class Linear(nn.Linear): + def forward(self, x: Tensor) -> Tensor: + return F.linear( + x, self.weight.to(x.dtype), None if self.bias is None else self.bias.to(x.dtype) + ) + + +class Conv1d(nn.Conv1d): + def _conv_forward(self, x: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor: + return super()._conv_forward( + x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype) + ) + + +def sinusoids(length, channels, max_timescale=10000): + """Returns sinusoids for positional embedding""" + assert channels % 2 == 0 + log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) + inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2)) + scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :] + return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1) + + +class MultiHeadAttention(nn.Module): + def __init__(self, n_state: int, n_head: int): + super().__init__() + self.n_head = n_head + self.query = Linear(n_state, n_state) + self.key = Linear(n_state, n_state, bias=False) + self.value = Linear(n_state, n_state) + self.out = Linear(n_state, n_state) + + def forward( + self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + q = self.query(x) + + if kv_cache is None or xa is None or self.key not in kv_cache: + # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors; + # otherwise, perform key/value projections for self- or cross-attention as usual. + k = self.key(x if xa is None else xa) + v = self.value(x if xa is None else xa) + else: + # for cross-attention, calculate keys and values once and reuse in subsequent calls. 
+ k = kv_cache[self.key] + v = kv_cache[self.value] + + wv, qk = self.qkv_attention(q, k, v, mask) + return self.out(wv), qk + + def qkv_attention(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None): + n_batch, n_ctx, n_state = q.shape + scale = (n_state // self.n_head) ** -0.25 + q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale + k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale + v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + + qk = q @ k + if mask is not None: + qk = qk + mask[:n_ctx, :n_ctx] + qk = qk.float() + + w = F.softmax(qk, dim=-1).to(q.dtype) + return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach() + + +class ResidualAttentionBlock(nn.Module): + def __init__(self, n_state: int, n_head: int, cross_attention: bool = False): + super().__init__() + + self.attn = MultiHeadAttention(n_state, n_head) + self.attn_ln = LayerNorm(n_state) + + self.cross_attn = MultiHeadAttention(n_state, n_head) if cross_attention else None + self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None + + n_mlp = n_state * 4 + self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)) + self.mlp_ln = LayerNorm(n_state) + + def forward( + self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0] + if self.cross_attn: + x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)[0] + x = x + self.mlp(self.mlp_ln(x)) + return x + + +class AudioEncoder(nn.Module): + def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): + super().__init__() + self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1) + self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1) + self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state)) + + self.blocks: 
Iterable[ResidualAttentionBlock] = nn.ModuleList( + [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)] + ) + self.ln_post = LayerNorm(n_state) + + def forward(self, x: Tensor): + """ + x : torch.Tensor, shape = (batch_size, n_mels, n_ctx) + the mel spectrogram of the audio + """ + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = x.permute(0, 2, 1) + + len_x = x.shape[1] + len_e = self.positional_embedding.shape[0] + assert len_x <= len_e, "incorrect audio shape" + pos_e = self.positional_embedding[:len_x, :] + x = (x + pos_e).to(x.dtype) + + for block in self.blocks: + x = block(x) + + x = self.ln_post(x) + return x + + +class TextDecoder(nn.Module): + def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): + super().__init__() + + self.token_embedding = nn.Embedding(n_vocab, n_state) + self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state)) + + self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList( + [ResidualAttentionBlock(n_state, n_head, cross_attention=True) for _ in range(n_layer)] + ) + self.ln = LayerNorm(n_state) + + mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1) + self.register_buffer("mask", mask, persistent=False) + + def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None): + """ + x : torch.LongTensor, shape = (batch_size, <= n_ctx) + the text tokens + xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx) + the encoded audio features to be attended on + """ + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = self.token_embedding(x) + self.positional_embedding[offset : offset + x.shape[-1]] + x = x.to(xa.dtype) + + for block in self.blocks: + x = block(x, xa, mask=self.mask, kv_cache=kv_cache) + + x = self.ln(x) + logits = (x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)).float() + + return logits + + +class Whisper(nn.Module): + def __init__(self, dims: ModelDimensions): + super().__init__() + 
        # Keep the full dimension record so is_multilingual / checkpoint
        # round-trips (see inference.load_model) can read it back.
        self.dims = dims
        self.encoder = AudioEncoder(
            self.dims.n_mels,
            self.dims.n_audio_ctx,
            self.dims.n_audio_state,
            self.dims.n_audio_head,
            self.dims.n_audio_layer,
        )
        self.decoder = TextDecoder(
            self.dims.n_vocab,
            self.dims.n_text_ctx,
            self.dims.n_text_state,
            self.dims.n_text_head,
            self.dims.n_text_layer,
        )

    def embed_audio(self, mel: torch.Tensor) -> torch.Tensor:
        """Encode a mel spectrogram into audio features via the encoder."""
        return self.encoder(mel)

    def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor) -> torch.Tensor:
        """Decode token logits given precomputed audio features."""
        return self.decoder(tokens, audio_features)

    def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> torch.Tensor:
        # NOTE(review): the original annotated this as Dict[str, torch.Tensor],
        # but TextDecoder.forward returns the logits tensor directly; the
        # annotation is corrected here without changing behavior.
        return self.decoder(tokens, self.encoder(mel))

    @property
    def device(self):
        # Device of the first parameter; assumes the whole model lives on one device.
        return next(self.parameters()).device

    @property
    def is_multilingual(self):
        # 51865 is the multilingual vocab size; English-only checkpoints differ.
        return self.dims.n_vocab == 51865

    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
        """
        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
        tensors calculated for the previous positions. This method returns a dictionary that stores
        all caches, and the necessary hooks for the key and value projection modules that save the
        intermediate tensors to be reused during later calculations.
+ + Returns + ------- + cache : Dict[nn.Module, torch.Tensor] + A dictionary object mapping the key/value projection modules to its cache + hooks : List[RemovableHandle] + List of PyTorch RemovableHandle objects to stop the hooks to be called + """ + cache = {**cache} if cache is not None else {} + hooks = [] + + def save_to_cache(module, _, output): + if module not in cache or output.shape[1] > self.decoder.positional_embedding.shape[0]: + cache[module] = output # save as-is, for the first token or cross attention + else: + cache[module] = torch.cat([cache[module], output], dim=1).detach() + return cache[module] + + def install_hooks(layer: nn.Module): + if isinstance(layer, MultiHeadAttention): + hooks.append(layer.key.register_forward_hook(save_to_cache)) + hooks.append(layer.value.register_forward_hook(save_to_cache)) + + self.decoder.apply(install_hooks) + return cache, hooks + + detect_language = detect_language_function + decode = decode_function diff --git a/whisper/tokenizer.py b/whisper/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..a27cb359ee891590d3f793624f9f8ec768a26cc3 --- /dev/null +++ b/whisper/tokenizer.py @@ -0,0 +1,331 @@ +import os +from dataclasses import dataclass +from functools import lru_cache +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from transformers import GPT2TokenizerFast + +LANGUAGES = { + "en": "english", + "zh": "chinese", + "de": "german", + "es": "spanish", + "ru": "russian", + "ko": "korean", + "fr": "french", + "ja": "japanese", + "pt": "portuguese", + "tr": "turkish", + "pl": "polish", + "ca": "catalan", + "nl": "dutch", + "ar": "arabic", + "sv": "swedish", + "it": "italian", + "id": "indonesian", + "hi": "hindi", + "fi": "finnish", + "vi": "vietnamese", + "he": "hebrew", + "uk": "ukrainian", + "el": "greek", + "ms": "malay", + "cs": "czech", + "ro": "romanian", + "da": "danish", + "hu": "hungarian", + "ta": "tamil", + "no": "norwegian", + "th": 
"thai", + "ur": "urdu", + "hr": "croatian", + "bg": "bulgarian", + "lt": "lithuanian", + "la": "latin", + "mi": "maori", + "ml": "malayalam", + "cy": "welsh", + "sk": "slovak", + "te": "telugu", + "fa": "persian", + "lv": "latvian", + "bn": "bengali", + "sr": "serbian", + "az": "azerbaijani", + "sl": "slovenian", + "kn": "kannada", + "et": "estonian", + "mk": "macedonian", + "br": "breton", + "eu": "basque", + "is": "icelandic", + "hy": "armenian", + "ne": "nepali", + "mn": "mongolian", + "bs": "bosnian", + "kk": "kazakh", + "sq": "albanian", + "sw": "swahili", + "gl": "galician", + "mr": "marathi", + "pa": "punjabi", + "si": "sinhala", + "km": "khmer", + "sn": "shona", + "yo": "yoruba", + "so": "somali", + "af": "afrikaans", + "oc": "occitan", + "ka": "georgian", + "be": "belarusian", + "tg": "tajik", + "sd": "sindhi", + "gu": "gujarati", + "am": "amharic", + "yi": "yiddish", + "lo": "lao", + "uz": "uzbek", + "fo": "faroese", + "ht": "haitian creole", + "ps": "pashto", + "tk": "turkmen", + "nn": "nynorsk", + "mt": "maltese", + "sa": "sanskrit", + "lb": "luxembourgish", + "my": "myanmar", + "bo": "tibetan", + "tl": "tagalog", + "mg": "malagasy", + "as": "assamese", + "tt": "tatar", + "haw": "hawaiian", + "ln": "lingala", + "ha": "hausa", + "ba": "bashkir", + "jw": "javanese", + "su": "sundanese", +} + +# language code lookup by name, with a few language aliases +TO_LANGUAGE_CODE = { + **{language: code for code, language in LANGUAGES.items()}, + "burmese": "my", + "valencian": "ca", + "flemish": "nl", + "haitian": "ht", + "letzeburgesch": "lb", + "pushto": "ps", + "panjabi": "pa", + "moldavian": "ro", + "moldovan": "ro", + "sinhalese": "si", + "castilian": "es", +} + + +@dataclass(frozen=True) +class Tokenizer: + """A thin wrapper around `GPT2TokenizerFast` providing quick access to special tokens""" + + tokenizer: "GPT2TokenizerFast" + language: Optional[str] + sot_sequence: Tuple[int] + + def encode(self, text, **kwargs): + return self.tokenizer.encode(text, 
**kwargs) + + def decode(self, token_ids: Union[int, List[int], np.ndarray, torch.Tensor], **kwargs): + return self.tokenizer.decode(token_ids, **kwargs) + + def decode_with_timestamps(self, tokens) -> str: + """ + Timestamp tokens are above the special tokens' id range and are ignored by `decode()`. + This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>". + """ + outputs = [[]] + for token in tokens: + if token >= self.timestamp_begin: + timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" + outputs.append(timestamp) + outputs.append([]) + else: + outputs[-1].append(token) + outputs = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs] + return "".join(outputs) + + @property + @lru_cache() + def eot(self) -> int: + return self.tokenizer.eos_token_id + + @property + @lru_cache() + def sot(self) -> int: + return self._get_single_token_id("<|startoftranscript|>") + + @property + @lru_cache() + def sot_lm(self) -> int: + return self._get_single_token_id("<|startoflm|>") + + @property + @lru_cache() + def sot_prev(self) -> int: + return self._get_single_token_id("<|startofprev|>") + + @property + @lru_cache() + def no_speech(self) -> int: + return self._get_single_token_id("<|nospeech|>") + + @property + @lru_cache() + def no_timestamps(self) -> int: + return self._get_single_token_id("<|notimestamps|>") + + @property + @lru_cache() + def timestamp_begin(self) -> int: + return self.tokenizer.all_special_ids[-1] + 1 + + @property + @lru_cache() + def language_token(self) -> int: + """Returns the token id corresponding to the value of the `language` field""" + if self.language is None: + raise ValueError(f"This tokenizer does not have language token configured") + + additional_tokens = dict( + zip( + self.tokenizer.additional_special_tokens, + self.tokenizer.additional_special_tokens_ids, + ) + ) + candidate = f"<|{self.language}|>" + if candidate in additional_tokens: + return additional_tokens[candidate] + + 
        # Unreachable for languages present in the tokenizer's special tokens;
        # reached only if `language` was set to a code the tokenizer lacks.
        raise KeyError(f"Language {self.language} not found in tokenizer.")

    @property
    @lru_cache()
    def all_language_tokens(self) -> Tuple[int]:
        # Token ids of all "<|xx|>" language specials recognized by LANGUAGES.
        result = []
        for token, token_id in zip(
            self.tokenizer.additional_special_tokens,
            self.tokenizer.additional_special_tokens_ids,
        ):
            if token.strip("<|>") in LANGUAGES:
                result.append(token_id)
        return tuple(result)

    @property
    @lru_cache()
    def all_language_codes(self) -> Tuple[str]:
        # Decode each language token back to its two-letter code ("<|en|>" -> "en").
        return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens)

    @property
    @lru_cache()
    def sot_sequence_including_notimestamps(self) -> Tuple[int]:
        return tuple(list(self.sot_sequence) + [self.no_timestamps])

    @property
    @lru_cache()
    def non_speech_tokens(self) -> Tuple[int]:
        """
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        """
        symbols = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』")
        symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()

        # symbols that may be a single token or multiple tokens depending on the tokenizer.
        # In case they're multiple tokens, suppress the first token, which is safe because:
        # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
        # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
        miscellaneous = set("♩♪♫♬♭♮♯")
        assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)

        # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
        result = {self.tokenizer.encode(" -")[0], self.tokenizer.encode(" '")[0]}
        for symbol in symbols + list(miscellaneous):
            for tokens in [self.tokenizer.encode(symbol), self.tokenizer.encode(" " + symbol)]:
                if len(tokens) == 1 or symbol in miscellaneous:
                    result.add(tokens[0])

        return tuple(sorted(result))

    def _get_single_token_id(self, text) -> int:
        # Helper for special-token lookups; the token must encode to exactly one id.
        tokens = self.tokenizer.encode(text)
        assert len(tokens) == 1, f"{text} is not encoded as a single token"
        return tokens[0]


@lru_cache(maxsize=None)
def build_tokenizer(name: str = "gpt2"):
    """Load a GPT2TokenizerFast from the bundled assets directory and register
    Whisper's special tokens ("<|startoftranscript|>", language tags, task tags, ...).

    Cached per `name`, so the tokenizer assets are read from disk at most once.
    """
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    path = os.path.join(os.path.dirname(__file__), "assets", name)
    tokenizer = GPT2TokenizerFast.from_pretrained(path)

    specials = [
        "<|startoftranscript|>",
        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
    ]

    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
    return tokenizer


@lru_cache(maxsize=None)
def get_tokenizer(
    multilingual: bool,
    *,
    task: Optional[str] = None,  # Literal["transcribe", "translate", None]
    language: Optional[str] = None,
) -> Tokenizer:
    # Normalize language: accept either a code ("en") or a name/alias ("english").
    if language is not None:
        language = language.lower()
        if language not in LANGUAGES:
            if language in TO_LANGUAGE_CODE:
                language = TO_LANGUAGE_CODE[language]
            else:
                raise ValueError(f"Unsupported language: {language}")

    if multilingual:
        tokenizer_name = "multilingual"
        task = task or "transcribe"
        language = language or "en"
    else:
        # English-only model: task/language tokens do not exist in this vocab.
        tokenizer_name = "gpt2"
        task = None
        language = None

    tokenizer = build_tokenizer(name=tokenizer_name)
    all_special_ids: List[int] = tokenizer.all_special_ids
    # Index 1 is "<|startoftranscript|>"; assumes the special-token layout
    # produced by build_tokenizer above — TODO confirm if assets change.
    sot: int = all_special_ids[1]
translate: int = all_special_ids[-6] + transcribe: int = all_special_ids[-5] + + langs = tuple(LANGUAGES.keys()) + sot_sequence = [sot] + if language is not None: + sot_sequence.append(sot + 1 + langs.index(language)) + if task is not None: + sot_sequence.append(transcribe if task == "transcribe" else translate) + + return Tokenizer(tokenizer=tokenizer, language=language, sot_sequence=tuple(sot_sequence)) diff --git a/whisper/utils.py b/whisper/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5dacc173c40bcd6e999d728862e29a968000b12e --- /dev/null +++ b/whisper/utils.py @@ -0,0 +1,163 @@ +import json +import os +import sys +import zlib +from typing import Callable, TextIO + +system_encoding = sys.getdefaultencoding() + +if system_encoding != "utf-8": + def make_safe(string): + # replaces any character not representable using the system default encoding with an '?', + # avoiding UnicodeEncodeError (https://github.com/openai/whisper/discussions/729). + return string.encode(system_encoding, errors="replace").decode(system_encoding) +else: + def make_safe(string): + # utf-8 can encode any Unicode code point, so no need to do the round-trip encoding + return string + + +def exact_div(x, y): + assert x % y == 0 + return x // y + + +def str2bool(string): + str2val = {"True": True, "False": False} + if string in str2val: + return str2val[string] + else: + raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}") + + +def optional_int(string): + return None if string == "None" else int(string) + + +def optional_float(string): + return None if string == "None" else float(string) + + +def compression_ratio(text) -> float: + text_bytes = text.encode("utf-8") + return len(text_bytes) / len(zlib.compress(text_bytes)) + + +def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'): + assert seconds >= 0, "non-negative timestamp expected" + milliseconds = round(seconds * 1000.0) + + hours = 
milliseconds // 3_600_000 + milliseconds -= hours * 3_600_000 + + minutes = milliseconds // 60_000 + milliseconds -= minutes * 60_000 + + seconds = milliseconds // 1_000 + milliseconds -= seconds * 1_000 + + hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else "" + return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}" + + +class ResultWriter: + extension: str + + def __init__(self, output_dir: str): + self.output_dir = output_dir + + def __call__(self, result: dict, audio_path: str): + audio_basename = os.path.basename(audio_path) + output_path = os.path.join(self.output_dir, audio_basename + "." + self.extension) + + with open(output_path, "w", encoding="utf-8") as f: + self.write_result(result, file=f) + + def write_result(self, result: dict, file: TextIO): + raise NotImplementedError + + +class WriteTXT(ResultWriter): + extension: str = "txt" + + def write_result(self, result: dict, file: TextIO): + for segment in result["segments"]: + print(segment['text'].strip(), file=file, flush=True) + + +class WriteVTT(ResultWriter): + extension: str = "vtt" + + def write_result(self, result: dict, file: TextIO): + print("WEBVTT\n", file=file) + for segment in result["segments"]: + print( + f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n" + f"{segment['text'].strip().replace('-->', '->')}\n", + file=file, + flush=True, + ) + + +class WriteSRT(ResultWriter): + extension: str = "srt" + + def write_result(self, result: dict, file: TextIO): + for i, segment in enumerate(result["segments"], start=1): + # write srt lines + print( + f"{i}\n" + f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> " + f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n" + f"{segment['text'].strip().replace('-->', '->')}\n", + file=file, + flush=True, + ) + + +class WriteTSV(ResultWriter): + """ + Write a transcript to a file in TSV (tab-separated 
values) format containing lines like: + \t\t + + Using integer milliseconds as start and end times means there's no chance of interference from + an environment setting a language encoding that causes the decimal in a floating point number + to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++. + """ + extension: str = "tsv" + + def write_result(self, result: dict, file: TextIO): + print("start", "end", "text", sep="\t", file=file) + for segment in result["segments"]: + print(round(1000 * segment['start']), file=file, end="\t") + print(round(1000 * segment['end']), file=file, end="\t") + print(segment['text'].strip().replace("\t", " "), file=file, flush=True) + + +class WriteJSON(ResultWriter): + extension: str = "json" + + def write_result(self, result: dict, file: TextIO): + json.dump(result, file) + + +def get_writer(output_format: str, output_dir: str) -> Callable[[dict, TextIO], None]: + writers = { + "txt": WriteTXT, + "vtt": WriteVTT, + "srt": WriteSRT, + "tsv": WriteTSV, + "json": WriteJSON, + } + + if output_format == "all": + all_writers = [writer(output_dir) for writer in writers.values()] + + def write_all(result: dict, file: TextIO): + for writer in all_writers: + writer(result, file) + + return write_all + + return writers[output_format](output_dir) + diff --git a/zipit.ps1 b/zipit.ps1 new file mode 100644 index 0000000000000000000000000000000000000000..c68923af5fc71cebeb08b4cfae52458df0fe4f71 --- /dev/null +++ b/zipit.ps1 @@ -0,0 +1,12 @@ +Invoke-Expression "runtime/python.exe -m pip uninstall -y transformers" +Invoke-Expression "runtime/python.exe -m pip uninstall -y tensorboard" + +$Date = Get-Date -Format "yyyyMMdd" +$ArchiveName = "so-vits-svc-5.0-$Date.zip" + +$ExcludeItems = @("__pycache__/", "*_pretrain/", "chkpt/", "data_svc/", "dataset_raw/", "files/", "logs/", "*git/") +$ExcludeArgs = $ExcludeItems | ForEach-Object { "-xr!$_" } + +$Command = "7za a $ArchiveName $ExcludeArgs ./" + +Invoke-Expression $Command 
\ No newline at end of file