Upload 159 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +37 -0
- .gitignore +2 -0
- .project-root +0 -0
- README.md +12 -0
- app.py +52 -0
- checkpoints/fish-speech-1.4-sft-yth-lora/config.json +21 -0
- checkpoints/fish-speech-1.4-sft-yth-lora/model.pth +3 -0
- checkpoints/fish-speech-1.4-sft-yth-lora/special_tokens_map.json +23 -0
- checkpoints/fish-speech-1.4-sft-yth-lora/tokenizer.json +0 -0
- checkpoints/fish-speech-1.4-sft-yth-lora/tokenizer_config.json +82 -0
- checkpoints/fish-speech-1.4/README.md +61 -0
- checkpoints/fish-speech-1.4/config.json +21 -0
- checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth +3 -0
- checkpoints/fish-speech-1.4/model.pth +3 -0
- checkpoints/fish-speech-1.4/special_tokens_map.json +23 -0
- checkpoints/fish-speech-1.4/tokenizer.json +0 -0
- checkpoints/fish-speech-1.4/tokenizer_config.json +82 -0
- configs/base.yaml +87 -0
- configs/firefly_gan_vq.yaml +33 -0
- configs/lora/r_8_alpha_16.yaml +4 -0
- configs/text2semantic_finetune.yaml +83 -0
- examples/40_matthew-001-01.lab +1 -0
- examples/40_matthew-001-01.wav +3 -0
- fish_speech/callbacks/__init__.py +3 -0
- fish_speech/callbacks/__pycache__/__init__.cpython-310.pyc +0 -0
- fish_speech/callbacks/__pycache__/grad_norm.cpython-310.pyc +0 -0
- fish_speech/callbacks/grad_norm.py +113 -0
- fish_speech/configs/base.yaml +87 -0
- fish_speech/configs/firefly_gan_vq.yaml +33 -0
- fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- fish_speech/configs/text2semantic_finetune.yaml +83 -0
- fish_speech/conversation.py +2 -0
- fish_speech/datasets/__pycache__/semantic.cpython-310.pyc +0 -0
- fish_speech/datasets/concat_repeat.py +53 -0
- fish_speech/datasets/protos/__pycache__/text_data_pb2.cpython-310.pyc +0 -0
- fish_speech/datasets/protos/__pycache__/text_data_stream.cpython-310.pyc +0 -0
- fish_speech/datasets/protos/text-data.proto +24 -0
- fish_speech/datasets/protos/text_data_pb2.py +33 -0
- fish_speech/datasets/protos/text_data_stream.py +36 -0
- fish_speech/datasets/semantic.py +496 -0
- fish_speech/datasets/vqgan.py +147 -0
- fish_speech/i18n/README.md +27 -0
- fish_speech/i18n/__init__.py +3 -0
- fish_speech/i18n/__pycache__/__init__.cpython-310.pyc +0 -0
- fish_speech/i18n/__pycache__/core.cpython-310.pyc +0 -0
- fish_speech/i18n/core.py +40 -0
- fish_speech/i18n/locale/en_US.json +122 -0
- fish_speech/i18n/locale/es_ES.json +122 -0
- fish_speech/i18n/locale/ja_JP.json +123 -0
- fish_speech/i18n/locale/pt_BR.json +133 -0
.gitattributes
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
excemple/40_matthew-001-01.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
examples/40_matthew-001-01.wav filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
checkpoints
|
.project-root
ADDED
|
File without changes
|
README.md
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Demo
|
| 3 |
+
emoji: 👁
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.0.1
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import subprocess
|
| 3 |
+
import torch
|
| 4 |
+
import spaces
|
| 5 |
+
from transformers import AutoTokenizer,AutoModelForCausalLM
|
| 6 |
+
|
| 7 |
+
@torch.no_grad()
|
| 8 |
+
def main():
|
| 9 |
+
# 设置命令行参数解析器
|
| 10 |
+
parser = argparse.ArgumentParser(description="启动 WebUI")
|
| 11 |
+
parser.add_argument(
|
| 12 |
+
"--llama-checkpoint-path",
|
| 13 |
+
type=str,
|
| 14 |
+
default="checkpoints/fish-speech-1.4-sft-yth-lora",
|
| 15 |
+
help="Llama 检查点路径",
|
| 16 |
+
)
|
| 17 |
+
parser.add_argument(
|
| 18 |
+
"--decoder-checkpoint-path",
|
| 19 |
+
type=str,
|
| 20 |
+
default="checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
|
| 21 |
+
help="解码器检查点路径",
|
| 22 |
+
)
|
| 23 |
+
parser.add_argument(
|
| 24 |
+
"--decoder-config-name",
|
| 25 |
+
type=str,
|
| 26 |
+
default="firefly_gan_vq",
|
| 27 |
+
help="解码器配置名称",
|
| 28 |
+
)
|
| 29 |
+
parser.add_argument(
|
| 30 |
+
"--device",
|
| 31 |
+
type=str,
|
| 32 |
+
default="cpu",
|
| 33 |
+
help="设备类型",
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
# 解析命令行参数
|
| 37 |
+
args = parser.parse_args()
|
| 38 |
+
|
| 39 |
+
# 启动 WebUI
|
| 40 |
+
subprocess.run([
|
| 41 |
+
"python",
|
| 42 |
+
"tools/webui.py",
|
| 43 |
+
"--llama-checkpoint-path", args.llama_checkpoint_path,
|
| 44 |
+
"--decoder-checkpoint-path", args.decoder_checkpoint_path,
|
| 45 |
+
"--decoder-config-name", args.decoder_config_name,
|
| 46 |
+
"--device", args.device,
|
| 47 |
+
])
|
| 48 |
+
|
| 49 |
+
if __name__ == "__main__":
|
| 50 |
+
|
| 51 |
+
main()
|
| 52 |
+
|
checkpoints/fish-speech-1.4-sft-yth-lora/config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_qkv_bias": false,
|
| 3 |
+
"codebook_size": 1024,
|
| 4 |
+
"dim": 1024,
|
| 5 |
+
"dropout": 0.1,
|
| 6 |
+
"head_dim": 64,
|
| 7 |
+
"initializer_range": 0.02,
|
| 8 |
+
"intermediate_size": 4096,
|
| 9 |
+
"max_seq_len": 4096,
|
| 10 |
+
"model_type": "dual_ar",
|
| 11 |
+
"n_fast_layer": 4,
|
| 12 |
+
"n_head": 16,
|
| 13 |
+
"n_layer": 24,
|
| 14 |
+
"n_local_heads": 2,
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_codebooks": 8,
|
| 17 |
+
"rope_base": 1000000.0,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_gradient_checkpointing": true,
|
| 20 |
+
"vocab_size": 32000
|
| 21 |
+
}
|
checkpoints/fish-speech-1.4-sft-yth-lora/model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:25a27344ad35a0514f9cd60869276d2b5bf23dde6b39fe8a0421050e06984246
|
| 3 |
+
size 988997246
|
checkpoints/fish-speech-1.4-sft-yth-lora/special_tokens_map.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<|begin_of_sequence|>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "<|end_of_sequence|>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": {
|
| 17 |
+
"content": "<|pad|>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
}
|
| 23 |
+
}
|
checkpoints/fish-speech-1.4-sft-yth-lora/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoints/fish-speech-1.4-sft-yth-lora/tokenizer_config.json
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<|begin_of_sequence|>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<|end_of_sequence|>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "<|pad|>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<|im_start|>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"4": {
|
| 36 |
+
"content": "<|im_end|>",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
},
|
| 43 |
+
"5": {
|
| 44 |
+
"content": "<|semantic|>",
|
| 45 |
+
"lstrip": false,
|
| 46 |
+
"normalized": false,
|
| 47 |
+
"rstrip": false,
|
| 48 |
+
"single_word": false,
|
| 49 |
+
"special": true
|
| 50 |
+
},
|
| 51 |
+
"6": {
|
| 52 |
+
"content": "<|mel|>",
|
| 53 |
+
"lstrip": false,
|
| 54 |
+
"normalized": false,
|
| 55 |
+
"rstrip": false,
|
| 56 |
+
"single_word": false,
|
| 57 |
+
"special": true
|
| 58 |
+
},
|
| 59 |
+
"32000": {
|
| 60 |
+
"content": "<|reserve_0|>",
|
| 61 |
+
"lstrip": false,
|
| 62 |
+
"normalized": false,
|
| 63 |
+
"rstrip": false,
|
| 64 |
+
"single_word": false,
|
| 65 |
+
"special": true
|
| 66 |
+
},
|
| 67 |
+
"32001": {
|
| 68 |
+
"content": "<|reserve_1|>",
|
| 69 |
+
"lstrip": false,
|
| 70 |
+
"normalized": false,
|
| 71 |
+
"rstrip": false,
|
| 72 |
+
"single_word": false,
|
| 73 |
+
"special": true
|
| 74 |
+
}
|
| 75 |
+
},
|
| 76 |
+
"bos_token": "<|begin_of_sequence|>",
|
| 77 |
+
"clean_up_tokenization_spaces": true,
|
| 78 |
+
"eos_token": "<|end_of_sequence|>",
|
| 79 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 80 |
+
"pad_token": "<|pad|>",
|
| 81 |
+
"tokenizer_class": "PreTrainedTokenizerFast"
|
| 82 |
+
}
|
checkpoints/fish-speech-1.4/README.md
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- text-to-speech
|
| 4 |
+
license: cc-by-nc-sa-4.0
|
| 5 |
+
language:
|
| 6 |
+
- zh
|
| 7 |
+
- en
|
| 8 |
+
- de
|
| 9 |
+
- ja
|
| 10 |
+
- fr
|
| 11 |
+
- es
|
| 12 |
+
- ko
|
| 13 |
+
- ar
|
| 14 |
+
pipeline_tag: text-to-speech
|
| 15 |
+
inference: false
|
| 16 |
+
extra_gated_prompt: >-
|
| 17 |
+
You agree to not use the model to generate contents that violate DMCA or local
|
| 18 |
+
laws.
|
| 19 |
+
extra_gated_fields:
|
| 20 |
+
Country: country
|
| 21 |
+
Specific date: date_picker
|
| 22 |
+
I agree to use this model for non-commercial use ONLY: checkbox
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# Fish Speech V1.4
|
| 27 |
+
|
| 28 |
+
**Fish Speech V1.4** is a leading text-to-speech (TTS) model trained on 700k hours of audio data in multiple languages.
|
| 29 |
+
|
| 30 |
+
Supported languages:
|
| 31 |
+
- English (en) ~300k hours
|
| 32 |
+
- Chinese (zh) ~300k hours
|
| 33 |
+
- German (de) ~20k hours
|
| 34 |
+
- Japanese (ja) ~20k hours
|
| 35 |
+
- French (fr) ~20k hours
|
| 36 |
+
- Spanish (es) ~20k hours
|
| 37 |
+
- Korean (ko) ~20k hours
|
| 38 |
+
- Arabic (ar) ~20k hours
|
| 39 |
+
|
| 40 |
+
Please refer to [Fish Speech Github](https://github.com/fishaudio/fish-speech) for more info.
|
| 41 |
+
Demo available at [Fish Audio](https://fish.audio/).
|
| 42 |
+
|
| 43 |
+
## Citation
|
| 44 |
+
|
| 45 |
+
If you found this repository useful, please consider citing this work:
|
| 46 |
+
|
| 47 |
+
```
|
| 48 |
+
@misc{fish-speech-v1.4,
|
| 49 |
+
author = {Shijia Liao, Tianyu Li, etc},
|
| 50 |
+
title = {Fish Speech V1.4},
|
| 51 |
+
year = {2024},
|
| 52 |
+
publisher = {GitHub},
|
| 53 |
+
journal = {GitHub repository},
|
| 54 |
+
howpublished = {\url{https://github.com/fishaudio/fish-speech}}
|
| 55 |
+
}
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
## License
|
| 59 |
+
|
| 60 |
+
This model is permissively licensed under the BY-CC-NC-SA-4.0 license.
|
| 61 |
+
The source code is released under BSD-3-Clause license.
|
checkpoints/fish-speech-1.4/config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"attention_qkv_bias": false,
|
| 3 |
+
"codebook_size": 1024,
|
| 4 |
+
"dim": 1024,
|
| 5 |
+
"dropout": 0.1,
|
| 6 |
+
"head_dim": 64,
|
| 7 |
+
"initializer_range": 0.02,
|
| 8 |
+
"intermediate_size": 4096,
|
| 9 |
+
"max_seq_len": 4096,
|
| 10 |
+
"model_type": "dual_ar",
|
| 11 |
+
"n_fast_layer": 4,
|
| 12 |
+
"n_head": 16,
|
| 13 |
+
"n_layer": 24,
|
| 14 |
+
"n_local_heads": 2,
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"num_codebooks": 8,
|
| 17 |
+
"rope_base": 1000000.0,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"use_gradient_checkpointing": true,
|
| 20 |
+
"vocab_size": 32000
|
| 21 |
+
}
|
checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:01b81dbf753224a156c3fe139b88bf0b9a0f54b11bee864f95e66511c3ccd754
|
| 3 |
+
size 188518579
|
checkpoints/fish-speech-1.4/model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d1cfa4b59c37f58d22e0626a53cec61db79390d7d0733b6402bf6f69fe58b93
|
| 3 |
+
size 988988542
|
checkpoints/fish-speech-1.4/special_tokens_map.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<|begin_of_sequence|>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "<|end_of_sequence|>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": {
|
| 17 |
+
"content": "<|pad|>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
}
|
| 23 |
+
}
|
checkpoints/fish-speech-1.4/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoints/fish-speech-1.4/tokenizer_config.json
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<|begin_of_sequence|>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<|end_of_sequence|>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "<|pad|>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<|im_start|>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"4": {
|
| 36 |
+
"content": "<|im_end|>",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
},
|
| 43 |
+
"5": {
|
| 44 |
+
"content": "<|semantic|>",
|
| 45 |
+
"lstrip": false,
|
| 46 |
+
"normalized": false,
|
| 47 |
+
"rstrip": false,
|
| 48 |
+
"single_word": false,
|
| 49 |
+
"special": true
|
| 50 |
+
},
|
| 51 |
+
"6": {
|
| 52 |
+
"content": "<|mel|>",
|
| 53 |
+
"lstrip": false,
|
| 54 |
+
"normalized": false,
|
| 55 |
+
"rstrip": false,
|
| 56 |
+
"single_word": false,
|
| 57 |
+
"special": true
|
| 58 |
+
},
|
| 59 |
+
"7": {
|
| 60 |
+
"content": "<|reserve_0|>",
|
| 61 |
+
"lstrip": false,
|
| 62 |
+
"normalized": false,
|
| 63 |
+
"rstrip": false,
|
| 64 |
+
"single_word": false,
|
| 65 |
+
"special": true
|
| 66 |
+
},
|
| 67 |
+
"8": {
|
| 68 |
+
"content": "<|reserve_1|>",
|
| 69 |
+
"lstrip": false,
|
| 70 |
+
"normalized": false,
|
| 71 |
+
"rstrip": false,
|
| 72 |
+
"single_word": false,
|
| 73 |
+
"special": true
|
| 74 |
+
}
|
| 75 |
+
},
|
| 76 |
+
"bos_token": "<|begin_of_sequence|>",
|
| 77 |
+
"clean_up_tokenization_spaces": true,
|
| 78 |
+
"eos_token": "<|end_of_sequence|>",
|
| 79 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 80 |
+
"pad_token": "<|pad|>",
|
| 81 |
+
"tokenizer_class": "PreTrainedTokenizerFast"
|
| 82 |
+
}
|
configs/base.yaml
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base configuration for training a model
|
| 2 |
+
paths:
|
| 3 |
+
run_dir: results/${project}
|
| 4 |
+
ckpt_dir: ${paths.run_dir}/checkpoints
|
| 5 |
+
|
| 6 |
+
hydra:
|
| 7 |
+
run:
|
| 8 |
+
dir: ${paths.run_dir}
|
| 9 |
+
|
| 10 |
+
# Lightning Trainer
|
| 11 |
+
trainer:
|
| 12 |
+
_target_: lightning.pytorch.trainer.Trainer
|
| 13 |
+
|
| 14 |
+
default_root_dir: ${paths.run_dir}
|
| 15 |
+
accelerator: gpu
|
| 16 |
+
num_nodes: 1
|
| 17 |
+
devices: auto
|
| 18 |
+
strategy:
|
| 19 |
+
_target_: lightning.pytorch.strategies.DDPStrategy
|
| 20 |
+
process_group_backend: nccl # This should be override when training on windows
|
| 21 |
+
|
| 22 |
+
precision: bf16-mixed
|
| 23 |
+
|
| 24 |
+
# disable validation by epoch end
|
| 25 |
+
check_val_every_n_epoch: null
|
| 26 |
+
val_check_interval: 5000
|
| 27 |
+
max_steps: 100_000
|
| 28 |
+
|
| 29 |
+
# Use torch.backends.cudnn.benchmark to speed up training
|
| 30 |
+
benchmark: true
|
| 31 |
+
|
| 32 |
+
# Callbacks
|
| 33 |
+
callbacks:
|
| 34 |
+
model_checkpoint:
|
| 35 |
+
_target_: lightning.pytorch.callbacks.ModelCheckpoint
|
| 36 |
+
dirpath: ${paths.ckpt_dir}
|
| 37 |
+
filename: "step_{step:09d}"
|
| 38 |
+
save_last: false # additionally always save an exact copy of the last checkpoint to a file last.ckpt
|
| 39 |
+
save_top_k: 5 # save 5 latest checkpoints
|
| 40 |
+
monitor: step # use step to monitor checkpoints
|
| 41 |
+
mode: max # save the latest checkpoint with the highest global_step
|
| 42 |
+
every_n_epochs: null # don't save checkpoints by epoch end
|
| 43 |
+
every_n_train_steps: 5000 # save checkpoints every 5000 steps
|
| 44 |
+
auto_insert_metric_name: false
|
| 45 |
+
|
| 46 |
+
model_summary:
|
| 47 |
+
_target_: lightning.pytorch.callbacks.ModelSummary
|
| 48 |
+
max_depth: 2 # the maximum depth of layer nesting that the summary will include
|
| 49 |
+
|
| 50 |
+
learning_rate_monitor:
|
| 51 |
+
_target_: lightning.pytorch.callbacks.LearningRateMonitor
|
| 52 |
+
logging_interval: step
|
| 53 |
+
log_momentum: false
|
| 54 |
+
|
| 55 |
+
grad_norm_monitor:
|
| 56 |
+
_target_: fish_speech.callbacks.GradNormMonitor
|
| 57 |
+
norm_type: 2
|
| 58 |
+
logging_interval: step
|
| 59 |
+
|
| 60 |
+
# Logger
|
| 61 |
+
logger:
|
| 62 |
+
tensorboard:
|
| 63 |
+
_target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
|
| 64 |
+
save_dir: "${paths.run_dir}/tensorboard/"
|
| 65 |
+
name: null
|
| 66 |
+
log_graph: false
|
| 67 |
+
default_hp_metric: true
|
| 68 |
+
prefix: ""
|
| 69 |
+
|
| 70 |
+
# wandb:
|
| 71 |
+
# _target_: lightning.pytorch.loggers.wandb.WandbLogger
|
| 72 |
+
# # name: "" # name of the run (normally generated by wandb)
|
| 73 |
+
# save_dir: "${paths.run_dir}"
|
| 74 |
+
# offline: False
|
| 75 |
+
# id: null # pass correct id to resume experiment!
|
| 76 |
+
# anonymous: null # enable anonymous logging
|
| 77 |
+
# project: "fish-speech"
|
| 78 |
+
# log_model: False # upload lightning ckpts
|
| 79 |
+
# prefix: "" # a string to put at the beginning of metric keys
|
| 80 |
+
# # entity: "" # set to name of your wandb team
|
| 81 |
+
# group: ""
|
| 82 |
+
# tags: ["vq", "hq", "finetune"]
|
| 83 |
+
# job_type: ""
|
| 84 |
+
|
| 85 |
+
# Loop
|
| 86 |
+
train: true
|
| 87 |
+
test: false
|
configs/firefly_gan_vq.yaml
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
|
| 2 |
+
spec_transform:
|
| 3 |
+
_target_: fish_speech.utils.spectrogram.LogMelSpectrogram
|
| 4 |
+
sample_rate: 44100
|
| 5 |
+
n_mels: 160
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
hop_length: 512
|
| 8 |
+
win_length: 2048
|
| 9 |
+
backbone:
|
| 10 |
+
_target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
|
| 11 |
+
input_channels: 160
|
| 12 |
+
depths: [3, 3, 9, 3]
|
| 13 |
+
dims: [128, 256, 384, 512]
|
| 14 |
+
drop_path_rate: 0.2
|
| 15 |
+
kernel_size: 7
|
| 16 |
+
head:
|
| 17 |
+
_target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
|
| 18 |
+
hop_length: 512
|
| 19 |
+
upsample_rates: [8, 8, 2, 2, 2] # aka. strides
|
| 20 |
+
upsample_kernel_sizes: [16, 16, 4, 4, 4]
|
| 21 |
+
resblock_kernel_sizes: [3, 7, 11]
|
| 22 |
+
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
|
| 23 |
+
num_mels: 512
|
| 24 |
+
upsample_initial_channel: 512
|
| 25 |
+
pre_conv_kernel_size: 13
|
| 26 |
+
post_conv_kernel_size: 13
|
| 27 |
+
quantizer:
|
| 28 |
+
_target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
|
| 29 |
+
input_dim: 512
|
| 30 |
+
n_groups: 8
|
| 31 |
+
n_codebooks: 1
|
| 32 |
+
levels: [8, 5, 5, 5]
|
| 33 |
+
downsample_factor: [2, 2]
|
configs/lora/r_8_alpha_16.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: fish_speech.models.text2semantic.lora.LoraConfig
|
| 2 |
+
r: 8
|
| 3 |
+
lora_alpha: 16
|
| 4 |
+
lora_dropout: 0.01
|
configs/text2semantic_finetune.yaml
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
defaults:
|
| 2 |
+
- base
|
| 3 |
+
- _self_
|
| 4 |
+
|
| 5 |
+
project: text2semantic_finetune_dual_ar
|
| 6 |
+
max_length: 4096
|
| 7 |
+
pretrained_ckpt_path: checkpoints/fish-speech-1.4
|
| 8 |
+
|
| 9 |
+
# Lightning Trainer
|
| 10 |
+
trainer:
|
| 11 |
+
accumulate_grad_batches: 1
|
| 12 |
+
gradient_clip_val: 1.0
|
| 13 |
+
gradient_clip_algorithm: "norm"
|
| 14 |
+
max_steps: 1000
|
| 15 |
+
precision: bf16-true
|
| 16 |
+
limit_val_batches: 10
|
| 17 |
+
val_check_interval: 100
|
| 18 |
+
|
| 19 |
+
# Dataset Configuration
|
| 20 |
+
tokenizer:
|
| 21 |
+
_target_: transformers.AutoTokenizer.from_pretrained
|
| 22 |
+
pretrained_model_name_or_path: ${pretrained_ckpt_path}
|
| 23 |
+
|
| 24 |
+
# Dataset Configuration
|
| 25 |
+
train_dataset:
|
| 26 |
+
_target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
|
| 27 |
+
proto_files:
|
| 28 |
+
- data/protos
|
| 29 |
+
tokenizer: ${tokenizer}
|
| 30 |
+
causal: true
|
| 31 |
+
max_length: ${max_length}
|
| 32 |
+
use_speaker: false
|
| 33 |
+
interactive_prob: 0.7
|
| 34 |
+
|
| 35 |
+
val_dataset:
|
| 36 |
+
_target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
|
| 37 |
+
proto_files:
|
| 38 |
+
- data/protos
|
| 39 |
+
tokenizer: ${tokenizer}
|
| 40 |
+
causal: true
|
| 41 |
+
max_length: ${max_length}
|
| 42 |
+
use_speaker: false
|
| 43 |
+
interactive_prob: 0.7
|
| 44 |
+
|
| 45 |
+
data:
|
| 46 |
+
_target_: fish_speech.datasets.semantic.SemanticDataModule
|
| 47 |
+
train_dataset: ${train_dataset}
|
| 48 |
+
val_dataset: ${val_dataset}
|
| 49 |
+
num_workers: 4
|
| 50 |
+
batch_size: 8
|
| 51 |
+
tokenizer: ${tokenizer}
|
| 52 |
+
max_length: ${max_length}
|
| 53 |
+
|
| 54 |
+
# Model Configuration
|
| 55 |
+
model:
|
| 56 |
+
_target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
|
| 57 |
+
model:
|
| 58 |
+
_target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
|
| 59 |
+
path: ${pretrained_ckpt_path}
|
| 60 |
+
load_weights: true
|
| 61 |
+
max_length: ${max_length}
|
| 62 |
+
lora_config: null
|
| 63 |
+
|
| 64 |
+
optimizer:
|
| 65 |
+
_target_: torch.optim.AdamW
|
| 66 |
+
_partial_: true
|
| 67 |
+
lr: 1e-4
|
| 68 |
+
weight_decay: 0
|
| 69 |
+
betas: [0.9, 0.95]
|
| 70 |
+
eps: 1e-5
|
| 71 |
+
|
| 72 |
+
lr_scheduler:
|
| 73 |
+
_target_: torch.optim.lr_scheduler.LambdaLR
|
| 74 |
+
_partial_: true
|
| 75 |
+
lr_lambda:
|
| 76 |
+
_target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
|
| 77 |
+
_partial_: true
|
| 78 |
+
num_warmup_steps: 10
|
| 79 |
+
|
| 80 |
+
# Callbacks
|
| 81 |
+
callbacks:
|
| 82 |
+
model_checkpoint:
|
| 83 |
+
every_n_train_steps: ${trainer.val_check_interval}
|
examples/40_matthew-001-01.lab
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Miyan qaniy qu binkgan bbinkesan na Yesu:Yesu Kristo ga kinbahan na Tabite, Tabite ga kinbahan na Aburaham.
|
examples/40_matthew-001-01.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:173b0800783e09808e2a1c74f0a8f85fbccb705289986f5dd63ef60a821ea805
|
| 3 |
+
size 1200330
|
fish_speech/callbacks/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .grad_norm import GradNormMonitor
|
| 2 |
+
|
| 3 |
+
__all__ = ["GradNormMonitor"]
|
fish_speech/callbacks/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (239 Bytes). View file
|
|
|
fish_speech/callbacks/__pycache__/grad_norm.cpython-310.pyc
ADDED
|
Binary file (3.79 kB). View file
|
|
|
fish_speech/callbacks/grad_norm.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional, Union
|
| 2 |
+
|
| 3 |
+
import lightning.pytorch as pl
|
| 4 |
+
import torch
|
| 5 |
+
from lightning import LightningModule, Trainer
|
| 6 |
+
from lightning.pytorch.callbacks import Callback
|
| 7 |
+
from torch import Tensor, nn
|
| 8 |
+
from torch.utils._foreach_utils import (
|
| 9 |
+
_group_tensors_by_device_and_dtype,
|
| 10 |
+
_has_foreach_support,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@torch.no_grad()
|
| 15 |
+
def grad_norm(
|
| 16 |
+
parameters: Union[Tensor, list[Tensor]],
|
| 17 |
+
norm_type: float = 2.0,
|
| 18 |
+
) -> float:
|
| 19 |
+
"""
|
| 20 |
+
Returns the norm of the gradients of the given parameters.
|
| 21 |
+
|
| 22 |
+
Args:
|
| 23 |
+
parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
|
| 24 |
+
single Tensor that will have gradients normalized
|
| 25 |
+
norm_type (float): type of the used p-norm.
|
| 26 |
+
|
| 27 |
+
Returns:
|
| 28 |
+
Total norm of the parameter gradients (viewed as a single vector).
|
| 29 |
+
""" # noqa: E501
|
| 30 |
+
|
| 31 |
+
if isinstance(parameters, Tensor):
|
| 32 |
+
parameters = [parameters]
|
| 33 |
+
|
| 34 |
+
grads = [p.grad for p in parameters if p.grad is not None]
|
| 35 |
+
if len(grads) == 0:
|
| 36 |
+
return None
|
| 37 |
+
|
| 38 |
+
first_device = grads[0].device
|
| 39 |
+
grouped_grads: dict[
|
| 40 |
+
tuple[torch.device, torch.dtype], list[list[Tensor]]
|
| 41 |
+
] = _group_tensors_by_device_and_dtype(
|
| 42 |
+
[[g.detach() for g in grads]]
|
| 43 |
+
) # type: ignore[assignment]
|
| 44 |
+
|
| 45 |
+
norms = []
|
| 46 |
+
for (device, _), ([grads], _) in grouped_grads.items():
|
| 47 |
+
if _has_foreach_support(grads, device=device):
|
| 48 |
+
norms.extend(torch._foreach_norm(grads, norm_type))
|
| 49 |
+
else:
|
| 50 |
+
norms.extend([torch.norm(g, norm_type) for g in grads])
|
| 51 |
+
|
| 52 |
+
return torch.norm(torch.stack([norm.to(first_device) for norm in norms]), norm_type)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class GradNormMonitor(Callback):
    """
    Lightning callback that logs the gradient norm of the model (or of
    selected sub-modules) after each backward pass.
    """

    def __init__(
        self,
        norm_type: float = 2.0,
        logging_interval: str = "step",
        sub_module: Optional[Union[str, list[str]]] = None,
    ) -> None:
        """
        Args:
            norm_type (float): type of the used p-norm.
            logging_interval (str): "step" or "epoch".
        """
        super().__init__()

        self.norm_type = norm_type
        self.logging_interval = logging_interval
        self.sub_module = sub_module

    def on_after_backward(self, trainer: Trainer, model: LightningModule) -> None:
        """
        Computes the gradient norm of the model parameters and logs it to the logger.

        Args:
            trainer (Trainer): The trainer object
            model (LightningModule): The current lightningModule
        """
        lightning_model = model

        if self.sub_module is None:
            # No sub-module filter: log the norm over the whole model.
            return self.log_sub_module_grad_norm(lightning_model, model, "")

        targets = (
            [self.sub_module] if isinstance(self.sub_module, str) else self.sub_module
        )
        for name in targets:
            self.log_sub_module_grad_norm(
                lightning_model, getattr(model, name), f"/{name}"
            )

    def log_sub_module_grad_norm(
        self, lightning_model: LightningModule, model: nn.Module, path: str
    ) -> None:
        value = grad_norm(model.parameters(), self.norm_type)
        if value is None:
            # No gradients yet for this (sub-)module; nothing to log.
            return

        step_wise = self.logging_interval == "step"
        lightning_model.log(
            f"train{path}/grad_norm",
            value,
            on_step=step_wise,
            on_epoch=not step_wise,
        )
|
fish_speech/configs/base.yaml
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base configuration for training a model
|
| 2 |
+
paths:
|
| 3 |
+
run_dir: results/${project}
|
| 4 |
+
ckpt_dir: ${paths.run_dir}/checkpoints
|
| 5 |
+
|
| 6 |
+
hydra:
|
| 7 |
+
run:
|
| 8 |
+
dir: ${paths.run_dir}
|
| 9 |
+
|
| 10 |
+
# Lightning Trainer
|
| 11 |
+
trainer:
|
| 12 |
+
_target_: lightning.pytorch.trainer.Trainer
|
| 13 |
+
|
| 14 |
+
default_root_dir: ${paths.run_dir}
|
| 15 |
+
accelerator: gpu
|
| 16 |
+
num_nodes: 1
|
| 17 |
+
devices: auto
|
| 18 |
+
strategy:
|
| 19 |
+
_target_: lightning.pytorch.strategies.DDPStrategy
|
| 20 |
+
    process_group_backend: nccl # should be overridden (e.g. with gloo) when training on Windows
|
| 21 |
+
|
| 22 |
+
precision: bf16-mixed
|
| 23 |
+
|
| 24 |
+
# disable validation by epoch end
|
| 25 |
+
check_val_every_n_epoch: null
|
| 26 |
+
val_check_interval: 5000
|
| 27 |
+
max_steps: 100_000
|
| 28 |
+
|
| 29 |
+
# Use torch.backends.cudnn.benchmark to speed up training
|
| 30 |
+
benchmark: true
|
| 31 |
+
|
| 32 |
+
# Callbacks
|
| 33 |
+
callbacks:
|
| 34 |
+
model_checkpoint:
|
| 35 |
+
_target_: lightning.pytorch.callbacks.ModelCheckpoint
|
| 36 |
+
dirpath: ${paths.ckpt_dir}
|
| 37 |
+
filename: "step_{step:09d}"
|
| 38 |
+
    save_last: false # when true, additionally saves an exact copy of the last checkpoint to last.ckpt
|
| 39 |
+
save_top_k: 5 # save 5 latest checkpoints
|
| 40 |
+
monitor: step # use step to monitor checkpoints
|
| 41 |
+
mode: max # save the latest checkpoint with the highest global_step
|
| 42 |
+
every_n_epochs: null # don't save checkpoints by epoch end
|
| 43 |
+
every_n_train_steps: 5000 # save checkpoints every 5000 steps
|
| 44 |
+
auto_insert_metric_name: false
|
| 45 |
+
|
| 46 |
+
model_summary:
|
| 47 |
+
_target_: lightning.pytorch.callbacks.ModelSummary
|
| 48 |
+
max_depth: 2 # the maximum depth of layer nesting that the summary will include
|
| 49 |
+
|
| 50 |
+
learning_rate_monitor:
|
| 51 |
+
_target_: lightning.pytorch.callbacks.LearningRateMonitor
|
| 52 |
+
logging_interval: step
|
| 53 |
+
log_momentum: false
|
| 54 |
+
|
| 55 |
+
grad_norm_monitor:
|
| 56 |
+
_target_: fish_speech.callbacks.GradNormMonitor
|
| 57 |
+
norm_type: 2
|
| 58 |
+
logging_interval: step
|
| 59 |
+
|
| 60 |
+
# Logger
|
| 61 |
+
logger:
|
| 62 |
+
tensorboard:
|
| 63 |
+
_target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
|
| 64 |
+
save_dir: "${paths.run_dir}/tensorboard/"
|
| 65 |
+
name: null
|
| 66 |
+
log_graph: false
|
| 67 |
+
default_hp_metric: true
|
| 68 |
+
prefix: ""
|
| 69 |
+
|
| 70 |
+
# wandb:
|
| 71 |
+
# _target_: lightning.pytorch.loggers.wandb.WandbLogger
|
| 72 |
+
# # name: "" # name of the run (normally generated by wandb)
|
| 73 |
+
# save_dir: "${paths.run_dir}"
|
| 74 |
+
# offline: False
|
| 75 |
+
# id: null # pass correct id to resume experiment!
|
| 76 |
+
# anonymous: null # enable anonymous logging
|
| 77 |
+
# project: "fish-speech"
|
| 78 |
+
# log_model: False # upload lightning ckpts
|
| 79 |
+
# prefix: "" # a string to put at the beginning of metric keys
|
| 80 |
+
# # entity: "" # set to name of your wandb team
|
| 81 |
+
# group: ""
|
| 82 |
+
# tags: ["vq", "hq", "finetune"]
|
| 83 |
+
# job_type: ""
|
| 84 |
+
|
| 85 |
+
# Loop
|
| 86 |
+
train: true
|
| 87 |
+
test: false
|
fish_speech/configs/firefly_gan_vq.yaml
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
|
| 2 |
+
spec_transform:
|
| 3 |
+
_target_: fish_speech.utils.spectrogram.LogMelSpectrogram
|
| 4 |
+
sample_rate: 44100
|
| 5 |
+
n_mels: 160
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
hop_length: 512
|
| 8 |
+
win_length: 2048
|
| 9 |
+
backbone:
|
| 10 |
+
_target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
|
| 11 |
+
input_channels: 160
|
| 12 |
+
depths: [3, 3, 9, 3]
|
| 13 |
+
dims: [128, 256, 384, 512]
|
| 14 |
+
drop_path_rate: 0.2
|
| 15 |
+
kernel_size: 7
|
| 16 |
+
head:
|
| 17 |
+
_target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
|
| 18 |
+
hop_length: 512
|
| 19 |
+
upsample_rates: [8, 8, 2, 2, 2] # aka. strides
|
| 20 |
+
upsample_kernel_sizes: [16, 16, 4, 4, 4]
|
| 21 |
+
resblock_kernel_sizes: [3, 7, 11]
|
| 22 |
+
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
|
| 23 |
+
num_mels: 512
|
| 24 |
+
upsample_initial_channel: 512
|
| 25 |
+
pre_conv_kernel_size: 13
|
| 26 |
+
post_conv_kernel_size: 13
|
| 27 |
+
quantizer:
|
| 28 |
+
_target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
|
| 29 |
+
input_dim: 512
|
| 30 |
+
n_groups: 8
|
| 31 |
+
n_codebooks: 1
|
| 32 |
+
levels: [8, 5, 5, 5]
|
| 33 |
+
downsample_factor: [2, 2]
|
fish_speech/configs/lora/r_8_alpha_16.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: fish_speech.models.text2semantic.lora.LoraConfig
|
| 2 |
+
r: 8
|
| 3 |
+
lora_alpha: 16
|
| 4 |
+
lora_dropout: 0.01
|
fish_speech/configs/text2semantic_finetune.yaml
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
defaults:
|
| 2 |
+
- base
|
| 3 |
+
- _self_
|
| 4 |
+
|
| 5 |
+
project: text2semantic_finetune_dual_ar
|
| 6 |
+
max_length: 4096
|
| 7 |
+
pretrained_ckpt_path: checkpoints/fish-speech-1.4
|
| 8 |
+
|
| 9 |
+
# Lightning Trainer
|
| 10 |
+
trainer:
|
| 11 |
+
accumulate_grad_batches: 1
|
| 12 |
+
gradient_clip_val: 1.0
|
| 13 |
+
gradient_clip_algorithm: "norm"
|
| 14 |
+
max_steps: 1000
|
| 15 |
+
precision: bf16-true
|
| 16 |
+
limit_val_batches: 10
|
| 17 |
+
val_check_interval: 100
|
| 18 |
+
|
| 19 |
+
# Tokenizer Configuration
|
| 20 |
+
tokenizer:
|
| 21 |
+
_target_: transformers.AutoTokenizer.from_pretrained
|
| 22 |
+
pretrained_model_name_or_path: ${pretrained_ckpt_path}
|
| 23 |
+
|
| 24 |
+
# Dataset Configuration
|
| 25 |
+
train_dataset:
|
| 26 |
+
_target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
|
| 27 |
+
proto_files:
|
| 28 |
+
- data/protos
|
| 29 |
+
tokenizer: ${tokenizer}
|
| 30 |
+
causal: true
|
| 31 |
+
max_length: ${max_length}
|
| 32 |
+
use_speaker: false
|
| 33 |
+
interactive_prob: 0.7
|
| 34 |
+
|
| 35 |
+
val_dataset:
|
| 36 |
+
_target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
|
| 37 |
+
proto_files:
|
| 38 |
+
- data/protos
|
| 39 |
+
tokenizer: ${tokenizer}
|
| 40 |
+
causal: true
|
| 41 |
+
max_length: ${max_length}
|
| 42 |
+
use_speaker: false
|
| 43 |
+
interactive_prob: 0.7
|
| 44 |
+
|
| 45 |
+
data:
|
| 46 |
+
_target_: fish_speech.datasets.semantic.SemanticDataModule
|
| 47 |
+
train_dataset: ${train_dataset}
|
| 48 |
+
val_dataset: ${val_dataset}
|
| 49 |
+
num_workers: 4
|
| 50 |
+
batch_size: 8
|
| 51 |
+
tokenizer: ${tokenizer}
|
| 52 |
+
max_length: ${max_length}
|
| 53 |
+
|
| 54 |
+
# Model Configuration
|
| 55 |
+
model:
|
| 56 |
+
_target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
|
| 57 |
+
model:
|
| 58 |
+
_target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
|
| 59 |
+
path: ${pretrained_ckpt_path}
|
| 60 |
+
load_weights: true
|
| 61 |
+
max_length: ${max_length}
|
| 62 |
+
lora_config: null
|
| 63 |
+
|
| 64 |
+
optimizer:
|
| 65 |
+
_target_: torch.optim.AdamW
|
| 66 |
+
_partial_: true
|
| 67 |
+
lr: 1e-4
|
| 68 |
+
weight_decay: 0
|
| 69 |
+
betas: [0.9, 0.95]
|
| 70 |
+
eps: 1e-5
|
| 71 |
+
|
| 72 |
+
lr_scheduler:
|
| 73 |
+
_target_: torch.optim.lr_scheduler.LambdaLR
|
| 74 |
+
_partial_: true
|
| 75 |
+
lr_lambda:
|
| 76 |
+
_target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
|
| 77 |
+
_partial_: true
|
| 78 |
+
num_warmup_steps: 10
|
| 79 |
+
|
| 80 |
+
# Callbacks
|
| 81 |
+
callbacks:
|
| 82 |
+
model_checkpoint:
|
| 83 |
+
every_n_train_steps: ${trainer.val_check_interval}
|
fish_speech/conversation.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Placeholder token inserted into the text stream at positions that carry
# semantic (audio codebook) codes rather than ordinary text.
SEMANTIC_TOKEN = "<|semantic|>"
# Value used to pad codebook rows (doubles as the codebook BOS/padding id;
# see the "Codebook bos/padding: 0, eos: 1" convention in datasets/semantic.py).
CODEBOOK_PAD_TOKEN_ID = 0
|
fish_speech/datasets/__pycache__/semantic.cpython-310.pyc
ADDED
|
Binary file (12.4 kB). View file
|
|
|
fish_speech/datasets/concat_repeat.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import bisect
|
| 2 |
+
import random
|
| 3 |
+
from typing import Iterable
|
| 4 |
+
|
| 5 |
+
from torch.utils.data import Dataset, IterableDataset
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ConcatRepeatDataset(Dataset):
    """
    Concatenation of multiple datasets, each virtually repeated a given
    number of times.

    Index ``i`` is mapped to the owning (dataset, repeat) region via the
    cumulative sizes, then wrapped into the dataset's true length.

    Args:
        datasets: map-style datasets to concatenate
        repeats: repeat count per dataset; must match ``datasets`` in length
    """

    datasets: list[Dataset]
    cumulative_sizes: list[int]
    repeats: list[int]

    @staticmethod
    def cumsum(sequence, repeats):
        """Return the running totals of ``len(dataset) * repeat`` per pair."""
        r, s = [], 0
        for dataset, repeat in zip(sequence, repeats):
            l = len(dataset) * repeat
            r.append(l + s)
            s += l
        return r

    def __init__(self, datasets: Iterable[Dataset], repeats: list[int]):
        super().__init__()

        self.datasets = list(datasets)
        self.repeats = repeats

        assert len(self.datasets) > 0, "datasets should not be an empty iterable"
        assert len(self.datasets) == len(
            repeats
        ), "datasets and repeats should have the same length"

        for d in self.datasets:
            # IterableDataset has no __len__/__getitem__, so it cannot be
            # indexed by this wrapper.
            assert not isinstance(
                d, IterableDataset
            ), "ConcatRepeatDataset does not support IterableDataset"

        self.cumulative_sizes = self.cumsum(self.datasets, self.repeats)

    def __len__(self):
        return self.cumulative_sizes[-1]

    def __getitem__(self, idx):
        # Support negative indices the same way torch's ConcatDataset does,
        # and fail loudly on out-of-range access instead of silently
        # wrapping into the first dataset.
        if idx < 0:
            if -idx > len(self):
                raise IndexError(f"index {idx} out of range for length {len(self)}")
            idx = len(self) + idx
        elif idx >= len(self):
            raise IndexError(f"index {idx} out of range for length {len(self)}")

        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)

        if dataset_idx == 0:
            sample_idx = idx
        else:
            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]

        dataset = self.datasets[dataset_idx]

        # The dataset is virtually repeated, so wrap into its true length.
        return dataset[sample_idx % len(dataset)]
|
fish_speech/datasets/protos/__pycache__/text_data_pb2.cpython-310.pyc
ADDED
|
Binary file (1.26 kB). View file
|
|
|
fish_speech/datasets/protos/__pycache__/text_data_stream.cpython-310.pyc
ADDED
|
Binary file (1.13 kB). View file
|
|
|
fish_speech/datasets/protos/text-data.proto
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
syntax = "proto3";

package text_data;

// A run of semantic (audio codebook) token values for one codebook.
message Semantics {
    repeated uint32 values = 1;
}

// One sentence: alternative text renderings plus per-codebook semantics.
message Sentence {
    repeated string texts = 1;
    repeated Semantics semantics = 3;
}

// A named group of sentences from one source (name is used as the speaker
// id by the training dataset).
message TextData {
    string source = 1;
    string name = 2;
    repeated Sentence sentences = 4;
}

// A sampled subset of a group's sentences, produced by the data sampler.
message SampledData {
    string source = 1;
    string name = 2;
    repeated Sentence samples = 3;
}
|
fish_speech/datasets/protos/text_data_pb2.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: text-data.proto
# Protobuf Python Version: 4.25.1
# NOTE: machine-generated from text-data.proto; regenerate with protoc
# instead of editing by hand.
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder

# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()


# Serialized FileDescriptorProto for text-data.proto
# (messages: Semantics, Sentence, TextData, SampledData).
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
    b'\n\x0ftext-data.proto\x12\ttext_data"\x1b\n\tSemantics\x12\x0e\n\x06values\x18\x01 \x03(\r"B\n\x08Sentence\x12\r\n\x05texts\x18\x01 \x03(\t\x12\'\n\tsemantics\x18\x03 \x03(\x0b\x32\x14.text_data.Semantics"P\n\x08TextData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12&\n\tsentences\x18\x04 \x03(\x0b\x32\x13.text_data.Sentence"Q\n\x0bSampledData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12$\n\x07samples\x18\x03 \x03(\x0b\x32\x13.text_data.Sentenceb\x06proto3'
)

_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "text_data_pb2", _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
    DESCRIPTOR._options = None
    # Byte offsets of each message descriptor within the serialized file.
    _globals["_SEMANTICS"]._serialized_start = 30
    _globals["_SEMANTICS"]._serialized_end = 57
    _globals["_SENTENCE"]._serialized_start = 59
    _globals["_SENTENCE"]._serialized_end = 125
    _globals["_TEXTDATA"]._serialized_start = 127
    _globals["_TEXTDATA"]._serialized_end = 207
    _globals["_SAMPLEDDATA"]._serialized_start = 209
    _globals["_SAMPLEDDATA"]._serialized_end = 290
# @@protoc_insertion_point(module_scope)
|
fish_speech/datasets/protos/text_data_stream.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import struct
|
| 2 |
+
|
| 3 |
+
from .text_data_pb2 import TextData
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def read_pb_stream(f):
    """Yield TextData messages from a length-prefixed binary stream *f*."""
    while True:
        header = f.read(4)
        if not header:
            # Clean end of stream.
            break
        (size,) = struct.unpack("I", header)
        message = TextData()
        message.ParseFromString(f.read(size))
        yield message
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def write_pb_stream(f, text_data):
    """Append one message to *f*, prefixed with its 4-byte length."""
    payload = text_data.SerializeToString()
    f.write(struct.pack("I", len(payload)))
    f.write(payload)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def pack_pb_stream(text_data):
    """Return the message serialized with its 4-byte length prefix prepended."""
    payload = text_data.SerializeToString()
    return b"".join((struct.pack("I", len(payload)), payload))
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def split_pb_stream(f):
    """Yield raw length-prefixed records (header + payload) without parsing them."""
    while header := f.read(4):
        (size,) = struct.unpack("I", header)
        yield header + f.read(size)
|
fish_speech/datasets/semantic.py
ADDED
|
@@ -0,0 +1,496 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
from itertools import chain
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from random import Random
|
| 6 |
+
from typing import Optional, Union
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import pyarrow.parquet as pq
|
| 10 |
+
import torch
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
from datasets.download.streaming_download_manager import xopen
|
| 13 |
+
from huggingface_hub import HfApi
|
| 14 |
+
from lightning import LightningDataModule
|
| 15 |
+
from torch.distributed import get_rank, get_world_size, is_initialized
|
| 16 |
+
from torch.utils.data import DataLoader, IterableDataset, get_worker_info
|
| 17 |
+
from transformers import AutoTokenizer
|
| 18 |
+
|
| 19 |
+
from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
|
| 20 |
+
from fish_speech.datasets.protos.text_data_pb2 import SampledData
|
| 21 |
+
from fish_speech.datasets.protos.text_data_stream import read_pb_stream
|
| 22 |
+
from fish_speech.text.clean import clean_text
|
| 23 |
+
from fish_speech.utils import RankedLogger
|
| 24 |
+
from fish_speech.utils.braceexpand import braceexpand
|
| 25 |
+
|
| 26 |
+
log = RankedLogger(__name__, rank_zero_only=True)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def split_by_rank_worker(files):
    """
    Shard *files* across (DDP rank) x (dataloader worker) so each consumer
    sees a disjoint round-robin slice.

    If there are fewer files than total consumers, the list is repeated so
    every consumer receives at least one file.

    Args:
        files: list of file paths (or any sliceable sequence).

    Returns:
        The sub-list assigned to the calling rank/worker (the input itself
        when running single-process outside a DataLoader worker).
    """
    # We need to know the total number of devices
    # to split the data properly

    total_devices = 1
    if is_initialized():
        total_devices = get_world_size()

    worker_info = get_worker_info()
    if worker_info is not None:
        total_devices *= worker_info.num_workers

    if not files:
        # Nothing to shard; also avoids a division by zero below.
        return files

    if len(files) < total_devices:
        # Repeat the files N times to match the number of devices
        files = files * (total_devices // len(files) + 1)

    # DDP: round-robin across ranks first...
    if is_initialized():
        files = files[get_rank() :: get_world_size()]

    # ...then across dataloader workers within this rank.
    if worker_info is not None:
        files = files[worker_info.id :: worker_info.num_workers]

    return files
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class AutoTextSemanticInstructionDataset(IterableDataset):
|
| 57 |
+
"""
|
| 58 |
+
Auto Augment Dataset by Speaker
|
| 59 |
+
|
| 60 |
+
1. Random concatenate multiple sentences from the same speaker to form a longer sentence
|
| 61 |
+
2. Automatically normalize the text
|
| 62 |
+
|
| 63 |
+
For interactive mode, we use the following format (multiple sequences):
|
| 64 |
+
<s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
|
| 65 |
+
|
| 66 |
+
For non-interactive mode, we use the following format (one long sequence):
|
| 67 |
+
<s> [INST] text [/INST] ... </s>
|
| 68 |
+
"""
|
| 69 |
+
|
| 70 |
+
def __init__(
|
| 71 |
+
self,
|
| 72 |
+
proto_files: list[str],
|
| 73 |
+
seed: int = 42,
|
| 74 |
+
interactive_prob: float = 0.5,
|
| 75 |
+
max_length: int = 1024,
|
| 76 |
+
tokenizer: AutoTokenizer = None,
|
| 77 |
+
use_speaker: bool | float = True,
|
| 78 |
+
causal: bool = True,
|
| 79 |
+
num_codebooks: Optional[int] = None,
|
| 80 |
+
skip_text_prob: float = 0.0,
|
| 81 |
+
):
|
| 82 |
+
"""
|
| 83 |
+
Args:
|
| 84 |
+
proto_files: proto buf files if using local data
|
| 85 |
+
seed: random seed
|
| 86 |
+
interactive_prob: probability to use interactive mode
|
| 87 |
+
max_length: max length of the text
|
| 88 |
+
tokenizer: tokenizer
|
| 89 |
+
use_speaker: include speaker information in the prompt
|
| 90 |
+
causal: use causal sampling when using local data, disable will lead to random sampling
|
| 91 |
+
num_codebooks: number of codebooks, if None, it will be automatically detected
|
| 92 |
+
skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode
|
| 93 |
+
"""
|
| 94 |
+
|
| 95 |
+
super().__init__()
|
| 96 |
+
|
| 97 |
+
assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
|
| 98 |
+
|
| 99 |
+
self.seed = seed
|
| 100 |
+
self.max_length = max_length
|
| 101 |
+
self.tokenizer = tokenizer
|
| 102 |
+
self.interactive_prob = interactive_prob
|
| 103 |
+
self.use_speaker = use_speaker
|
| 104 |
+
self.proto_files = proto_files
|
| 105 |
+
self.causal = causal
|
| 106 |
+
self.num_codebooks = num_codebooks
|
| 107 |
+
self.skip_text_prob = skip_text_prob
|
| 108 |
+
|
| 109 |
+
self.semantic_token_id = self.tokenizer.convert_tokens_to_ids("<|semantic|>")
|
| 110 |
+
self.groups = None
|
| 111 |
+
|
| 112 |
+
    def init_mock_data_server(self):
        """
        Lazily load all proto shards assigned to this rank/worker into memory.

        Populates ``self.groups`` (list of TextData groups) and
        ``self.group_weights`` (per-group sentence counts used for weighted
        sampling). Subsequent calls are no-ops.
        """
        if self.groups is not None:
            return

        # Expand the proto files (brace patterns, and recursive directory globs)
        expanded_proto_files = []
        for filename in self.proto_files:
            for i in braceexpand(filename):
                i = Path(i)
                if i.is_file():
                    expanded_proto_files.append(i)
                elif i.is_dir():
                    expanded_proto_files.extend(i.rglob("*.proto"))
                    expanded_proto_files.extend(i.rglob("*.protos"))
                else:
                    raise ValueError(f"{i} is not a file or directory")

        # sorted() before the seeded shuffle makes the ordering deterministic
        # regardless of filesystem enumeration order.
        expanded_proto_files = sorted(expanded_proto_files)
        Random(self.seed).shuffle(expanded_proto_files)

        self.groups = []
        # Each rank/worker only reads its own shard of the file list.
        shard_proto_files = split_by_rank_worker(expanded_proto_files)
        log.info(
            f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
        )

        count = 0
        for filename in shard_proto_files:
            with open(filename, "rb") as f:
                for text_data in read_pb_stream(f):
                    self.groups.append(text_data)
                    count += 1

        log.info(f"Read total {count} groups of data")

        # Shuffle the lines
        Random(self.seed).shuffle(self.groups)
        # Weight each group by its sentence count for random.choices sampling.
        self.group_weights = [len(i.sentences) for i in self.groups]
|
| 150 |
+
|
| 151 |
+
    def __iter__(self):
        # Infinite stream: each pull draws and packs a fresh random sample.
        # NOTE(review): augment() can return None for an empty group — the
        # consumer is expected to filter those out (confirm in the collate fn).
        while True:
            yield self.augment()
|
| 154 |
+
|
| 155 |
+
def tokenize_sentence(self, sentence: str):
|
| 156 |
+
sentence = clean_text(sentence)
|
| 157 |
+
tokens = self.tokenizer.encode(
|
| 158 |
+
f"{sentence}",
|
| 159 |
+
max_length=10**6,
|
| 160 |
+
add_special_tokens=False,
|
| 161 |
+
truncation=False,
|
| 162 |
+
)
|
| 163 |
+
return sentence, len(tokens)
|
| 164 |
+
|
| 165 |
+
def sample_data(self):
|
| 166 |
+
if self.groups is None:
|
| 167 |
+
self.init_mock_data_server()
|
| 168 |
+
|
| 169 |
+
# Shuffle unique lines, estimate that each sample is at least 20 tokens
|
| 170 |
+
num_samples = self.max_length // 20
|
| 171 |
+
|
| 172 |
+
# choice group based on their number of samples
|
| 173 |
+
group = random.choices(self.groups, weights=self.group_weights, k=1)[0]
|
| 174 |
+
|
| 175 |
+
if self.causal:
|
| 176 |
+
# Sample in order
|
| 177 |
+
if num_samples >= len(group.sentences):
|
| 178 |
+
samples = group.sentences
|
| 179 |
+
else:
|
| 180 |
+
begin = random.randint(0, len(group.sentences) - num_samples)
|
| 181 |
+
samples = group.sentences[begin : begin + num_samples]
|
| 182 |
+
else:
|
| 183 |
+
samples = random.choices(
|
| 184 |
+
group.sentences, k=min(num_samples, len(group.sentences))
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
return SampledData(
|
| 188 |
+
source=group.source,
|
| 189 |
+
name=group.name,
|
| 190 |
+
samples=samples,
|
| 191 |
+
)
|
| 192 |
+
|
| 193 |
+
    def augment(self):
        """
        Build one training example: tokens and labels of shape
        ``(1 + num_codebooks, seq_len)``, or ``None`` when the sampled group
        is empty.

        Interactive mode packs each sentence as its own chat turn; otherwise
        all sentences are merged into a single long sequence whose target
        length is drawn from a truncated normal distribution.
        """
        final_text, final_semantic = [], []
        response = self.sample_data()
        if len(response.samples) == 0:
            # Invalid group
            return None

        samples = list(response.samples)
        idx = 0
        # One Bernoulli draw per example decides the packing mode.
        use_interactive = random.random() < self.interactive_prob

        if use_interactive is False:
            # Random sample based on speaker using a truncated normal distribution
            a = torch.tensor([0], dtype=torch.float32)
            torch.nn.init.trunc_normal_(
                a,
                mean=self.max_length // 2,
                std=self.max_length // 4,
                a=10,
                b=self.max_length,
            )
            # Reserve 4 tokens of headroom for the prompt scaffolding.
            remaining_tokens = a.long().item() - 4
        else:
            remaining_tokens = self.max_length

        # Use speaker: a float means "use with this probability".
        if isinstance(self.use_speaker, float):
            use_speaker = random.random() < self.use_speaker
        else:
            use_speaker = self.use_speaker

        all_tokens, all_labels = [], []
        while remaining_tokens > 0 and len(samples) > 0:
            sentence = samples.pop(0)

            # Each sentence may have several text renderings; pick one.
            text = random.choice(sentence.texts)
            text, length = self.tokenize_sentence(text)
            # Budget both the text tokens and the semantic codes it carries.
            remaining_tokens -= length + len(sentence.semantics[0].values)

            if use_interactive is False:
                # Accumulate; everything is packed once after the loop.
                final_text.append(text)
                final_semantic.append(sentence.semantics)
            else:
                # For interactive mode, we only apply speaker for the first sentence
                # [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST]
                tokens, labels = self.pack_sentences(
                    sentences=[text],
                    semantics=[sentence.semantics],
                    speaker=response.name if use_speaker else None,
                    skip_text=random.random() < self.skip_text_prob,
                )

                all_tokens.append(tokens)
                all_labels.append(labels)

            idx += 1

        if use_interactive is False:
            # Pack all accumulated sentences into one long sequence.
            tokens, labels = self.pack_sentences(
                final_text,
                semantics=final_semantic,
                speaker=response.name if use_speaker else None,
            )
            all_tokens.append(tokens)
            all_labels.append(labels)

        # Concatenate the per-turn segments along the sequence axis.
        tokens = torch.cat(all_tokens, dim=1)
        labels = torch.cat(all_labels, dim=1)

        # Verify that the length is correct
        assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"

        data = {"tokens": tokens, "labels": labels}

        return data
|
| 268 |
+
|
| 269 |
+
    def pack_sentences(
        self,
        sentences: list[str],
        semantics: list,
        speaker: Optional[str] = None,
        skip_text: bool = False,
    ):
        """Pack text sentences and their semantic codes into one training pair.

        Lays out the chat template
        ``<|im_start|>user\\n{text}<|im_end|><|im_start|>{speaker}\\n``
        followed by one semantic placeholder token per code value and a
        closing ``<|im_end|>``, and builds matching codebook rows.

        Args:
            sentences: Text sentences concatenated (space-joined) into the
                user turn.
            semantics: One entry per sentence; each entry is a list of
                codebooks, each exposing a ``.values`` sequence of code ids
                (protobuf message — assumed; confirm against text_data.proto).
            speaker: Role name for the response turn; ``None`` falls back to
                ``"assistant"``.
            skip_text: When True, the user text is replaced by
                ``<|skip_text|>`` and all labels are masked (-100), so the
                sample acts as a condition-only prefix.

        Returns:
            ``(tokens, labels)`` — long tensors of shape
            ``(1 + num_codebooks, seq_len)``; row 0 is the text stream,
            the remaining rows are codebook streams.
        """
        if speaker is None:
            speaker = "assistant"

        cated_sentences = " ".join(sentences)
        if skip_text:
            # Condition-only sample: the literal text is hidden from the model.
            cated_sentences = "<|skip_text|>"

        final_text = "<|im_start|>user\n" + cated_sentences + "<|im_end|>"
        final_text = final_text + f"<|im_start|>{speaker}\n"

        encoded = self.tokenizer.encode(
            final_text,
            add_special_tokens=False,
            truncation=False,
            max_length=10**6,
        )
        # Length of the first codebook per segment defines the semantic span.
        semantic_length = sum([len(i[0].values) for i in semantics])
        prompt_length = len(encoded)
        num_codebooks = (
            len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
        )

        # Pack the tokens and semantics (add <s> and </s> to semantic tokens)
        tokens = (
            encoded
            + [self.semantic_token_id] * semantic_length
            + self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])
        )

        # Codebook bos/padding: 0, eos: 1
        codes = [[CODEBOOK_PAD_TOKEN_ID] * prompt_length for _ in range(num_codebooks)]
        for segment in semantics:
            for book_idx, book in zip(range(num_codebooks), segment):
                for j in book.values:
                    # Codes are shifted by +1 so 0 stays reserved for padding.
                    codes[book_idx].append(int(j) + 1)

        # One trailing pad per codebook row, aligned with the final <|im_end|>.
        for book in codes:
            book.extend([CODEBOOK_PAD_TOKEN_ID] * 1)

        tokens = [tokens] + codes

        tokens = torch.tensor(tokens, dtype=torch.long)
        labels = tokens.clone()

        if skip_text:
            # If text is not provided, the sentence is used for condition only, all labels are -100
            torch.fill_(labels, -100)
            return tokens, labels

        # Mask out the <s> tokens for semantic, predict semantic tokens only
        # Since we don't mask out the input tokens, the language modeling still works
        labels[1:, :prompt_length] = -100

        # Standard next-token shift: inputs drop the last step, labels the first.
        tokens = tokens[:, :-1]
        labels = labels[:, 1:]

        # Verify the padding is correct, and the last token is eos
        assert (tokens[1:, :prompt_length] == CODEBOOK_PAD_TOKEN_ID).all()
        assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()

        return tokens, labels
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
@dataclass
class TextDataCollator:
    """Collates variable-length (tokens, labels) examples into padded batches.

    Attributes:
        tokenizer: used only for its ``eos_token_id`` padding value.
        max_length: hard cap on sequence length; longer examples are truncated.
    """

    tokenizer: AutoTokenizer
    max_length: int = 1024

    def __call__(self, examples):
        # BUGFIX: `examples` is a *list* of dicts, so the original
        # `if "negative_tokens" in examples:` tested list membership of a
        # string and could never be true, silently disabling the
        # positive/negative (DPO-style) split. Inspect the first example's
        # keys instead.
        if examples and "negative_tokens" in examples[0]:
            positive_examples = []
            negative_examples = []

            for i in examples:
                positive_examples.append(
                    {
                        "tokens": i["tokens"],
                        "labels": i["labels"],
                    }
                )
                negative_examples.append(
                    {
                        "tokens": i["negative_tokens"],
                        "labels": i["negative_labels"],
                    }
                )

            # Positives first, then the matching negatives.
            examples = positive_examples + negative_examples

        return self.batchify(examples)

    def batchify(self, examples, tokens_key="tokens", labels_key="labels"):
        """Truncate to ``max_length``, right-pad, and stack one batch.

        Returns a dict with:
            inputs: (B, 1 + num_codebooks, T) long tensor,
            attention_masks: (B, T) bool tensor where True marks *padding*,
            labels: (B, 1 + num_codebooks, T) padded with -100.
        """
        tokens, attention_masks, labels = [], [], []

        # Calculate the max length across the batch, capped at max_length.
        max_tokens_length = 0
        for example in examples:
            max_tokens_length = max(max_tokens_length, example[tokens_key].size(1))
        max_tokens_length = min(max_tokens_length, self.max_length)

        for example in examples:
            _tokens = example[tokens_key][:, :max_tokens_length]
            _labels = example[labels_key][:, :max_tokens_length]
            # Inverted mask convention: False = real token, True = padding.
            _attention_mask = torch.ones((max_tokens_length,), dtype=torch.bool)
            tokens_length = _tokens.size(1)
            _attention_mask[:tokens_length] = False

            assert tokens_length == _labels.size(
                1
            ), f"{tokens_length} != {_labels.size(1)}"

            if tokens_length < max_tokens_length:
                # Text row is padded with EOS; codebook rows with the pad code.
                _tokens = F.pad(
                    _tokens,
                    (0, max_tokens_length - tokens_length),
                    value=self.tokenizer.eos_token_id,
                )
                _tokens[1:, tokens_length:] = CODEBOOK_PAD_TOKEN_ID
                _labels = F.pad(
                    _labels, (0, max_tokens_length - _labels.size(1)), value=-100
                )

            tokens.append(_tokens)
            attention_masks.append(_attention_mask)
            labels.append(_labels)

        tokens = torch.stack(tokens, dim=0)
        attention_masks = torch.stack(attention_masks, dim=0)
        labels = torch.stack(labels, dim=0)

        return {
            "inputs": tokens,
            "attention_masks": attention_masks,
            "labels": labels,
        }
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
class InterleaveDataset(IterableDataset):
    """Infinite stream that randomly interleaves several iterable datasets.

    Each item is drawn from one source dataset, chosen according to
    ``probabilities``; exhausted sources are transparently restarted.
    """

    def __init__(
        self,
        datasets: list[IterableDataset],
        probabilities: list[float],
        seed: int = 42,
    ):
        super().__init__()

        self.datasets = datasets
        self.probabilities = probabilities
        self.seed = seed

    def __iter__(self):
        rng = np.random.default_rng(self.seed)
        iterators = [iter(source) for source in self.datasets]

        while True:
            # Pick which source to draw from this step.
            chosen = rng.choice(len(self.datasets), p=self.probabilities)
            try:
                item = next(iterators[chosen])
            except StopIteration:
                # Source ran dry — restart it and take its first item.
                iterators[chosen] = iter(self.datasets[chosen])
                item = next(iterators[chosen])
            yield item
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
class SemanticDataModule(LightningDataModule):
    """Lightning data module pairing semantic train/val datasets with a
    shared ``TextDataCollator`` configuration."""

    def __init__(
        self,
        train_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
        val_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
        batch_size: int = 32,
        tokenizer: AutoTokenizer = None,
        max_length: int = 1024,
        num_workers: int = 4,
    ):
        super().__init__()

        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_workers = num_workers

    def _create_loader(self, dataset):
        # Train and validation share an identical loader configuration.
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            collate_fn=TextDataCollator(self.tokenizer, self.max_length),
            num_workers=self.num_workers,
            persistent_workers=True,
        )

    def train_dataloader(self):
        return self._create_loader(self.train_dataset)

    def val_dataloader(self):
        return self._create_loader(self.val_dataset)
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
if __name__ == "__main__":
    from tqdm import tqdm

    dataset = AutoTextSemanticInstructionDataset(
        ["data/protos"],
        tokenizer=AutoTokenizer.from_pretrained("fishaudio/fish-speech-1"),
        use_speaker=False,
        interactive_prob=1.0,
        skip_text_prob=0.5,
    )

    # Decode and print a single packed example, then stop.
    for sample in dataset:
        print(dataset.tokenizer.decode(sample["tokens"][0], skip_special_tokens=False))
        break
|
fish_speech/datasets/vqgan.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
import librosa
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
from lightning import LightningDataModule
|
| 9 |
+
from torch.utils.data import DataLoader, Dataset
|
| 10 |
+
|
| 11 |
+
from fish_speech.utils import RankedLogger
|
| 12 |
+
|
| 13 |
+
logger = RankedLogger(__name__, rank_zero_only=False)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class VQGANDataset(Dataset):
    """Loads mono audio clips listed in a filelist for VQGAN training.

    Each non-empty line of the filelist is a path interpreted relative to
    the filelist's own directory.
    """

    def __init__(
        self,
        filelist: str,
        sample_rate: int = 32000,
        hop_length: int = 640,
        slice_frames: Optional[int] = None,
    ):
        super().__init__()

        filelist_path = Path(filelist)
        root = filelist_path.parent

        # Blank lines are skipped; paths are resolved against `root`.
        self.files = [
            root / entry.strip()
            for entry in filelist_path.read_text(encoding="utf-8").splitlines()
            if entry.strip()
        ]
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.slice_frames = slice_frames

    def __len__(self):
        return len(self.files)

    def get_item(self, idx):
        audio, _ = librosa.load(self.files[idx], sr=self.sample_rate, mono=True)

        # Optionally crop a random window of slice_frames * hop_length samples.
        if self.slice_frames is not None:
            window = self.slice_frames * self.hop_length
            if audio.shape[0] > window:
                offset = np.random.randint(0, audio.shape[0] - window)
                audio = audio[offset : offset + window]

        if len(audio) == 0:
            return None

        # Peak-normalize only when the waveform exceeds [-1, 1].
        peak = np.abs(audio).max()
        if peak > 1.0:
            audio = audio / peak

        return {
            "audio": torch.from_numpy(audio),
        }

    def __getitem__(self, idx):
        # Loading failures yield None so the collator can drop the sample.
        try:
            return self.get_item(idx)
        except Exception as e:
            import traceback

            traceback.print_exc()
            logger.error(f"Error loading {self.files[idx]}: {e}")
            return None
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@dataclass
class VQGANCollator:
    """Pads a batch of variable-length audio clips to a common length."""

    def __call__(self, batch):
        # Drop samples that failed to load (dataset returns None on error).
        valid = [item for item in batch if item is not None]

        audio_lengths = torch.tensor([len(item["audio"]) for item in valid])
        target_len = int(audio_lengths.max())

        # Right-pad every clip with zeros up to the longest one.
        padded = [
            torch.nn.functional.pad(
                item["audio"], (0, target_len - len(item["audio"]))
            )
            for item in valid
        ]

        return {
            "audios": torch.stack(padded),
            "audio_lengths": audio_lengths,
        }
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class VQGANDataModule(LightningDataModule):
    """Wires VQGAN train/val datasets into padded-audio dataloaders."""

    def __init__(
        self,
        train_dataset: VQGANDataset,
        val_dataset: VQGANDataset,
        batch_size: int = 32,
        num_workers: int = 4,
        val_batch_size: Optional[int] = None,
    ):
        super().__init__()

        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.batch_size = batch_size
        # Fall back to the training batch size when none is given.
        self.val_batch_size = val_batch_size or batch_size
        self.num_workers = num_workers

    def _create_loader(self, dataset, batch_size, shuffle):
        # Shared loader construction; only batch size and shuffling differ.
        return DataLoader(
            dataset,
            batch_size=batch_size,
            collate_fn=VQGANCollator(),
            num_workers=self.num_workers,
            shuffle=shuffle,
            persistent_workers=True,
        )

    def train_dataloader(self):
        return self._create_loader(self.train_dataset, self.batch_size, shuffle=True)

    def val_dataloader(self):
        return self._create_loader(self.val_dataset, self.val_batch_size, shuffle=False)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
if __name__ == "__main__":
    # Smoke test: iterate one batch from the training filelist.
    dataset = VQGANDataset("data/LibriTTS_R/vq_train_filelist.txt")
    dataloader = DataLoader(
        dataset, batch_size=4, shuffle=False, collate_fn=VQGANCollator()
    )

    for batch in dataloader:
        # BUGFIX: VQGANCollator only emits "audios" and "audio_lengths";
        # the previous prints of batch["features"] / batch["feature_lengths"]
        # raised KeyError on the very first batch.
        print(batch["audios"].shape)
        print(batch["audio_lengths"])
        break
|
fish_speech/i18n/README.md
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## i18n Folder Attribution
|
| 2 |
+
|
| 3 |
+
The `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below:
|
| 4 |
+
|
| 5 |
+
### fish_speech/i18n/core.py
|
| 6 |
+
|
| 7 |
+
**Related code from RVC:**
|
| 8 |
+
[https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py)
|
| 9 |
+
|
| 10 |
+
**Initial commit:**
|
| 11 |
+
add localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#35](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/35)
|
| 12 |
+
|
| 13 |
+
**Initial author:**
|
| 14 |
+
[@L4Ph](https://github.com/L4Ph)
|
| 15 |
+
|
| 16 |
+
### fish_speech/i18n/scan.py
|
| 17 |
+
|
| 18 |
+
**Related code from RVC:**
|
| 19 |
+
[https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py)
|
| 20 |
+
|
| 21 |
+
**Initial commit:**
|
| 22 |
+
File for detecting i18n missing keys [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#1058](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/1058)
|
| 23 |
+
|
| 24 |
+
**Initial author:**
|
| 25 |
+
[@towzeur](https://github.com/towzeur)
|
| 26 |
+
|
| 27 |
+
We appreciate the contributions of the RVC project and its authors.
|
fish_speech/i18n/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .core import i18n
|
| 2 |
+
|
| 3 |
+
__all__ = ["i18n"]
|
fish_speech/i18n/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (218 Bytes). View file
|
|
|
fish_speech/i18n/__pycache__/core.cpython-310.pyc
ADDED
|
Binary file (1.44 kB). View file
|
|
|
fish_speech/i18n/core.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import locale
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
I18N_FILE_PATH = Path(__file__).parent / "locale"
|
| 6 |
+
DEFAULT_LANGUAGE = "en_US"
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def load_language_list(language):
    """Read the translation table for *language* from its JSON locale file."""
    locale_path = I18N_FILE_PATH / f"{language}.json"
    with open(locale_path, "r", encoding="utf-8") as f:
        return json.load(f)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class I18nAuto:
    """Translates UI strings using a locale file chosen at startup.

    The language comes from a ``.locale`` override file in the working
    directory when present, otherwise from the system default locale;
    unknown languages fall back to ``DEFAULT_LANGUAGE``.
    """

    def __init__(self):
        override_file = Path(".locale")

        if override_file.exists():
            language = override_file.read_text(encoding="utf-8").strip()
        else:
            # getlocale() can't identify the system's language ((None, None)),
            # hence getdefaultlocale() despite its deprecation.
            language = locale.getdefaultlocale()[0]

        # Unknown/unsupported language -> fall back to the default.
        if not (I18N_FILE_PATH / f"{language}.json").exists():
            language = DEFAULT_LANGUAGE

        self.language = language
        self.language_map = load_language_list(language)

    def __call__(self, key):
        # Untranslated keys pass through unchanged.
        return self.language_map.get(key, key)

    def __repr__(self):
        return "Use Language: " + self.language
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
i18n = I18nAuto()
|
fish_speech/i18n/locale/en_US.json
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"16-mixed is recommended for 10+ series GPU": "16-mixed is recommended for 10+ series GPU",
|
| 3 |
+
"5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
|
| 4 |
+
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
|
| 5 |
+
"Accumulate Gradient Batches": "Accumulate Gradient Batches",
|
| 6 |
+
"Add to Processing Area": "Add to Processing Area",
|
| 7 |
+
"Added path successfully!": "Added path successfully!",
|
| 8 |
+
"Advanced Config": "Advanced Config",
|
| 9 |
+
"Base LLAMA Model": "Base LLAMA Model",
|
| 10 |
+
"Batch Inference": "Batch Inference",
|
| 11 |
+
"Batch Size": "Batch Size",
|
| 12 |
+
"Changing with the Model Path": "Changing with the Model Path",
|
| 13 |
+
"Chinese": "Chinese",
|
| 14 |
+
"Compile Model": "Compile Model",
|
| 15 |
+
"Compile the model can significantly reduce the inference time, but will increase cold start time": "Compile the model can significantly reduce the inference time, but will increase cold start time",
|
| 16 |
+
"Copy": "Copy",
|
| 17 |
+
"Data Preprocessing": "Data Preprocessing",
|
| 18 |
+
"Data Preprocessing Path": "Data Preprocessing Path",
|
| 19 |
+
"Data Source": "Data Source",
|
| 20 |
+
"Decoder Model Config": "Decoder Model Config",
|
| 21 |
+
"Decoder Model Path": "Decoder Model Path",
|
| 22 |
+
"Disabled": "Disabled",
|
| 23 |
+
"Enable Reference Audio": "Enable Reference Audio",
|
| 24 |
+
"English": "English",
|
| 25 |
+
"Error Message": "Error Message",
|
| 26 |
+
"File Preprocessing": "File Preprocessing",
|
| 27 |
+
"Generate": "Generate",
|
| 28 |
+
"Generated Audio": "Generated Audio",
|
| 29 |
+
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format",
|
| 30 |
+
"Infer interface is closed": "Infer interface is closed",
|
| 31 |
+
"Inference Configuration": "Inference Configuration",
|
| 32 |
+
"Inference Server Configuration": "Inference Server Configuration",
|
| 33 |
+
"Inference Server Error": "Inference Server Error",
|
| 34 |
+
"Inferring interface is launched at {}": "Inferring interface is launched at {}",
|
| 35 |
+
"Initial Learning Rate": "Initial Learning Rate",
|
| 36 |
+
"Input Audio & Source Path for Transcription": "Input Audio & Source Path for Transcription",
|
| 37 |
+
"Input Text": "Input Text",
|
| 38 |
+
"Invalid path: {}": "Invalid path: {}",
|
| 39 |
+
"It is recommended to use CUDA, if you have low configuration, use CPU": "It is recommended to use CUDA, if you have low configuration, use CPU",
|
| 40 |
+
"Iterative Prompt Length, 0 means off": "Iterative Prompt Length, 0 means off",
|
| 41 |
+
"Japanese": "Japanese",
|
| 42 |
+
"LLAMA Configuration": "LLAMA Configuration",
|
| 43 |
+
"LLAMA Model Config": "LLAMA Model Config",
|
| 44 |
+
"LLAMA Model Path": "LLAMA Model Path",
|
| 45 |
+
"Labeling Device": "Labeling Device",
|
| 46 |
+
"LoRA Model to be merged": "LoRA Model to be merged",
|
| 47 |
+
"Maximum Audio Duration": "Maximum Audio Duration",
|
| 48 |
+
"Maximum Length per Sample": "Maximum Length per Sample",
|
| 49 |
+
"Maximum Training Steps": "Maximum Training Steps",
|
| 50 |
+
"Maximum tokens per batch, 0 means no limit": "Maximum tokens per batch, 0 means no limit",
|
| 51 |
+
"Merge": "Merge",
|
| 52 |
+
"Merge LoRA": "Merge LoRA",
|
| 53 |
+
"Merge successfully": "Merge successfully",
|
| 54 |
+
"Minimum Audio Duration": "Minimum Audio Duration",
|
| 55 |
+
"Model Output Path": "Model Output Path",
|
| 56 |
+
"Model Size": "Model Size",
|
| 57 |
+
"Move": "Move",
|
| 58 |
+
"Move files successfully": "Move files successfully",
|
| 59 |
+
"No audio generated, please check the input text.": "No audio generated, please check the input text.",
|
| 60 |
+
"No selected options": "No selected options",
|
| 61 |
+
"Number of Workers": "Number of Workers",
|
| 62 |
+
"Open Inference Server": "Open Inference Server",
|
| 63 |
+
"Open Labeler WebUI": "Open Labeler WebUI",
|
| 64 |
+
"Open Tensorboard": "Open Tensorboard",
|
| 65 |
+
"Opened labeler in browser": "Opened labeler in browser",
|
| 66 |
+
"Optional Label Language": "Optional Label Language",
|
| 67 |
+
"Optional online ver": "Optional online ver",
|
| 68 |
+
"Output Path": "Output Path",
|
| 69 |
+
"Path error, please check the model file exists in the corresponding path": "Path error, please check the model file exists in the corresponding path",
|
| 70 |
+
"Precision": "Precision",
|
| 71 |
+
"Probability of applying Speaker Condition": "Probability of applying Speaker Condition",
|
| 72 |
+
"Put your text here.": "Put your text here.",
|
| 73 |
+
"Reference Audio": "Reference Audio",
|
| 74 |
+
"Reference Text": "Reference Text",
|
| 75 |
+
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.",
|
| 76 |
+
"Remove Selected Data": "Remove Selected Data",
|
| 77 |
+
"Removed path successfully!": "Removed path successfully!",
|
| 78 |
+
"Repetition Penalty": "Repetition Penalty",
|
| 79 |
+
"Save model every n steps": "Save model every n steps",
|
| 80 |
+
"Select LLAMA ckpt": "Select LLAMA ckpt",
|
| 81 |
+
"Select VITS ckpt": "Select VITS ckpt",
|
| 82 |
+
"Select VQGAN ckpt": "Select VQGAN ckpt",
|
| 83 |
+
"Select source file processing method": "Select source file processing method",
|
| 84 |
+
"Select the model to be trained (Depending on the Tab page you are on)": "Select the model to be trained (Depending on the Tab page you are on)",
|
| 85 |
+
"Selected: {}": "Selected: {}",
|
| 86 |
+
"Speaker": "Speaker",
|
| 87 |
+
"Speaker is identified by the folder name": "Speaker is identified by the folder name",
|
| 88 |
+
"Start Training": "Start Training",
|
| 89 |
+
"Streaming Audio": "Streaming Audio",
|
| 90 |
+
"Streaming Generate": "Streaming Generate",
|
| 91 |
+
"Tensorboard Host": "Tensorboard Host",
|
| 92 |
+
"Tensorboard Log Path": "Tensorboard Log Path",
|
| 93 |
+
"Tensorboard Port": "Tensorboard Port",
|
| 94 |
+
"Tensorboard interface is closed": "Tensorboard interface is closed",
|
| 95 |
+
"Tensorboard interface is launched at {}": "Tensorboard interface is launched at {}",
|
| 96 |
+
"Text is too long, please keep it under {} characters.": "Text is too long, please keep it under {} characters.",
|
| 97 |
+
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.",
|
| 98 |
+
"Training Configuration": "Training Configuration",
|
| 99 |
+
"Training Error": "Training Error",
|
| 100 |
+
"Training stopped": "Training stopped",
|
| 101 |
+
"Type name of the speaker": "Type name of the speaker",
|
| 102 |
+
"Type the path or select from the dropdown": "Type the path or select from the dropdown",
|
| 103 |
+
"Use LoRA": "Use LoRA",
|
| 104 |
+
"Use LoRA can save GPU memory, but may reduce the quality of the model": "Use LoRA can save GPU memory, but may reduce the quality of the model",
|
| 105 |
+
"Use filelist": "Use filelist",
|
| 106 |
+
"Use large for 10G+ GPU, medium for 5G, small for 2G": "Use large for 10G+ GPU, medium for 5G, small for 2G",
|
| 107 |
+
"VITS Configuration": "VITS Configuration",
|
| 108 |
+
"VQGAN Configuration": "VQGAN Configuration",
|
| 109 |
+
"Validation Batch Size": "Validation Batch Size",
|
| 110 |
+
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "View the status of the preprocessing folder (use the slider to control the depth of the tree)",
|
| 111 |
+
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.",
|
| 112 |
+
"WebUI Host": "WebUI Host",
|
| 113 |
+
"WebUI Port": "WebUI Port",
|
| 114 |
+
"Whisper Model": "Whisper Model",
|
| 115 |
+
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).",
|
| 116 |
+
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU",
|
| 117 |
+
"latest": "latest",
|
| 118 |
+
"new": "new",
|
| 119 |
+
"Realtime Transform Text": "Realtime Transform Text",
|
| 120 |
+
"Normalization Result Preview (Currently Only Chinese)": "Normalization Result Preview (Currently Only Chinese)",
|
| 121 |
+
"Text Normalization": "Text Normalization"
|
| 122 |
+
}
|
fish_speech/i18n/locale/es_ES.json
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"16-mixed is recommended for 10+ series GPU": "se recomienda 16-mixed para GPU de la serie 10+",
|
| 3 |
+
"5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de audio de referencia, útil para especificar el hablante.",
|
| 4 |
+
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Un modelo de texto a voz basado en VQ-GAN y Llama desarrollado por [Fish Audio](https://fish.audio).",
|
| 5 |
+
"Accumulate Gradient Batches": "Acumular lotes de gradientes",
|
| 6 |
+
"Add to Processing Area": "Agregar al Área de Procesamiento",
|
| 7 |
+
"Added path successfully!": "¡Ruta agregada exitosamente!",
|
| 8 |
+
"Advanced Config": "Configuración Avanzada",
|
| 9 |
+
"Base LLAMA Model": "Modelo Base LLAMA",
|
| 10 |
+
"Batch Inference": "Inferencia por Lote",
|
| 11 |
+
"Batch Size": "Tamaño del Lote",
|
| 12 |
+
"Changing with the Model Path": "Cambiando con la Ruta del Modelo",
|
| 13 |
+
"Chinese": "Chino",
|
| 14 |
+
"Compile Model": "Compilar Modelo",
|
| 15 |
+
"Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar el modelo puede reducir significativamente el tiempo de inferencia, pero aumentará el tiempo de inicio en frío",
|
| 16 |
+
"Copy": "Copiar",
|
| 17 |
+
"Data Preprocessing": "Preprocesamiento de Datos",
|
| 18 |
+
"Data Preprocessing Path": "Ruta de Preprocesamiento de Datos",
|
| 19 |
+
"Data Source": "Fuente de Datos",
|
| 20 |
+
"Decoder Model Config": "Configuración del modelo decodificador",
|
| 21 |
+
"Decoder Model Path": "Ruta del modelo decodificador",
|
| 22 |
+
"Disabled": "Desactivado",
|
| 23 |
+
"Enable Reference Audio": "Habilitar Audio de Referencia",
|
| 24 |
+
"English": "Inglés",
|
| 25 |
+
"Error Message": "Mensaje de Error",
|
| 26 |
+
"File Preprocessing": "Preprocesamiento de Archivos",
|
| 27 |
+
"Generate": "Generar",
|
| 28 |
+
"Generated Audio": "Audio Generado",
|
| 29 |
+
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Si no hay texto correspondiente para el audio, aplique ASR para asistencia, soporte para formato .txt o .lab",
|
| 30 |
+
"Infer interface is closed": "La interfaz de inferencia está cerrada",
|
| 31 |
+
"Inference Configuration": "Configuración de Inferencia",
|
| 32 |
+
"Inference Server Configuration": "Configuración del Servidor de Inferencia",
|
| 33 |
+
"Inference Server Error": "Error del Servidor de Inferencia",
|
| 34 |
+
"Inferring interface is launched at {}": "La interfaz de inferencia se ha lanzado en {}",
|
| 35 |
+
"Initial Learning Rate": "Tasa de Aprendizaje Inicial",
|
| 36 |
+
"Input Audio & Source Path for Transcription": "Audio de Entrada y Ruta de Origen para Transcripción",
|
| 37 |
+
"Input Text": "Texto de Entrada",
|
| 38 |
+
"Invalid path: {}": "Ruta inválida: {}",
|
| 39 |
+
"It is recommended to use CUDA, if you have low configuration, use CPU": "Se recomienda usar CUDA, si tiene una configuración baja, use CPU",
|
| 40 |
+
"Iterative Prompt Length, 0 means off": "Longitud de la Indicación Iterativa, 0 significa apagado",
|
| 41 |
+
"Japanese": "Japonés",
|
| 42 |
+
"LLAMA Configuration": "Configuración de LLAMA",
|
| 43 |
+
"LLAMA Model Config": "Configuración del Modelo LLAMA",
|
| 44 |
+
"LLAMA Model Path": "Ruta del Modelo LLAMA",
|
| 45 |
+
"Labeling Device": "Dispositivo de Etiquetado",
|
| 46 |
+
"LoRA Model to be merged": "Modelo LoRA a fusionar",
|
| 47 |
+
"Maximum Audio Duration": "Duración máxima de audio",
|
| 48 |
+
"Maximum Length per Sample": "Longitud Máxima por Muestra",
|
| 49 |
+
"Maximum Training Steps": "Pasos Máximos de Entrenamiento",
|
| 50 |
+
"Maximum tokens per batch, 0 means no limit": "Máximo de tokens por lote, 0 significa sin límite",
|
| 51 |
+
"Merge": "Fusionar",
|
| 52 |
+
"Merge LoRA": "Fusionar LoRA",
|
| 53 |
+
"Merge successfully": "Fusionado exitosamente",
|
| 54 |
+
"Minimum Audio Duration": "Duración mínima de audio",
|
| 55 |
+
"Model Output Path": "Ruta de Salida del Modelo",
|
| 56 |
+
"Model Size": "Tamaño del Modelo",
|
| 57 |
+
"Move": "Mover",
|
| 58 |
+
"Move files successfully": "Archivos movidos exitosamente",
|
| 59 |
+
"No audio generated, please check the input text.": "No se generó audio, por favor verifique el texto de entrada.",
|
| 60 |
+
"No selected options": "No hay opciones seleccionadas",
|
| 61 |
+
"Number of Workers": "Número de Trabajadores",
|
| 62 |
+
"Open Inference Server": "Abrir Servidor de Inferencia",
|
| 63 |
+
"Open Labeler WebUI": "Abrir Interfaz Web del Etiquetador",
|
| 64 |
+
"Open Tensorboard": "Abrir Tensorboard",
|
| 65 |
+
"Opened labeler in browser": "Se abrió el etiquetador en el navegador",
|
| 66 |
+
"Optional Label Language": "Idioma de Etiquetado Opcional",
|
| 67 |
+
"Optional online ver": "Ver en línea opcional",
|
| 68 |
+
"Output Path": "Ruta de Salida",
|
| 69 |
+
"Path error, please check the model file exists in the corresponding path": "Error de ruta, por favor verifique que el archivo del modelo exista en la ruta correspondiente",
|
| 70 |
+
"Precision": "Precisión",
|
| 71 |
+
"Probability of applying Speaker Condition": "Probabilidad de aplicar Condición de Hablante",
|
| 72 |
+
"Put your text here.": "Ponga su texto aquí.",
|
| 73 |
+
"Reference Audio": "Audio de Referencia",
|
| 74 |
+
"Reference Text": "Texto de Referencia",
|
| 75 |
+
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
|
| 76 |
+
"Remove Selected Data": "Eliminar Datos Seleccionados",
|
| 77 |
+
"Removed path successfully!": "¡Ruta eliminada exitosamente!",
|
| 78 |
+
"Repetition Penalty": "Penalización por Repetición",
|
| 79 |
+
"Save model every n steps": "Guardar modelo cada n pasos",
|
| 80 |
+
"Select LLAMA ckpt": "Seleccionar punto de control LLAMA",
|
| 81 |
+
"Select VITS ckpt": "Seleccionar punto de control VITS",
|
| 82 |
+
"Select VQGAN ckpt": "Seleccionar punto de control VQGAN",
|
| 83 |
+
"Select source file processing method": "Seleccione el método de procesamiento de archivos fuente",
|
| 84 |
+
"Select the model to be trained (Depending on the Tab page you are on)": "Seleccione el modelo a entrenar (Dependiendo de la pestaña en la que se encuentre)",
|
| 85 |
+
"Selected: {}": "Seleccionado: {}",
|
| 86 |
+
"Speaker": "Hablante",
|
| 87 |
+
"Speaker is identified by the folder name": "El hablante se identifica por el nombre de la carpeta",
|
| 88 |
+
"Start Training": "Iniciar Entrenamiento",
|
| 89 |
+
"Streaming Audio": "transmisión de audio",
|
| 90 |
+
"Streaming Generate": "síntesis en flujo",
|
| 91 |
+
"Tensorboard Host": "Host de Tensorboard",
|
| 92 |
+
"Tensorboard Log Path": "Ruta de Registro de Tensorboard",
|
| 93 |
+
"Tensorboard Port": "Puerto de Tensorboard",
|
| 94 |
+
"Tensorboard interface is closed": "La interfaz de Tensorboard está cerrada",
|
| 95 |
+
"Tensorboard interface is launched at {}": "La interfaz de Tensorboard se ha lanzado en {}",
|
| 96 |
+
"Text is too long, please keep it under {} characters.": "El texto es demasiado largo, por favor manténgalo por debajo de {} caracteres.",
|
| 97 |
+
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "La ruta de la carpeta de entrada a la izquierda o la lista de archivos. Ya sea que esté marcado o no, se utilizará para el entrenamiento posterior en esta lista.",
|
| 98 |
+
"Training Configuration": "Configuración de Entrenamiento",
|
| 99 |
+
"Training Error": "Error de Entrenamiento",
|
| 100 |
+
"Training stopped": "Entrenamiento detenido",
|
| 101 |
+
"Type name of the speaker": "Escriba el nombre del hablante",
|
| 102 |
+
"Type the path or select from the dropdown": "Escriba la ruta o seleccione de la lista desplegable",
|
| 103 |
+
"Use LoRA": "Usar LoRA",
|
| 104 |
+
"Use LoRA can save GPU memory, but may reduce the quality of the model": "Usar LoRA puede ahorrar memoria GPU, pero puede reducir la calidad del modelo",
|
| 105 |
+
"Use filelist": "Usar lista de archivos",
|
| 106 |
+
"Use large for 10G+ GPU, medium for 5G, small for 2G": "Use grande para GPU de 10G+, mediano para 5G, pequeño para 2G",
|
| 107 |
+
"VITS Configuration": "Configuración de VITS",
|
| 108 |
+
"VQGAN Configuration": "Configuración de VQGAN",
|
| 109 |
+
"Validation Batch Size": "Tamaño del Lote de Validación",
|
| 110 |
+
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Vea el estado de la carpeta de preprocesamiento (use el control deslizante para controlar la profundidad del árbol)",
|
| 111 |
+
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "No somos responsables de ningún mal uso del modelo, por favor considere sus leyes y regulaciones locales antes de usarlo.",
|
| 112 |
+
"WebUI Host": "Host de WebUI",
|
| 113 |
+
"WebUI Port": "Puerto de WebUI",
|
| 114 |
+
"Whisper Model": "Modelo Whisper",
|
| 115 |
+
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1).",
|
| 116 |
+
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "Se recomienda bf16-true para GPU de la serie 30+, se recomienda 16-mixed para GPU de la serie 10+",
|
| 117 |
+
"latest": "más reciente",
|
| 118 |
+
"new": "nuevo",
|
| 119 |
+
"Realtime Transform Text": "Transformación de Texto en Tiempo Real",
|
| 120 |
+
"Normalization Result Preview (Currently Only Chinese)": "Vista Previa del Resultado de Normalización (Actualmente Solo Chino)",
|
| 121 |
+
"Text Normalization": "Normalización de Texto"
|
| 122 |
+
}
|
fish_speech/i18n/locale/ja_JP.json
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"16-mixed is recommended for 10+ series GPU": "10シリーズ以降のGPUには16-mixedをお勧めします",
|
| 3 |
+
"5 to 10 seconds of reference audio, useful for specifying speaker.": "話者を指定するのに役立つ、5~10秒のリファレンスオーディオ。",
|
| 4 |
+
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)が開発したVQ-GANとLlamaに基づくテキスト音声合成モデル。",
|
| 5 |
+
"Accumulate Gradient Batches": "勾配バッチの累積",
|
| 6 |
+
"Add to Processing Area": "処理エリアに追加",
|
| 7 |
+
"Added path successfully!": "パスの追加に成功しました!",
|
| 8 |
+
"Advanced Config": "詳細設定",
|
| 9 |
+
"Base LLAMA Model": "基本LLAMAモデル",
|
| 10 |
+
"Batch Inference": "バッチ推論",
|
| 11 |
+
"Batch Size": "バッチサイズ",
|
| 12 |
+
"Changing with the Model Path": "モデルのパスに伴って変化する",
|
| 13 |
+
"Chinese": "中国語",
|
| 14 |
+
"Compile Model": "モデルのコンパイル",
|
| 15 |
+
"Compile the model can significantly reduce the inference time, but will increase cold start time": "モデルをコンパイルすると推論時間を大幅に短縮できますが、コールドスタート時間が長くなります",
|
| 16 |
+
"Copy": "コピー",
|
| 17 |
+
"Data Preprocessing": "データ前処理",
|
| 18 |
+
"Data Preprocessing Path": "データ前処理パス",
|
| 19 |
+
"Data Source": "データソース",
|
| 20 |
+
"Decoder Model Config": "デコーダーモデルの構成",
|
| 21 |
+
"Decoder Model Path": "デコーダーモデルのパス",
|
| 22 |
+
"Disabled": "無効",
|
| 23 |
+
"Enable Reference Audio": "リファレンスオーディオを有効にする",
|
| 24 |
+
"English": "英語",
|
| 25 |
+
"Error Message": "エラーメッセージ",
|
| 26 |
+
"File Preprocessing": "文書前处理",
|
| 27 |
+
"Generate": "生成",
|
| 28 |
+
"Generated Audio": "生成されたオーディオ",
|
| 29 |
+
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "音声に対応するテキストがない場合は、ASRを適用してサポートします。.txtまたは.lab形式をサポートしています",
|
| 30 |
+
"Infer interface is closed": "推論インターフェースが閉じられています",
|
| 31 |
+
"Inference Configuration": "推論設定",
|
| 32 |
+
"Inference Server Configuration": "推論サーバー設定",
|
| 33 |
+
"Inference Server Error": "推論サーバーエラー",
|
| 34 |
+
"Inferring interface is launched at {}": "推論インターフェースが{}で起動しました",
|
| 35 |
+
"Initial Learning Rate": "初期学習率",
|
| 36 |
+
"Input Audio & Source Path for Transcription": "入力オーディオと文字起こしのソースパス",
|
| 37 |
+
"Input Text": "入力テキスト",
|
| 38 |
+
"Invalid path: {}": "無効なパス: {}",
|
| 39 |
+
"It is recommended to use CUDA, if you have low configuration, use CPU": "CUDAの使用をお勧めします。低い構成の場合はCPUを使用してください",
|
| 40 |
+
"Iterative Prompt Length, 0 means off": "反復プロンプト長。0はオフを意味します",
|
| 41 |
+
"Japanese": "日本語",
|
| 42 |
+
"LLAMA Configuration": "LLAMA設定",
|
| 43 |
+
"LLAMA Model Config": "LLAMAモデル設定",
|
| 44 |
+
"LLAMA Model Path": "LLAMAモデルパス",
|
| 45 |
+
"Labeling Device": "ラベリングデバイス",
|
| 46 |
+
"LoRA Model to be merged": "マージするLoRAモデル",
|
| 47 |
+
"Maximum Audio Duration": "最大オーディオの長さ",
|
| 48 |
+
"Maximum Length per Sample": "サンプルあたりの最大長",
|
| 49 |
+
"Maximum Training Steps": "最大トレーニングステップ数",
|
| 50 |
+
"Maximum tokens per batch, 0 means no limit": "バッチあたりの最大トークン数。0は制限なしを意味します",
|
| 51 |
+
"Merge": "マージ",
|
| 52 |
+
"Merge LoRA": "LoRAのマージ",
|
| 53 |
+
"Merge successfully": "マージに成功しました",
|
| 54 |
+
"Minimum Audio Duration": "最小オーディオの長さ",
|
| 55 |
+
"Model Output Path": "モデル出力パス",
|
| 56 |
+
"Model Size": "モデルサイズ",
|
| 57 |
+
"Move": "移動",
|
| 58 |
+
"Move files successfully": "ファイルの移動に成功しました",
|
| 59 |
+
"No audio generated, please check the input text.": "オーディオが生成されていません。入力テキストを確認してください。",
|
| 60 |
+
"No selected options": "選択されたオプションはありません",
|
| 61 |
+
"Number of Workers": "ワーカー数",
|
| 62 |
+
"Open Inference Server": "推論サーバーを開く",
|
| 63 |
+
"Open Labeler WebUI": "ラベラーWebUIを開く",
|
| 64 |
+
"Open Tensorboard": "Tensorboardを開く",
|
| 65 |
+
"Opened labeler in browser": "ブラウザでラベラーを開きました",
|
| 66 |
+
"Optional Label Language": "オプションのラベル言語",
|
| 67 |
+
"Optional online ver": "オプションのオンラインバージョン",
|
| 68 |
+
"Output Path": "出力パス",
|
| 69 |
+
"Path error, please check the model file exists in the corresponding path": "パスエラー。対応するパスにモデルファイルが存在するか確認してください",
|
| 70 |
+
"Precision": "精度",
|
| 71 |
+
"Probability of applying Speaker Condition": "話者条件を適用する確率",
|
| 72 |
+
"Put your text here.": "ここにテキストを入力してください。",
|
| 73 |
+
"Reference Audio": "リファレンスオーディオ",
|
| 74 |
+
"Reference Text": "リファレンステキスト",
|
| 75 |
+
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。",
|
| 76 |
+
"Remove Selected Data": "選択したデータを削除",
|
| 77 |
+
"Removed path successfully!": "パスの削除に成功しました!",
|
| 78 |
+
"Repetition Penalty": "反復ペナルティ",
|
| 79 |
+
"Save model every n steps": "nステップごとにモデルを保存",
|
| 80 |
+
"Select LLAMA ckpt": " LLAMA チェックポイントを選択",
|
| 81 |
+
"Select VITS ckpt": "VITS チェックポイントを選択",
|
| 82 |
+
"Select VQGAN ckpt": "VQGAN チェックポイントを選択",
|
| 83 |
+
"Select source file processing method": "ソースファイルの処理方法を選択",
|
| 84 |
+
"Select the model to be trained (Depending on the Tab page you are on)": "タブページに応じてトレーニングするモデルを選択してください",
|
| 85 |
+
"Selected: {}": "選択済み: {}",
|
| 86 |
+
"Speaker": "話者",
|
| 87 |
+
"Speaker is identified by the folder name": "話者はフォルダ名で識別されます",
|
| 88 |
+
"Start Training": "トレーニング開始",
|
| 89 |
+
"Streaming Audio": "ストリーミングオーディオ",
|
| 90 |
+
"Streaming Generate": "ストリーミング合成",
|
| 91 |
+
"Tensorboard Host": "Tensorboardホスト",
|
| 92 |
+
"Tensorboard Log Path": "Tensorboardログパス",
|
| 93 |
+
"Tensorboard Port": "Tensorboardポート",
|
| 94 |
+
"Tensorboard interface is closed": "Tensorboardインターフェースが閉じられています",
|
| 95 |
+
"Tensorboard interface is launched at {}": "Tensorboardインターフェースが{}で起動されました",
|
| 96 |
+
"Text is too long, please keep it under {} characters.": "テキストが長すぎます。{}文字以内に抑えてください。",
|
| 97 |
+
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左側の入力フォルダまたはファイルリストのパス。チェックの有無にかかわらず、このリストの後続のトレーニングに使用されます。",
|
| 98 |
+
"Training Configuration": "トレーニング設定",
|
| 99 |
+
"Training Error": "トレーニングエラー",
|
| 100 |
+
"Training stopped": "トレーニングが停止しました",
|
| 101 |
+
"Type name of the speaker": "話者の名前を入力",
|
| 102 |
+
"Type the path or select from the dropdown": "パスを入力するか、ドロップダウンから選択してください",
|
| 103 |
+
"Use LoRA": "LoRAを使用",
|
| 104 |
+
"Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRAを使用するとGPUメモリを節約できますが、モデルの品質が低下する可能性があります",
|
| 105 |
+
"Use filelist": "ファイルリストを使用",
|
| 106 |
+
"Use large for 10G+ GPU, medium for 5G, small for 2G": "10G以上のGPUには大、5Gには中、2Gには小を使用してください",
|
| 107 |
+
"VITS Configuration": "VITS の構成",
|
| 108 |
+
"VQGAN Configuration": "VQGAN の構成",
|
| 109 |
+
"Validation Batch Size": "検証バッチサイズ",
|
| 110 |
+
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "前処理フォルダの状態を表示(スライダーを使用してツリーの深さを制御)",
|
| 111 |
+
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "モデルの誤用については一切責任を負いません。使用する前に、現地の法律と規制を考慮してください。",
|
| 112 |
+
"WebUI Host": "WebUIホスト",
|
| 113 |
+
"WebUI Port": "WebUIポート",
|
| 114 |
+
"Whisper Model": "Whisperモデル",
|
| 115 |
+
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "ソースコードは[こちら](https://github.com/fishaudio/fish-speech)、モデルは[こちら](https://huggingface.co/fishaudio/fish-speech-1)にあります。",
|
| 116 |
+
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30シリーズ以降のGPUにはbf16-trueを、10シリーズ以降のGPUには16-mixedをお勧めします",
|
| 117 |
+
"latest": "最新",
|
| 118 |
+
"new": "新規",
|
| 119 |
+
"Realtime Transform Text": "リアルタイム変換テキスト",
|
| 120 |
+
"Normalization Result Preview (Currently Only Chinese)": "正規化結果プレビュー(現在は中国語のみ)",
|
| 121 |
+
"Text Normalization": "テキスト正規化"
|
| 122 |
+
|
| 123 |
+
}
|
fish_speech/i18n/locale/pt_BR.json
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de áudio de referência, útil para especificar o orador.",
|
| 3 |
+
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Um modelo de texto para fala baseado em VQ-GAN e Llama desenvolvido por [Fish Audio](https://fish.audio).",
|
| 4 |
+
"Accumulate Gradient Batches": "Acumular Lotes de Gradiente",
|
| 5 |
+
"Add to Processing Area": "Adicionar à Área de Processamento",
|
| 6 |
+
"Added path successfully!": "Caminho adicionado com sucesso!",
|
| 7 |
+
"Advanced Config": "Configuração Avançada",
|
| 8 |
+
"Base LLAMA Model": "Modelo LLAMA Base",
|
| 9 |
+
"Batch Inference": "Inferência em Lote",
|
| 10 |
+
"Batch Size": "Tamanho do Lote",
|
| 11 |
+
"Changing with the Model Path": "Alterando com o Caminho do Modelo",
|
| 12 |
+
|
| 13 |
+
"Compile Model": "Compilar Modelo",
|
| 14 |
+
"Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar o modelo pode reduzir significativamente o tempo de inferência, mas aumentará a latência inicial",
|
| 15 |
+
"Copy": "Copiar",
|
| 16 |
+
"Data Preprocessing": "Pré-processamento de Dados",
|
| 17 |
+
"Data Preprocessing Path": "Caminho de Pré-processamento de Dados",
|
| 18 |
+
"Data Source": "Fonte de Dados",
|
| 19 |
+
"Decoder Model Config": "Configuração do Modelo Decodificador",
|
| 20 |
+
"Decoder Model Path": "Caminho do Modelo Decodificador",
|
| 21 |
+
"Disabled": "Desativado",
|
| 22 |
+
"Enable Initial Prompt": "Habilitar Prompt Inicial",
|
| 23 |
+
"Enable Reference Audio": "Habilitar Áudio de Referência",
|
| 24 |
+
"English": "Inglês",
|
| 25 |
+
"Japanese": "Japonês",
|
| 26 |
+
"Chinese": "Chinês",
|
| 27 |
+
"Portuguese": "Português",
|
| 28 |
+
"Spanish": "Espanhol",
|
| 29 |
+
"Error Message": "Mensagem de Erro",
|
| 30 |
+
"Faster Whisper, Up to 5g GPU memory usage": "Faster Whisper (Usa até 5 GB de vRAM)",
|
| 31 |
+
"File Preprocessing": "Pré-processamento de Arquivos",
|
| 32 |
+
"Generate": "Gerar",
|
| 33 |
+
"Generated Audio": "Áudio Gerado",
|
| 34 |
+
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Se não houver texto correspondente ao áudio, utilize o ASR para assistência (formatos .txt ou .lab)",
|
| 35 |
+
"Infer interface is closed": "A interface de inferência foi fechada",
|
| 36 |
+
"Inference Configuration": "Configuração de Inferência",
|
| 37 |
+
"Inference Server Configuration": "Configuração do Servidor de Inferência",
|
| 38 |
+
"Inference Server Error": "Erro do Servidor de Inferência",
|
| 39 |
+
"Inferring interface is launched at {}": "A interface de inferência foi iniciada em {}",
|
| 40 |
+
"Initial Learning Rate": "Taxa de Aprendizagem Inicial",
|
| 41 |
+
"Initial Prompt": "Prompt Inicial",
|
| 42 |
+
"Initial prompt can provide contextual or vocabulary-specific guidance to the model.": "O prompt inicial pode fornecer orientação contextual ou específica de vocabulário para o modelo.",
|
| 43 |
+
"Input Audio & Source Path for Transcription": "Entrada de Áudio/Caminho de Origem para Transcrição",
|
| 44 |
+
"Input Text": "Texto de Entrada",
|
| 45 |
+
"Invalid path: {}": "Caminho inválido: {}",
|
| 46 |
+
"It is recommended to use CUDA, if you have low configuration, use CPU": "Para GPUs Nvidia é recomendado usar CUDA. Se não tiver uma GPU Nvidia, use CPU",
|
| 47 |
+
"Iterative Prompt Length, 0 means off": "Comprimento do Prompt Iterativo (0 = desativado)",
|
| 48 |
+
"LLAMA Configuration": "Configuração do LLAMA",
|
| 49 |
+
"LLAMA Model Config": "Configuração do Modelo LLAMA",
|
| 50 |
+
"LLAMA Model Path": "Caminho do Modelo LLAMA",
|
| 51 |
+
"Labeling Device": "Dispositivo de Rotulagem",
|
| 52 |
+
"LoRA Model to be merged": "Modelo LoRA para mesclagem",
|
| 53 |
+
"Maximum Length per Sample": "Comprimento Máximo por Amostra",
|
| 54 |
+
"Maximum Training Steps": "Etapas Máximas de Treinamento",
|
| 55 |
+
"Maximum tokens per batch, 0 means no limit": "Número máximo de tokens por lote, 0 significa sem limite",
|
| 56 |
+
"Merge": "Mesclar",
|
| 57 |
+
"Merge LoRA": "Mesclar LoRA",
|
| 58 |
+
"Merge successfully": "Mesclado com sucesso",
|
| 59 |
+
"Model Output Path": "Caminho de Saída do Modelo",
|
| 60 |
+
"Model Quantization": "Quantização do Modelo",
|
| 61 |
+
"Model Size": "Tamanho do Modelo",
|
| 62 |
+
"Move": "Mover",
|
| 63 |
+
"Move files successfully": "Arquivos movidos com sucesso",
|
| 64 |
+
"No audio generated, please check the input text.": "Nenhum áudio gerado, verifique o texto de entrada.",
|
| 65 |
+
"No selected options": "Nenhuma opção selecionada",
|
| 66 |
+
"Normalization Result Preview (Currently Only Chinese)": "Pré-visualização do Resultado da Normalização (Atualmente Apenas Chinês)",
|
| 67 |
+
"Number of Workers": "Número de Processos",
|
| 68 |
+
"Open Inference Server": "Abrir Servidor de Inferência",
|
| 69 |
+
"Open Labeler WebUI": "Abrir WebUI de Rotulagem",
|
| 70 |
+
"Open Tensorboard": "Abrir Tensorboard",
|
| 71 |
+
"Opened labeler in browser": "WebUI de rotulagem aberta no navegador",
|
| 72 |
+
"Optional Label Language": "Idioma do Rótulo (Opcional)",
|
| 73 |
+
"Optional online ver": "Versão online (opcional)",
|
| 74 |
+
"Output Path": "Caminho de Saída",
|
| 75 |
+
"Path error, please check the model file exists in the corresponding path": "Erro de caminho, verifique se o arquivo do modelo existe no caminho correspondente",
|
| 76 |
+
"Post-quantification Precision": "Precisão Pós-quantização",
|
| 77 |
+
"Precision": "Precisão",
|
| 78 |
+
"Probability of applying Speaker Condition": "Probabilidade de Aplicar Condição de Orador",
|
| 79 |
+
"Put your text here.": "Insira seu texto aqui.",
|
| 80 |
+
"Quantify": "Quantizar",
|
| 81 |
+
"Quantify successfully": "Quantizado com sucesso",
|
| 82 |
+
"Realtime Transform Text": "Transformar Texto em Tempo Real",
|
| 83 |
+
"Reference Audio": "Áudio de Referência",
|
| 84 |
+
"Reference Text": "Texto de Referência",
|
| 85 |
+
"warning": "Aviso",
|
| 86 |
+
"Pre-processing begins...": "O pré-processamento começou!",
|
| 87 |
+
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.",
|
| 88 |
+
"Remove Selected Data": "Remover Dados Selecionados",
|
| 89 |
+
"Removed path successfully!": "Caminho removido com sucesso!",
|
| 90 |
+
"Repetition Penalty": "Penalidade de Repetição",
|
| 91 |
+
"Save model every n steps": "Salvar modelo a cada n etapas",
|
| 92 |
+
"Select LLAMA ckpt": "Selecionar .ckpt do LLAMA",
|
| 93 |
+
"Select source file processing method": "Escolha como processar o arquivo de origem",
|
| 94 |
+
"Select the model to be trained (Depending on the Tab page you are on)": "Selecione o modelo para o treinamento (dependendo da aba em que você está)",
|
| 95 |
+
"Selected: {}": "Selecionado: {}",
|
| 96 |
+
"Speaker is identified by the folder name": "O orador é identificado pelo nome da pasta",
|
| 97 |
+
"Start Training": "Iniciar Treinamento",
|
| 98 |
+
"Streaming Audio": "Áudio em Streaming",
|
| 99 |
+
"Streaming Generate": "Geração em Streaming",
|
| 100 |
+
"Tensorboard Host": "Host do Tensorboard",
|
| 101 |
+
"Tensorboard Log Path": "Caminho de Log do Tensorboard",
|
| 102 |
+
"Tensorboard Port": "Porta do Tensorboard",
|
| 103 |
+
"Tensorboard interface is closed": "A interface do Tensorboard está fechada",
|
| 104 |
+
"Tensorboard interface is launched at {}": "A interface do Tensorboard foi iniciada em {}",
|
| 105 |
+
"Text Normalization": "Normalização de Texto",
|
| 106 |
+
"Text is too long, please keep it under {} characters.": "O texto é muito longo. Mantenha-o com menos de {} caracteres.",
|
| 107 |
+
"The lower the quantitative precision, the more the effectiveness may decrease, but the greater the efficiency will increase": "Quanto menor a precisão quantitativa, mais a eficácia pode diminuir, mas maior será o aumento da eficiência",
|
| 108 |
+
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "O caminho da pasta de entrada à esquerda ou a lista de arquivos. Independentemente de estar marcada ou não, ela será utilizada para o treinamento subsequente nesta lista.",
|
| 109 |
+
"Training Configuration": "Configuração de Treinamento",
|
| 110 |
+
"Training Error": "Erro de Treinamento",
|
| 111 |
+
"Training stopped": "Treinamento interrompido!",
|
| 112 |
+
"Type the path or select from the dropdown": "Digite o caminho ou selecione no menu suspenso",
|
| 113 |
+
"Use LoRA": "Usar LoRA",
|
| 114 |
+
"Use LoRA can save GPU memory, but may reduce the quality of the model": "O uso de LoRAs pode economizar memória da GPU, mas também pode reduzir a qualidade",
|
| 115 |
+
"Use filelist": "Usar lista de arquivos",
|
| 116 |
+
"VQGAN Configuration": "Configuração do VQGAN",
|
| 117 |
+
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Visualizar o status da pasta de pré-processamento (use o controle deslizante para controlar a profundidade da árvore)",
|
| 118 |
+
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "Não nos responsabilizamos por qualquer uso indevido do modelo. Por favor, considere as leis e regulamentações locais antes de usá-lo.",
|
| 119 |
+
"WebUI Host": "Host da WebUI",
|
| 120 |
+
"WebUI Port": "Porta da WebUI",
|
| 121 |
+
"Whisper Model": "Modelo Whisper",
|
| 122 |
+
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Você pode encontrar o código fonte [aqui](https://github.com/fishaudio/fish-speech) e os modelos [aqui](https://huggingface.co/fishaudio/fish-speech-1).",
|
| 123 |
+
"auto": "automático",
|
| 124 |
+
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true é recomendado para GPUs da série 30+, 16-mixed é recomendado para GPUs da série 10+",
|
| 125 |
+
"latest": "mais recente",
|
| 126 |
+
"new": "novo",
|
| 127 |
+
"This audio introduces the basic concepts and applications of artificial intelligence and machine learning.": "Este áudio introduz os conceitos básicos e aplicações de inteligência artificial e aprendizado de máquina.",
|
| 128 |
+
"You don't need to train this model!": "Não é necessário treinar este modelo!",
|
| 129 |
+
"Yes": "Sim",
|
| 130 |
+
"No": "Não",
|
| 131 |
+
"version:": "versão:",
|
| 132 |
+
"author:": "autor:"
|
| 133 |
+
}
|