Upload SpeechT5 Korean TTS artifacts
Browse files- README.md +29 -0
- config.json +93 -0
- demo_inference.py +62 -0
- generation_config.json +9 -0
- jamo_vocab.txt +71 -0
- jamo_vocab.txt:Zone.Identifier +0 -0
- model.safetensors +3 -0
- preprocessor_config.json +19 -0
- speaker_embedding.pth +3 -0
- special_tokens_map.json +6 -0
- speecht5-korean.pth +3 -0
- speecht5-korean_epoch18_backup.pth +3 -0
- tokenizer.json +126 -0
- tokenizer_config.json +44 -0
- vocoder/config.json +48 -0
- vocoder/model.safetensors +3 -0
README.md
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Korean SpeechT5 (Jamo Tokenizer, KSS)
|
| 3 |
+
If you use this model in research or production,
|
| 4 |
+
please cite:
|
| 5 |
+
|
| 6 |
+
@misc{ahnhs2k_speecht5_korean,
|
| 7 |
+
author = {Ahn, Hosung},
|
| 8 |
+
title = {Korean SpeechT5 TTS Model},
|
| 9 |
+
year = {2025},
|
| 10 |
+
publisher = {Hugging Face},
|
| 11 |
+
url = {https://huggingface.co/ahnhs2k/speecht5-korean}
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
## 모델 특징
|
| 15 |
+
- Base Model: microsoft/speecht5_tts
|
| 16 |
+
- Dataset: Bingsu/KSS_Dataset
|
| 17 |
+
- Tokenizer: Jamo-based Korean tokenizer (character-level)
|
| 18 |
+
- Speaker Embedding: microsoft/wavlm-base-plus-sv
|
| 19 |
+
- Vocoder: microsoft/speecht5_hifigan
|
| 20 |
+
- Sample Rate: 16 kHz
|
| 21 |
+
- 단일 화자 한국어 TTS 모델
|
| 22 |
+
|
| 23 |
+
## 라이선스
|
| 24 |
+
This model is released under the **CC-BY-SA-4.0** License.
|
| 25 |
+
|
| 26 |
+
When using this model (including commercial usage), you must:
|
| 27 |
+
- Provide attribution: "Model fine-tuned by ahnhs2k (2025)"
|
| 28 |
+
- Include a link to this model page
|
| 29 |
+
- Distribute derivative models under the same license
|
config.json
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_dropout": 0.1,
|
| 3 |
+
"apply_spec_augment": true,
|
| 4 |
+
"architectures": [
|
| 5 |
+
"SpeechT5ForTextToSpeech"
|
| 6 |
+
],
|
| 7 |
+
"attention_dropout": 0.1,
|
| 8 |
+
"bos_token_id": 0,
|
| 9 |
+
"conv_bias": false,
|
| 10 |
+
"conv_dim": [
|
| 11 |
+
512,
|
| 12 |
+
512,
|
| 13 |
+
512,
|
| 14 |
+
512,
|
| 15 |
+
512,
|
| 16 |
+
512,
|
| 17 |
+
512
|
| 18 |
+
],
|
| 19 |
+
"conv_kernel": [
|
| 20 |
+
10,
|
| 21 |
+
3,
|
| 22 |
+
3,
|
| 23 |
+
3,
|
| 24 |
+
3,
|
| 25 |
+
2,
|
| 26 |
+
2
|
| 27 |
+
],
|
| 28 |
+
"conv_stride": [
|
| 29 |
+
5,
|
| 30 |
+
2,
|
| 31 |
+
2,
|
| 32 |
+
2,
|
| 33 |
+
2,
|
| 34 |
+
2,
|
| 35 |
+
2
|
| 36 |
+
],
|
| 37 |
+
"decoder_attention_heads": 12,
|
| 38 |
+
"decoder_ffn_dim": 3072,
|
| 39 |
+
"decoder_layerdrop": 0.1,
|
| 40 |
+
"decoder_layers": 6,
|
| 41 |
+
"decoder_start_token_id": 2,
|
| 42 |
+
"dtype": "float32",
|
| 43 |
+
"encoder_attention_heads": 12,
|
| 44 |
+
"encoder_ffn_dim": 3072,
|
| 45 |
+
"encoder_layerdrop": 0.1,
|
| 46 |
+
"encoder_layers": 12,
|
| 47 |
+
"encoder_max_relative_position": 160,
|
| 48 |
+
"eos_token_id": 2,
|
| 49 |
+
"feat_extract_activation": "gelu",
|
| 50 |
+
"feat_extract_norm": "group",
|
| 51 |
+
"feat_proj_dropout": 0.0,
|
| 52 |
+
"guided_attention_loss_num_heads": 2,
|
| 53 |
+
"guided_attention_loss_scale": 10.0,
|
| 54 |
+
"guided_attention_loss_sigma": 0.4,
|
| 55 |
+
"hidden_act": "gelu",
|
| 56 |
+
"hidden_dropout": 0.1,
|
| 57 |
+
"hidden_size": 768,
|
| 58 |
+
"initializer_range": 0.02,
|
| 59 |
+
"is_encoder_decoder": true,
|
| 60 |
+
"layer_norm_eps": 1e-05,
|
| 61 |
+
"mask_feature_length": 10,
|
| 62 |
+
"mask_feature_min_masks": 0,
|
| 63 |
+
"mask_feature_prob": 0.0,
|
| 64 |
+
"mask_time_length": 10,
|
| 65 |
+
"mask_time_min_masks": 2,
|
| 66 |
+
"mask_time_prob": 0.05,
|
| 67 |
+
"max_length": null,
|
| 68 |
+
"max_speech_positions": 1876,
|
| 69 |
+
"max_text_positions": 600,
|
| 70 |
+
"model_type": "speecht5",
|
| 71 |
+
"num_conv_pos_embedding_groups": 16,
|
| 72 |
+
"num_conv_pos_embeddings": 128,
|
| 73 |
+
"num_feat_extract_layers": 7,
|
| 74 |
+
"num_mel_bins": 80,
|
| 75 |
+
"pad_token_id": 1,
|
| 76 |
+
"positional_dropout": 0.1,
|
| 77 |
+
"reduction_factor": 2,
|
| 78 |
+
"scale_embedding": false,
|
| 79 |
+
"speaker_embedding_dim": 512,
|
| 80 |
+
"speaker_embedding_path": "speaker_embedding.pth",
|
| 81 |
+
"speech_decoder_postnet_dropout": 0.5,
|
| 82 |
+
"speech_decoder_postnet_kernel": 5,
|
| 83 |
+
"speech_decoder_postnet_layers": 5,
|
| 84 |
+
"speech_decoder_postnet_units": 256,
|
| 85 |
+
"speech_decoder_prenet_dropout": 0.5,
|
| 86 |
+
"speech_decoder_prenet_layers": 2,
|
| 87 |
+
"speech_decoder_prenet_units": 256,
|
| 88 |
+
"transformers_version": "4.57.3",
|
| 89 |
+
"use_cache": true,
|
| 90 |
+
"use_guided_attention_loss": true,
|
| 91 |
+
"use_guided_attn_loss": false,
|
| 92 |
+
"vocab_size": 71
|
| 93 |
+
}
|
demo_inference.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import soundfile as sf
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import unicodedata
|
| 5 |
+
from transformers import (
|
| 6 |
+
SpeechT5ForTextToSpeech,
|
| 7 |
+
SpeechT5Processor,
|
| 8 |
+
SpeechT5HifiGan,
|
| 9 |
+
PreTrainedTokenizerFast,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
# Hugging Face Hub id of the fine-tuned Korean SpeechT5 checkpoint.
MODEL_ID = "ahnhs2k/speecht5-korean"
# Prefer GPU when available; every model/tensor below is moved to this device.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def decompose_jamo(text):
    """Split precomposed Hangul syllables into their constituent jamo.

    Each precomposed syllable (Unicode block U+AC00..U+D7A3) is expanded
    into a leading consonant (choseong), a vowel (jungseong), and — when
    present — a trailing consonant (jongseong).  Every non-syllable
    character is passed through unchanged.

    Args:
        text: Input string, typically a Korean sentence.

    Returns:
        A list of single-character strings (jamo and passthrough chars).
    """
    out = []
    for ch in text:
        if "HANGUL SYLLABLE" in unicodedata.name(ch, ""):
            # Syllable index relative to the start of the Hangul block.
            offset = ord(ch) - 0xAC00
            # 588 = 21 vowels * 28 tail slots; 28 = tail slots per vowel.
            lead, rem = divmod(offset, 588)
            vowel, tail = divmod(rem, 28)
            out.append(chr(0x1100 + lead))
            out.append(chr(0x1161 + vowel))
            if tail:
                # Tail slot 0 means "no trailing consonant".
                out.append(chr(0x11A7 + tail))
        else:
            out.append(ch)
    return out
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def main():
    """Run one Korean TTS inference and save the audio as a WAV file.

    Loads the fine-tuned SpeechT5 model, the custom jamo tokenizer, the
    local HiFi-GAN vocoder and speaker embedding, synthesizes a demo
    sentence, and writes ``demo_inference_output.wav`` (16 kHz).
    """
    model = SpeechT5ForTextToSpeech.from_pretrained(MODEL_ID).to(DEVICE).eval()

    # The processor is always loaded from the original base checkpoint;
    # only its tokenizer is replaced with the custom jamo tokenizer below.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

    # Load the custom jamo tokenizer and swap it into the processor.
    tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_ID)
    processor.tokenizer = tokenizer

    here = Path(__file__).resolve().parent
    vocoder = SpeechT5HifiGan.from_pretrained(here / "vocoder").to(DEVICE).eval()

    # Speaker embedding.  map_location="cpu" keeps this working on CPU-only
    # hosts even if the tensor was saved from a CUDA device, and
    # weights_only=True avoids executing arbitrary pickle payloads.
    spk_path = here / "speaker_embedding.pth"
    spk_emb = torch.load(spk_path, map_location="cpu", weights_only=True).to(DEVICE)

    text = "안녕하세요. 자모 토크나이저 기반 한국어 TTS 데모입니다."
    jamo_seq = decompose_jamo(text)

    enc = tokenizer(jamo_seq, is_split_into_words=True, add_special_tokens=True, return_tensors="pt")
    enc = {k: v.to(DEVICE) for k, v in enc.items()}

    with torch.no_grad():
        # assumes spk_emb is a 1-D (512,) tensor -- unsqueeze adds the batch dim
        gen = model.generate_speech(enc["input_ids"], speaker_embeddings=spk_emb.unsqueeze(0), vocoder=vocoder)

    # 16000 matches the model's training sample rate (see preprocessor_config).
    sf.write("demo_inference_output.wav", gen.cpu().numpy(), 16000)
    print("Saved demo_inference_output.wav")


if __name__ == "__main__":
    main()
|
generation_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 0,
|
| 4 |
+
"decoder_start_token_id": 2,
|
| 5 |
+
"eos_token_id": 2,
|
| 6 |
+
"max_length": 1876,
|
| 7 |
+
"pad_token_id": 1,
|
| 8 |
+
"transformers_version": "4.57.3"
|
| 9 |
+
}
|
jamo_vocab.txt
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<pad>
|
| 2 |
+
<unk>
|
| 3 |
+
<bos>
|
| 4 |
+
<eos>
|
| 5 |
+
ᄀ
|
| 6 |
+
ᄁ
|
| 7 |
+
ᄂ
|
| 8 |
+
ᄃ
|
| 9 |
+
ᄄ
|
| 10 |
+
ᄅ
|
| 11 |
+
ᄆ
|
| 12 |
+
ᄇ
|
| 13 |
+
ᄈ
|
| 14 |
+
ᄉ
|
| 15 |
+
ᄊ
|
| 16 |
+
ᄋ
|
| 17 |
+
ᄌ
|
| 18 |
+
ᄍ
|
| 19 |
+
ᄎ
|
| 20 |
+
ᄏ
|
| 21 |
+
ᄐ
|
| 22 |
+
ᄑ
|
| 23 |
+
ᄒ
|
| 24 |
+
ᅡ
|
| 25 |
+
ᅢ
|
| 26 |
+
ᅣ
|
| 27 |
+
ᅤ
|
| 28 |
+
ᅥ
|
| 29 |
+
ᅦ
|
| 30 |
+
ᅧ
|
| 31 |
+
ᅨ
|
| 32 |
+
ᅩ
|
| 33 |
+
ᅪ
|
| 34 |
+
ᅫ
|
| 35 |
+
ᅬ
|
| 36 |
+
ᅭ
|
| 37 |
+
ᅮ
|
| 38 |
+
ᅯ
|
| 39 |
+
ᅰ
|
| 40 |
+
ᅱ
|
| 41 |
+
ᅲ
|
| 42 |
+
ᅳ
|
| 43 |
+
ᅴ
|
| 44 |
+
ᅵ
|
| 45 |
+
ᆨ
|
| 46 |
+
ᆩ
|
| 47 |
+
ᆪ
|
| 48 |
+
ᆫ
|
| 49 |
+
ᆬ
|
| 50 |
+
ᆭ
|
| 51 |
+
ᆮ
|
| 52 |
+
ᆯ
|
| 53 |
+
ᆰ
|
| 54 |
+
ᆱ
|
| 55 |
+
ᆲ
|
| 56 |
+
ᆳ
|
| 57 |
+
ᆴ
|
| 58 |
+
ᆵ
|
| 59 |
+
ᆶ
|
| 60 |
+
ᆷ
|
| 61 |
+
ᆸ
|
| 62 |
+
ᆹ
|
| 63 |
+
ᆺ
|
| 64 |
+
ᆻ
|
| 65 |
+
ᆼ
|
| 66 |
+
ᆽ
|
| 67 |
+
ᆾ
|
| 68 |
+
ᆿ
|
| 69 |
+
ᇀ
|
| 70 |
+
ᇁ
|
| 71 |
+
ᇂ
|
jamo_vocab.txt:Zone.Identifier
ADDED
|
Binary file (25 Bytes). View file
|
|
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95c8fe3a5eed5283f7cdfcae48f8758dae3d8bd62934ba327e268ee178506871
|
| 3 |
+
size 577758600
|
preprocessor_config.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_normalize": false,
|
| 3 |
+
"feature_extractor_type": "SpeechT5FeatureExtractor",
|
| 4 |
+
"feature_size": 1,
|
| 5 |
+
"fmax": 7600,
|
| 6 |
+
"fmin": 80,
|
| 7 |
+
"frame_signal_scale": 1.0,
|
| 8 |
+
"hop_length": 16,
|
| 9 |
+
"mel_floor": 1e-10,
|
| 10 |
+
"num_mel_bins": 80,
|
| 11 |
+
"padding_side": "right",
|
| 12 |
+
"padding_value": 0.0,
|
| 13 |
+
"processor_class": "SpeechT5Processor",
|
| 14 |
+
"reduction_factor": 2,
|
| 15 |
+
"return_attention_mask": true,
|
| 16 |
+
"sampling_rate": 16000,
|
| 17 |
+
"win_function": "hann_window",
|
| 18 |
+
"win_length": 64
|
| 19 |
+
}
|
speaker_embedding.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f25d78ffcb19e7fcde2e62b416a1f2f68801998eb714d25e7520eeec5fab4314
|
| 3 |
+
size 3695
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<bos>",
|
| 3 |
+
"eos_token": "<eos>",
|
| 4 |
+
"pad_token": "<pad>",
|
| 5 |
+
"unk_token": "<unk>"
|
| 6 |
+
}
|
speecht5-korean.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4708f3f325fb9aba87ff14e6251b7b678998207172c776c8857a283f5e4d49e0
|
| 3 |
+
size 577878135
|
speecht5-korean_epoch18_backup.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4708f3f325fb9aba87ff14e6251b7b678998207172c776c8857a283f5e4d49e0
|
| 3 |
+
size 577878135
|
tokenizer.json
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "1.0",
|
| 3 |
+
"truncation": null,
|
| 4 |
+
"padding": null,
|
| 5 |
+
"added_tokens": [
|
| 6 |
+
{
|
| 7 |
+
"id": 0,
|
| 8 |
+
"content": "<pad>",
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"lstrip": false,
|
| 11 |
+
"rstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"special": true
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"id": 1,
|
| 17 |
+
"content": "<unk>",
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"normalized": false,
|
| 22 |
+
"special": true
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"id": 2,
|
| 26 |
+
"content": "<bos>",
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"lstrip": false,
|
| 29 |
+
"rstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"special": true
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"id": 3,
|
| 35 |
+
"content": "<eos>",
|
| 36 |
+
"single_word": false,
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"rstrip": false,
|
| 39 |
+
"normalized": false,
|
| 40 |
+
"special": true
|
| 41 |
+
}
|
| 42 |
+
],
|
| 43 |
+
"normalizer": null,
|
| 44 |
+
"pre_tokenizer": {
|
| 45 |
+
"type": "Whitespace"
|
| 46 |
+
},
|
| 47 |
+
"post_processor": null,
|
| 48 |
+
"decoder": null,
|
| 49 |
+
"model": {
|
| 50 |
+
"type": "WordLevel",
|
| 51 |
+
"vocab": {
|
| 52 |
+
"<pad>": 0,
|
| 53 |
+
"<unk>": 1,
|
| 54 |
+
"<bos>": 2,
|
| 55 |
+
"<eos>": 3,
|
| 56 |
+
"ᄀ": 4,
|
| 57 |
+
"ᄁ": 5,
|
| 58 |
+
"ᄂ": 6,
|
| 59 |
+
"ᄃ": 7,
|
| 60 |
+
"ᄄ": 8,
|
| 61 |
+
"ᄅ": 9,
|
| 62 |
+
"ᄆ": 10,
|
| 63 |
+
"ᄇ": 11,
|
| 64 |
+
"ᄈ": 12,
|
| 65 |
+
"ᄉ": 13,
|
| 66 |
+
"ᄊ": 14,
|
| 67 |
+
"ᄋ": 15,
|
| 68 |
+
"ᄌ": 16,
|
| 69 |
+
"ᄍ": 17,
|
| 70 |
+
"ᄎ": 18,
|
| 71 |
+
"ᄏ": 19,
|
| 72 |
+
"ᄐ": 20,
|
| 73 |
+
"ᄑ": 21,
|
| 74 |
+
"ᄒ": 22,
|
| 75 |
+
"ᅡ": 23,
|
| 76 |
+
"ᅢ": 24,
|
| 77 |
+
"ᅣ": 25,
|
| 78 |
+
"ᅤ": 26,
|
| 79 |
+
"ᅥ": 27,
|
| 80 |
+
"ᅦ": 28,
|
| 81 |
+
"ᅧ": 29,
|
| 82 |
+
"ᅨ": 30,
|
| 83 |
+
"ᅩ": 31,
|
| 84 |
+
"ᅪ": 32,
|
| 85 |
+
"ᅫ": 33,
|
| 86 |
+
"ᅬ": 34,
|
| 87 |
+
"ᅭ": 35,
|
| 88 |
+
"ᅮ": 36,
|
| 89 |
+
"ᅯ": 37,
|
| 90 |
+
"ᅰ": 38,
|
| 91 |
+
"ᅱ": 39,
|
| 92 |
+
"ᅲ": 40,
|
| 93 |
+
"ᅳ": 41,
|
| 94 |
+
"ᅴ": 42,
|
| 95 |
+
"ᅵ": 43,
|
| 96 |
+
"ᆨ": 44,
|
| 97 |
+
"ᆩ": 45,
|
| 98 |
+
"ᆪ": 46,
|
| 99 |
+
"ᆫ": 47,
|
| 100 |
+
"ᆬ": 48,
|
| 101 |
+
"ᆭ": 49,
|
| 102 |
+
"ᆮ": 50,
|
| 103 |
+
"ᆯ": 51,
|
| 104 |
+
"ᆰ": 52,
|
| 105 |
+
"ᆱ": 53,
|
| 106 |
+
"ᆲ": 54,
|
| 107 |
+
"ᆳ": 55,
|
| 108 |
+
"ᆴ": 56,
|
| 109 |
+
"ᆵ": 57,
|
| 110 |
+
"ᆶ": 58,
|
| 111 |
+
"ᆷ": 59,
|
| 112 |
+
"ᆸ": 60,
|
| 113 |
+
"ᆹ": 61,
|
| 114 |
+
"ᆺ": 62,
|
| 115 |
+
"ᆻ": 63,
|
| 116 |
+
"ᆼ": 64,
|
| 117 |
+
"ᆽ": 65,
|
| 118 |
+
"ᆾ": 66,
|
| 119 |
+
"ᆿ": 67,
|
| 120 |
+
"ᇀ": 68,
|
| 121 |
+
"ᇁ": 69,
|
| 122 |
+
"ᇂ": 70
|
| 123 |
+
},
|
| 124 |
+
"unk_token": "<unk>"
|
| 125 |
+
}
|
| 126 |
+
}
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<pad>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<unk>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "<bos>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<eos>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
}
|
| 35 |
+
},
|
| 36 |
+
"bos_token": "<bos>",
|
| 37 |
+
"clean_up_tokenization_spaces": false,
|
| 38 |
+
"eos_token": "<eos>",
|
| 39 |
+
"extra_special_tokens": {},
|
| 40 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 41 |
+
"pad_token": "<pad>",
|
| 42 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 43 |
+
"unk_token": "<unk>"
|
| 44 |
+
}
|
vocoder/config.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"SpeechT5HifiGan"
|
| 4 |
+
],
|
| 5 |
+
"dtype": "float32",
|
| 6 |
+
"initializer_range": 0.01,
|
| 7 |
+
"leaky_relu_slope": 0.1,
|
| 8 |
+
"model_in_dim": 80,
|
| 9 |
+
"model_type": "hifigan",
|
| 10 |
+
"normalize_before": true,
|
| 11 |
+
"resblock_dilation_sizes": [
|
| 12 |
+
[
|
| 13 |
+
1,
|
| 14 |
+
3,
|
| 15 |
+
5
|
| 16 |
+
],
|
| 17 |
+
[
|
| 18 |
+
1,
|
| 19 |
+
3,
|
| 20 |
+
5
|
| 21 |
+
],
|
| 22 |
+
[
|
| 23 |
+
1,
|
| 24 |
+
3,
|
| 25 |
+
5
|
| 26 |
+
]
|
| 27 |
+
],
|
| 28 |
+
"resblock_kernel_sizes": [
|
| 29 |
+
3,
|
| 30 |
+
7,
|
| 31 |
+
11
|
| 32 |
+
],
|
| 33 |
+
"sampling_rate": 16000,
|
| 34 |
+
"transformers_version": "4.57.3",
|
| 35 |
+
"upsample_initial_channel": 512,
|
| 36 |
+
"upsample_kernel_sizes": [
|
| 37 |
+
8,
|
| 38 |
+
8,
|
| 39 |
+
8,
|
| 40 |
+
8
|
| 41 |
+
],
|
| 42 |
+
"upsample_rates": [
|
| 43 |
+
4,
|
| 44 |
+
4,
|
| 45 |
+
4,
|
| 46 |
+
4
|
| 47 |
+
]
|
| 48 |
+
}
|
vocoder/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e1ae998705b24b74b79b55bd9015f458ee85dbc1e98448aa7f0e0e066eba1bd
|
| 3 |
+
size 50640724
|