ahnhs2k commited on
Commit
49fa254
·
verified ·
1 Parent(s): 9d587cf

Upload SpeechT5 Korean TTS artifacts

Browse files
README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Korean SpeechT5 (Jamo Tokenizer, KSS)
3
+ If you use this model in research or production,
4
+ please cite:
5
+
6
+ @misc{ahnhs2k_speecht5_korean,
7
+ author = {Ahn, Hosung},
8
+ title = {Korean SpeechT5 TTS Model},
9
+ year = {2025},
10
+ publisher = {Hugging Face},
11
+ url = {https://huggingface.co/ahnhs2k/speecht5-korean}
12
+ }
13
+
14
+ ## 모델 특징
15
+ - Base Model: microsoft/speecht5_tts
16
+ - Dataset: Bingsu/KSS_Dataset
17
+ - Tokenizer: Jamo-based Korean tokenizer (character-level)
18
+ - Speaker Embedding: microsoft/wavlm-base-plus-sv
19
+ - Vocoder: microsoft/speecht5_hifigan
20
+ - Sample Rate: 16 kHz
21
+ - 단일 화자 한국어 TTS 모델
22
+
23
+ ## 라이선스
24
+ This model is released under the **CC-BY-SA-4.0** License.
25
+
26
+ When using this model (including commercial usage), you must:
27
+ - Provide attribution: "Model fine-tuned by ahnhs2k (2025)"
28
+ - Include a link to this model page
29
+ - Distribute derivative models under the same license
config.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "SpeechT5ForTextToSpeech"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 0,
9
+ "conv_bias": false,
10
+ "conv_dim": [
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512
18
+ ],
19
+ "conv_kernel": [
20
+ 10,
21
+ 3,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 2,
26
+ 2
27
+ ],
28
+ "conv_stride": [
29
+ 5,
30
+ 2,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2
36
+ ],
37
+ "decoder_attention_heads": 12,
38
+ "decoder_ffn_dim": 3072,
39
+ "decoder_layerdrop": 0.1,
40
+ "decoder_layers": 6,
41
+ "decoder_start_token_id": 2,
42
+ "dtype": "float32",
43
+ "encoder_attention_heads": 12,
44
+ "encoder_ffn_dim": 3072,
45
+ "encoder_layerdrop": 0.1,
46
+ "encoder_layers": 12,
47
+ "encoder_max_relative_position": 160,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_norm": "group",
51
+ "feat_proj_dropout": 0.0,
52
+ "guided_attention_loss_num_heads": 2,
53
+ "guided_attention_loss_scale": 10.0,
54
+ "guided_attention_loss_sigma": 0.4,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.1,
57
+ "hidden_size": 768,
58
+ "initializer_range": 0.02,
59
+ "is_encoder_decoder": true,
60
+ "layer_norm_eps": 1e-05,
61
+ "mask_feature_length": 10,
62
+ "mask_feature_min_masks": 0,
63
+ "mask_feature_prob": 0.0,
64
+ "mask_time_length": 10,
65
+ "mask_time_min_masks": 2,
66
+ "mask_time_prob": 0.05,
67
+ "max_length": null,
68
+ "max_speech_positions": 1876,
69
+ "max_text_positions": 600,
70
+ "model_type": "speecht5",
71
+ "num_conv_pos_embedding_groups": 16,
72
+ "num_conv_pos_embeddings": 128,
73
+ "num_feat_extract_layers": 7,
74
+ "num_mel_bins": 80,
75
+ "pad_token_id": 1,
76
+ "positional_dropout": 0.1,
77
+ "reduction_factor": 2,
78
+ "scale_embedding": false,
79
+ "speaker_embedding_dim": 512,
80
+ "speaker_embedding_path": "speaker_embedding.pth",
81
+ "speech_decoder_postnet_dropout": 0.5,
82
+ "speech_decoder_postnet_kernel": 5,
83
+ "speech_decoder_postnet_layers": 5,
84
+ "speech_decoder_postnet_units": 256,
85
+ "speech_decoder_prenet_dropout": 0.5,
86
+ "speech_decoder_prenet_layers": 2,
87
+ "speech_decoder_prenet_units": 256,
88
+ "transformers_version": "4.57.3",
89
+ "use_cache": true,
90
+ "use_guided_attention_loss": true,
91
+ "use_guided_attn_loss": false,
92
+ "vocab_size": 71
93
+ }
demo_inference.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import soundfile as sf
3
+ from pathlib import Path
4
+ import unicodedata
5
+ from transformers import (
6
+ SpeechT5ForTextToSpeech,
7
+ SpeechT5Processor,
8
+ SpeechT5HifiGan,
9
+ PreTrainedTokenizerFast,
10
+ )
11
+
12
# Hugging Face Hub id of the fine-tuned Korean SpeechT5 checkpoint.
MODEL_ID = "ahnhs2k/speecht5-korean"
# Prefer CUDA when available; everything below is moved onto this device.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
+
15
+
16
def decompose_jamo(text):
    """Decompose precomposed Hangul syllables into conjoining jamo.

    Each Hangul syllable (U+AC00..U+D7A3) expands to its lead consonant,
    vowel, and — when present — tail consonant; every other character is
    passed through unchanged.

    Args:
        text: Input string, possibly mixing Hangul and other characters.

    Returns:
        A list of single-character strings (the jamo-level token sequence).
    """
    jamo = []
    for char in text:
        # unicodedata names precomposed syllables "HANGUL SYLLABLE ...".
        if "HANGUL SYLLABLE" in unicodedata.name(char, ""):
            offset = ord(char) - 0xAC00
            lead, rest = divmod(offset, 588)   # 588 = 21 vowels * 28 tails
            vowel, tail = divmod(rest, 28)
            jamo.append(chr(0x1100 + lead))
            jamo.append(chr(0x1161 + vowel))
            if tail:                            # tail index 0 means "no tail"
                jamo.append(chr(0x11A7 + tail))
        else:
            jamo.append(char)
    return jamo
30
+
31
+
32
def main():
    """Synthesize a Korean demo sentence and write it to a 16 kHz WAV file.

    Loads the fine-tuned SpeechT5 model, the custom jamo tokenizer, the
    local HiFi-GAN vocoder, and the pre-extracted speaker embedding, then
    generates speech for a fixed demo sentence.
    """
    model = SpeechT5ForTextToSpeech.from_pretrained(MODEL_ID).to(DEVICE).eval()

    # The processor is always loaded from the upstream checkpoint; only its
    # tokenizer is swapped out for the custom jamo tokenizer below.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

    # Load the custom jamo tokenizer and override the processor's tokenizer.
    tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_ID)
    processor.tokenizer = tokenizer

    vocoder = SpeechT5HifiGan.from_pretrained(Path(__file__).resolve().parent / "vocoder").to(DEVICE).eval()

    # Speaker embedding: load onto CPU first so the script also runs on
    # CPU-only machines even if the tensor was saved from a CUDA device;
    # weights_only=True avoids executing arbitrary pickle payloads.
    spk_path = Path(__file__).resolve().parent / "speaker_embedding.pth"
    spk_emb = torch.load(spk_path, map_location="cpu", weights_only=True).to(DEVICE)

    text = "안녕하세요. 자모 토크나이저 기반 한국어 TTS 데모입니다."
    jamo_seq = decompose_jamo(text)

    # The sequence is already split into jamo tokens, hence is_split_into_words.
    enc = tokenizer(jamo_seq, is_split_into_words=True, add_special_tokens=True, return_tensors="pt")
    enc = {k: v.to(DEVICE) for k, v in enc.items()}

    with torch.no_grad():
        gen = model.generate_speech(enc["input_ids"], speaker_embeddings=spk_emb.unsqueeze(0), vocoder=vocoder)

    # Model and vocoder operate at 16 kHz (see preprocessor_config.json).
    sf.write("demo_inference_output.wav", gen.cpu().numpy(), 16000)
    print("Saved demo_inference_output.wav")
59
+
60
+
61
# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "max_length": 1876,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.57.3"
9
+ }
jamo_vocab.txt ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <pad>
2
+ <unk>
3
+ <bos>
4
+ <eos>
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
jamo_vocab.txt:Zone.Identifier ADDED
Binary file (25 Bytes). View file
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95c8fe3a5eed5283f7cdfcae48f8758dae3d8bd62934ba327e268ee178506871
3
+ size 577758600
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": false,
3
+ "feature_extractor_type": "SpeechT5FeatureExtractor",
4
+ "feature_size": 1,
5
+ "fmax": 7600,
6
+ "fmin": 80,
7
+ "frame_signal_scale": 1.0,
8
+ "hop_length": 16,
9
+ "mel_floor": 1e-10,
10
+ "num_mel_bins": 80,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "processor_class": "SpeechT5Processor",
14
+ "reduction_factor": 2,
15
+ "return_attention_mask": true,
16
+ "sampling_rate": 16000,
17
+ "win_function": "hann_window",
18
+ "win_length": 64
19
+ }
speaker_embedding.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f25d78ffcb19e7fcde2e62b416a1f2f68801998eb714d25e7520eeec5fab4314
3
+ size 3695
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<bos>",
3
+ "eos_token": "<eos>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
speecht5-korean.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4708f3f325fb9aba87ff14e6251b7b678998207172c776c8857a283f5e4d49e0
3
+ size 577878135
speecht5-korean_epoch18_backup.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4708f3f325fb9aba87ff14e6251b7b678998207172c776c8857a283f5e4d49e0
3
+ size 577878135
tokenizer.json ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<pad>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<unk>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<bos>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<eos>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": null,
44
+ "pre_tokenizer": {
45
+ "type": "Whitespace"
46
+ },
47
+ "post_processor": null,
48
+ "decoder": null,
49
+ "model": {
50
+ "type": "WordLevel",
51
+ "vocab": {
52
+ "<pad>": 0,
53
+ "<unk>": 1,
54
+ "<bos>": 2,
55
+ "<eos>": 3,
56
+ "ᄀ": 4,
57
+ "ᄁ": 5,
58
+ "ᄂ": 6,
59
+ "ᄃ": 7,
60
+ "ᄄ": 8,
61
+ "ᄅ": 9,
62
+ "ᄆ": 10,
63
+ "ᄇ": 11,
64
+ "ᄈ": 12,
65
+ "ᄉ": 13,
66
+ "ᄊ": 14,
67
+ "ᄋ": 15,
68
+ "ᄌ": 16,
69
+ "ᄍ": 17,
70
+ "ᄎ": 18,
71
+ "ᄏ": 19,
72
+ "ᄐ": 20,
73
+ "ᄑ": 21,
74
+ "ᄒ": 22,
75
+ "ᅡ": 23,
76
+ "ᅢ": 24,
77
+ "ᅣ": 25,
78
+ "ᅤ": 26,
79
+ "ᅥ": 27,
80
+ "ᅦ": 28,
81
+ "ᅧ": 29,
82
+ "ᅨ": 30,
83
+ "ᅩ": 31,
84
+ "ᅪ": 32,
85
+ "ᅫ": 33,
86
+ "ᅬ": 34,
87
+ "ᅭ": 35,
88
+ "ᅮ": 36,
89
+ "ᅯ": 37,
90
+ "ᅰ": 38,
91
+ "ᅱ": 39,
92
+ "ᅲ": 40,
93
+ "ᅳ": 41,
94
+ "ᅴ": 42,
95
+ "ᅵ": 43,
96
+ "ᆨ": 44,
97
+ "ᆩ": 45,
98
+ "ᆪ": 46,
99
+ "ᆫ": 47,
100
+ "ᆬ": 48,
101
+ "ᆭ": 49,
102
+ "ᆮ": 50,
103
+ "ᆯ": 51,
104
+ "ᆰ": 52,
105
+ "ᆱ": 53,
106
+ "ᆲ": 54,
107
+ "ᆳ": 55,
108
+ "ᆴ": 56,
109
+ "ᆵ": 57,
110
+ "ᆶ": 58,
111
+ "ᆷ": 59,
112
+ "ᆸ": 60,
113
+ "ᆹ": 61,
114
+ "ᆺ": 62,
115
+ "ᆻ": 63,
116
+ "ᆼ": 64,
117
+ "ᆽ": 65,
118
+ "ᆾ": 66,
119
+ "ᆿ": 67,
120
+ "ᇀ": 68,
121
+ "ᇁ": 69,
122
+ "ᇂ": 70
123
+ },
124
+ "unk_token": "<unk>"
125
+ }
126
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<bos>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<eos>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<bos>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "<eos>",
39
+ "extra_special_tokens": {},
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "<pad>",
42
+ "tokenizer_class": "PreTrainedTokenizerFast",
43
+ "unk_token": "<unk>"
44
+ }
vocoder/config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SpeechT5HifiGan"
4
+ ],
5
+ "dtype": "float32",
6
+ "initializer_range": 0.01,
7
+ "leaky_relu_slope": 0.1,
8
+ "model_in_dim": 80,
9
+ "model_type": "hifigan",
10
+ "normalize_before": true,
11
+ "resblock_dilation_sizes": [
12
+ [
13
+ 1,
14
+ 3,
15
+ 5
16
+ ],
17
+ [
18
+ 1,
19
+ 3,
20
+ 5
21
+ ],
22
+ [
23
+ 1,
24
+ 3,
25
+ 5
26
+ ]
27
+ ],
28
+ "resblock_kernel_sizes": [
29
+ 3,
30
+ 7,
31
+ 11
32
+ ],
33
+ "sampling_rate": 16000,
34
+ "transformers_version": "4.57.3",
35
+ "upsample_initial_channel": 512,
36
+ "upsample_kernel_sizes": [
37
+ 8,
38
+ 8,
39
+ 8,
40
+ 8
41
+ ],
42
+ "upsample_rates": [
43
+ 4,
44
+ 4,
45
+ 4,
46
+ 4
47
+ ]
48
+ }
vocoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e1ae998705b24b74b79b55bd9015f458ee85dbc1e98448aa7f0e0e066eba1bd
3
+ size 50640724