ahnhs2k commited on
Commit
49fa254
·
verified ·
1 Parent(s): 9d587cf

Upload SpeechT5 Korean TTS artifacts

Browse files
README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Korean SpeechT5 (Jamo Tokenizer, KSS)
3
+ If you use this model in research or production,
4
+ please cite:
5
+
6
+ @misc{ahnhs2k_speecht5_korean,
7
+ author = {Ahn, Hosung},
8
+ title = {Korean SpeechT5 TTS Model},
9
+ year = {2025},
10
+ publisher = {Hugging Face},
11
+ url = {https://huggingface.co/ahnhs2k/speecht5-korean}
12
+ }
13
+
14
+ ## 모델 특징
15
+ - Base Model: microsoft/speecht5_tts
16
+ - Dataset: Bingsu/KSS_Dataset
17
+ - Tokenizer: Jamo-based Korean tokenizer (character-level)
18
+ - Speaker Embedding: microsoft/wavlm-base-plus-sv
19
+ - Vocoder: microsoft/speecht5_hifigan
20
+ - Sample Rate: 16 kHz
21
+ - 단일 화자 한국어 TTS 모델
22
+
23
+ ## 라이선스
24
+ This model is released under the **CC-BY-SA-4.0** License.
25
+
26
+ When using this model (including commercial usage), you must:
27
+ - Provide attribution: "Model fine-tuned by ahnhs2k (2025)"
28
+ - Include a link to this model page
29
+ - Distribute derivative models under the same license
config.json ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "SpeechT5ForTextToSpeech"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 0,
9
+ "conv_bias": false,
10
+ "conv_dim": [
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512
18
+ ],
19
+ "conv_kernel": [
20
+ 10,
21
+ 3,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 2,
26
+ 2
27
+ ],
28
+ "conv_stride": [
29
+ 5,
30
+ 2,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2
36
+ ],
37
+ "decoder_attention_heads": 12,
38
+ "decoder_ffn_dim": 3072,
39
+ "decoder_layerdrop": 0.1,
40
+ "decoder_layers": 6,
41
+ "decoder_start_token_id": 2,
42
+ "dtype": "float32",
43
+ "encoder_attention_heads": 12,
44
+ "encoder_ffn_dim": 3072,
45
+ "encoder_layerdrop": 0.1,
46
+ "encoder_layers": 12,
47
+ "encoder_max_relative_position": 160,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_norm": "group",
51
+ "feat_proj_dropout": 0.0,
52
+ "guided_attention_loss_num_heads": 2,
53
+ "guided_attention_loss_scale": 10.0,
54
+ "guided_attention_loss_sigma": 0.4,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.1,
57
+ "hidden_size": 768,
58
+ "initializer_range": 0.02,
59
+ "is_encoder_decoder": true,
60
+ "layer_norm_eps": 1e-05,
61
+ "mask_feature_length": 10,
62
+ "mask_feature_min_masks": 0,
63
+ "mask_feature_prob": 0.0,
64
+ "mask_time_length": 10,
65
+ "mask_time_min_masks": 2,
66
+ "mask_time_prob": 0.05,
67
+ "max_length": null,
68
+ "max_speech_positions": 1876,
69
+ "max_text_positions": 600,
70
+ "model_type": "speecht5",
71
+ "num_conv_pos_embedding_groups": 16,
72
+ "num_conv_pos_embeddings": 128,
73
+ "num_feat_extract_layers": 7,
74
+ "num_mel_bins": 80,
75
+ "pad_token_id": 1,
76
+ "positional_dropout": 0.1,
77
+ "reduction_factor": 2,
78
+ "scale_embedding": false,
79
+ "speaker_embedding_dim": 512,
80
+ "speaker_embedding_path": "speaker_embedding.pth",
81
+ "speech_decoder_postnet_dropout": 0.5,
82
+ "speech_decoder_postnet_kernel": 5,
83
+ "speech_decoder_postnet_layers": 5,
84
+ "speech_decoder_postnet_units": 256,
85
+ "speech_decoder_prenet_dropout": 0.5,
86
+ "speech_decoder_prenet_layers": 2,
87
+ "speech_decoder_prenet_units": 256,
88
+ "transformers_version": "4.57.3",
89
+ "use_cache": true,
90
+ "use_guided_attention_loss": true,
91
+ "use_guided_attn_loss": false,
92
+ "vocab_size": 71
93
+ }
demo_inference.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import soundfile as sf
3
+ from pathlib import Path
4
+ import unicodedata
5
+ from transformers import (
6
+ SpeechT5ForTextToSpeech,
7
+ SpeechT5Processor,
8
+ SpeechT5HifiGan,
9
+ PreTrainedTokenizerFast,
10
+ )
11
+
12
# Hugging Face Hub id of the fine-tuned Korean SpeechT5 checkpoint.
MODEL_ID = "ahnhs2k/speecht5-korean"
# Prefer CUDA when available; everything below is moved onto this device.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
+
15
+
16
def decompose_jamo(text):
    """Decompose precomposed Hangul syllables into conjoining jamo.

    Each Hangul syllable (U+AC00..U+D7A3) expands to its lead consonant,
    vowel, and — when present — tail consonant; every other character is
    passed through unchanged.

    Args:
        text: Input string, possibly mixing Hangul and other characters.

    Returns:
        A list of single-character strings (the jamo-level token sequence).
    """
    jamo = []
    for char in text:
        # unicodedata names precomposed syllables "HANGUL SYLLABLE ...".
        if "HANGUL SYLLABLE" in unicodedata.name(char, ""):
            offset = ord(char) - 0xAC00
            lead, rest = divmod(offset, 588)   # 588 = 21 vowels * 28 tails
            vowel, tail = divmod(rest, 28)
            jamo.append(chr(0x1100 + lead))
            jamo.append(chr(0x1161 + vowel))
            if tail:                            # tail index 0 means "no tail"
                jamo.append(chr(0x11A7 + tail))
        else:
            jamo.append(char)
    return jamo
30
+
31
+
32
def main():
    """Synthesize a Korean demo sentence and write it to a 16 kHz WAV file.

    Loads the fine-tuned SpeechT5 model, the custom jamo tokenizer, the
    local HiFi-GAN vocoder, and the pre-extracted speaker embedding, then
    generates speech for a fixed demo sentence.
    """
    model = SpeechT5ForTextToSpeech.from_pretrained(MODEL_ID).to(DEVICE).eval()

    # The processor is always loaded from the upstream checkpoint; only its
    # tokenizer is swapped out for the custom jamo tokenizer below.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

    # Load the custom jamo tokenizer and override the processor's tokenizer.
    tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_ID)
    processor.tokenizer = tokenizer

    vocoder = SpeechT5HifiGan.from_pretrained(Path(__file__).resolve().parent / "vocoder").to(DEVICE).eval()

    # Speaker embedding: load onto CPU first so the script also runs on
    # CPU-only machines even if the tensor was saved from a CUDA device;
    # weights_only=True avoids executing arbitrary pickle payloads.
    spk_path = Path(__file__).resolve().parent / "speaker_embedding.pth"
    spk_emb = torch.load(spk_path, map_location="cpu", weights_only=True).to(DEVICE)

    text = "안녕하세요. 자모 토크나이저 기반 한국어 TTS 데모입니다."
    jamo_seq = decompose_jamo(text)

    # The sequence is already split into jamo tokens, hence is_split_into_words.
    enc = tokenizer(jamo_seq, is_split_into_words=True, add_special_tokens=True, return_tensors="pt")
    enc = {k: v.to(DEVICE) for k, v in enc.items()}

    with torch.no_grad():
        gen = model.generate_speech(enc["input_ids"], speaker_embeddings=spk_emb.unsqueeze(0), vocoder=vocoder)

    # Model and vocoder operate at 16 kHz (see preprocessor_config.json).
    sf.write("demo_inference_output.wav", gen.cpu().numpy(), 16000)
    print("Saved demo_inference_output.wav")
59
+
60
+
61
# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "max_length": 1876,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.57.3"
9
+ }
jamo_vocab.txt ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <pad>
2
+ <unk>
3
+ <bos>
4
+ <eos>
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
jamo_vocab.txt:Zone.Identifier ADDED
Binary file (25 Bytes). View file
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95c8fe3a5eed5283f7cdfcae48f8758dae3d8bd62934ba327e268ee178506871
3
+ size 577758600
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": false,
3
+ "feature_extractor_type": "SpeechT5FeatureExtractor",
4
+ "feature_size": 1,
5
+ "fmax": 7600,
6
+ "fmin": 80,
7
+ "frame_signal_scale": 1.0,
8
+ "hop_length": 16,
9
+ "mel_floor": 1e-10,
10
+ "num_mel_bins": 80,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "processor_class": "SpeechT5Processor",
14
+ "reduction_factor": 2,
15
+ "return_attention_mask": true,
16
+ "sampling_rate": 16000,
17
+ "win_function": "hann_window",
18
+ "win_length": 64
19
+ }
speaker_embedding.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f25d78ffcb19e7fcde2e62b416a1f2f68801998eb714d25e7520eeec5fab4314
3
+ size 3695
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<bos>",
3
+ "eos_token": "<eos>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
speecht5-korean.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4708f3f325fb9aba87ff14e6251b7b678998207172c776c8857a283f5e4d49e0
3
+ size 577878135
speecht5-korean_epoch18_backup.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4708f3f325fb9aba87ff14e6251b7b678998207172c776c8857a283f5e4d49e0
3
+ size 577878135
tokenizer.json ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<pad>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<unk>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<bos>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<eos>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": null,
44
+ "pre_tokenizer": {
45
+ "type": "Whitespace"
46
+ },
47
+ "post_processor": null,
48
+ "decoder": null,
49
+ "model": {
50
+ "type": "WordLevel",
51
+ "vocab": {
52
+ "<pad>": 0,
53
+ "<unk>": 1,
54
+ "<bos>": 2,
55
+ "<eos>": 3,
56
+ "ᄀ": 4,
57
+ "ᄁ": 5,
58
+ "ᄂ": 6,
59
+ "ᄃ": 7,
60
+ "ᄄ": 8,
61
+ "ᄅ": 9,
62
+ "ᄆ": 10,
63
+ "ᄇ": 11,
64
+ "ᄈ": 12,
65
+ "ᄉ": 13,
66
+ "ᄊ": 14,
67
+ "ᄋ": 15,
68
+ "ᄌ": 16,
69
+ "ᄍ": 17,
70
+ "ᄎ": 18,
71
+ "ᄏ": 19,
72
+ "ᄐ": 20,
73
+ "ᄑ": 21,
74
+ "ᄒ": 22,
75
+ "ᅡ": 23,
76
+ "ᅢ": 24,
77
+ "ᅣ": 25,
78
+ "ᅤ": 26,
79
+ "ᅥ": 27,
80
+ "ᅦ": 28,
81
+ "ᅧ": 29,
82
+ "ᅨ": 30,
83
+ "ᅩ": 31,
84
+ "ᅪ": 32,
85
+ "ᅫ": 33,
86
+ "ᅬ": 34,
87
+ "ᅭ": 35,
88
+ "ᅮ": 36,
89
+ "ᅯ": 37,
90
+ "ᅰ": 38,
91
+ "ᅱ": 39,
92
+ "ᅲ": 40,
93
+ "ᅳ": 41,
94
+ "ᅴ": 42,
95
+ "ᅵ": 43,
96
+ "ᆨ": 44,
97
+ "ᆩ": 45,
98
+ "ᆪ": 46,
99
+ "ᆫ": 47,
100
+ "ᆬ": 48,
101
+ "ᆭ": 49,
102
+ "ᆮ": 50,
103
+ "ᆯ": 51,
104
+ "ᆰ": 52,
105
+ "ᆱ": 53,
106
+ "ᆲ": 54,
107
+ "ᆳ": 55,
108
+ "ᆴ": 56,
109
+ "ᆵ": 57,
110
+ "ᆶ": 58,
111
+ "ᆷ": 59,
112
+ "ᆸ": 60,
113
+ "ᆹ": 61,
114
+ "ᆺ": 62,
115
+ "ᆻ": 63,
116
+ "ᆼ": 64,
117
+ "ᆽ": 65,
118
+ "ᆾ": 66,
119
+ "ᆿ": 67,
120
+ "ᇀ": 68,
121
+ "ᇁ": 69,
122
+ "ᇂ": 70
123
+ },
124
+ "unk_token": "<unk>"
125
+ }
126
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<bos>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<eos>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<bos>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "<eos>",
39
+ "extra_special_tokens": {},
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "<pad>",
42
+ "tokenizer_class": "PreTrainedTokenizerFast",
43
+ "unk_token": "<unk>"
44
+ }
vocoder/config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SpeechT5HifiGan"
4
+ ],
5
+ "dtype": "float32",
6
+ "initializer_range": 0.01,
7
+ "leaky_relu_slope": 0.1,
8
+ "model_in_dim": 80,
9
+ "model_type": "hifigan",
10
+ "normalize_before": true,
11
+ "resblock_dilation_sizes": [
12
+ [
13
+ 1,
14
+ 3,
15
+ 5
16
+ ],
17
+ [
18
+ 1,
19
+ 3,
20
+ 5
21
+ ],
22
+ [
23
+ 1,
24
+ 3,
25
+ 5
26
+ ]
27
+ ],
28
+ "resblock_kernel_sizes": [
29
+ 3,
30
+ 7,
31
+ 11
32
+ ],
33
+ "sampling_rate": 16000,
34
+ "transformers_version": "4.57.3",
35
+ "upsample_initial_channel": 512,
36
+ "upsample_kernel_sizes": [
37
+ 8,
38
+ 8,
39
+ 8,
40
+ 8
41
+ ],
42
+ "upsample_rates": [
43
+ 4,
44
+ 4,
45
+ 4,
46
+ 4
47
+ ]
48
+ }
vocoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e1ae998705b24b74b79b55bd9015f458ee85dbc1e98448aa7f0e0e066eba1bd
3
+ size 50640724