Plana-Archive committed on
Commit
c999898
·
verified ·
1 Parent(s): 957c16c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -108
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import json
2
  import os
3
  import re
4
-
5
  import librosa
6
  import numpy as np
7
  import torch
@@ -12,9 +11,15 @@ import gradio as gr
12
  from models import SynthesizerTrn
13
  from text import text_to_sequence, _clean_text
14
  from mel_processing import spectrogram_torch
 
15
 
16
- limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
 
 
17
 
 
 
 
18
 
19
  def get_text(text, hps, is_phoneme):
20
  text_norm = text_to_sequence(text, hps.symbols, [] if is_phoneme else hps.data.text_cleaners)
@@ -23,7 +28,6 @@ def get_text(text, hps, is_phoneme):
23
  text_norm = LongTensor(text_norm)
24
  return text_norm
25
 
26
-
27
  def create_tts_fn(model, hps, speaker_ids):
28
  def tts_fn(text, speaker, speed, is_phoneme):
29
  if limitation:
@@ -31,143 +35,83 @@ def create_tts_fn(model, hps, speaker_ids):
31
  max_len = 500
32
  if is_phoneme:
33
  max_len *= 3
34
- else:
35
- if len(hps.data.text_cleaners) > 0 and hps.data.text_cleaners[0] == "zh_ja_mixture_cleaners":
36
- text_len = len(re.sub("(\[ZH\]|\[JA\])", "", text))
37
  if text_len > max_len:
38
  return "Error: Text is too long", None
39
 
40
  speaker_id = speaker_ids[speaker]
41
  stn_tst = get_text(text, hps, is_phoneme)
42
  with no_grad():
43
- x_tst = stn_tst.cuda().unsqueeze(0)
44
- x_tst_lengths = LongTensor([stn_tst.size(0)]).cuda()
45
- sid = LongTensor([speaker_id]).cuda()
46
- audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
47
- length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
48
  del stn_tst, x_tst, x_tst_lengths, sid
49
  return "Success", (hps.data.sampling_rate, audio)
50
-
51
  return tts_fn
52
 
53
-
54
-
55
-
56
-
57
  def create_to_phoneme_fn(hps):
58
  def to_phoneme_fn(text):
59
  return _clean_text(text, hps.data.text_cleaners) if text != "" else ""
60
-
61
  return to_phoneme_fn
62
 
63
-
64
  css = """
65
- #advanced-btn {
66
- color: white;
67
- border-color: black;
68
- background: black;
69
- font-size: .7rem !important;
70
- line-height: 19px;
71
- margin-top: 24px;
72
- margin-bottom: 12px;
73
- padding: 2px 8px;
74
- border-radius: 14px !important;
75
- }
76
- #advanced-options {
77
- display: none;
78
- margin-bottom: 20px;
79
- }
80
  """
81
 
82
  if __name__ == '__main__':
83
- models_tts = []
84
- models_vc = []
85
- models_soft_vc = []
86
- # {"title": "ハミダシクリエイティブ", "lang": "日本語 (Japanese)", "example": "こんにちは。", "type": "vits"}
87
- name = 'プロセカ TTS'
88
- lang = '日本語 (Japanese)'
89
- example = 'こんにちは。'
90
- config_path = f"saved_model/config.json"
91
- model_path = f"saved_model/model.pth"
92
- cover_path = f"saved_model/cover.png"
93
  hps = utils.get_hparams_from_file(config_path)
94
  model = SynthesizerTrn(
95
  len(hps.symbols),
96
  hps.data.filter_length // 2 + 1,
97
  hps.train.segment_size // hps.data.hop_length,
98
  n_speakers=hps.data.n_speakers,
99
- **hps.model).cuda()
 
100
  utils.load_checkpoint(model_path, model, None)
101
  model.eval()
 
102
  speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
103
  speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
104
 
105
- t = 'vits'
106
- models_tts.append((name, cover_path, speakers, lang, example,
107
- hps.symbols, create_tts_fn(model, hps, speaker_ids),
108
- create_to_phoneme_fn(hps)))
109
-
110
 
111
  app = gr.Blocks(css=css)
112
-
113
  with app:
114
- gr.Markdown("# Project Sekai TTS Using VITS Model\n\n"
115
- "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=kdrkdrkdr.ProsekaTTS)\n\n")
116
  with gr.Tabs():
117
  with gr.TabItem("TTS"):
118
- with gr.Tabs():
119
- for i, (name, cover_path, speakers, lang, example, symbols, tts_fn,
120
- to_phoneme_fn) in enumerate(models_tts):
121
- with gr.TabItem(f"Proseka"):
122
- with gr.Column():
123
- gr.Markdown(f"## {name}\n\n"
124
- f"![cover](file/{cover_path})\n\n"
125
- f"lang: {lang}")
126
- tts_input1 = gr.TextArea(label="Text (500 words limitation)", value=example,
127
- elem_id=f"tts-input{i}")
128
- tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
129
- type="index", value=speakers[0])
130
- tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.1, maximum=2, step=0.1)
131
- with gr.Accordion(label="Advanced Options", open=False):
132
- phoneme_input = gr.Checkbox(value=False, label="Phoneme input")
133
- to_phoneme_btn = gr.Button("Covert text to phoneme")
134
- phoneme_list = gr.Dataset(label="Phoneme list", components=[tts_input1],
135
- samples=[[x] for x in symbols],
136
- elem_id=f"phoneme-list{i}")
137
- phoneme_list_json = gr.Json(value=symbols, visible=False)
138
- tts_submit = gr.Button("Generate", variant="primary")
139
- tts_output1 = gr.Textbox(label="Output Message")
140
- tts_output2 = gr.Audio(label="Output Audio")
141
- tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, phoneme_input],
142
- [tts_output1, tts_output2])
143
- to_phoneme_btn.click(to_phoneme_fn, [tts_input1], [tts_input1])
144
- phoneme_list.click(None, [phoneme_list, phoneme_list_json], [],
145
- _js=f"""
146
- (i,phonemes) => {{
147
- let root = document.querySelector("body > gradio-app");
148
- if (root.shadowRoot != null)
149
- root = root.shadowRoot;
150
- let text_input = root.querySelector("#tts-input{i}").querySelector("textarea");
151
- let startPos = text_input.selectionStart;
152
- let endPos = text_input.selectionEnd;
153
- let oldTxt = text_input.value;
154
- let result = oldTxt.substring(0, startPos) + phonemes[i] + oldTxt.substring(endPos);
155
- text_input.value = result;
156
- let x = window.scrollX, y = window.scrollY;
157
- text_input.focus();
158
- text_input.selectionStart = startPos + phonemes[i].length;
159
- text_input.selectionEnd = startPos + phonemes[i].length;
160
- text_input.blur();
161
- window.scrollTo(x, y);
162
- return [];
163
- }}""")
164
-
165
-
166
- gr.Markdown(
167
- "Official User Page \n\n"
168
- "- [https://github.com/kdrkdrkdr/ProsekaTTS](https://github.com/kdrkdrkdr/ProsekaTTS)\n\n"
169
- "Reference \n\n"
170
- "- [https://huggingface.co/spaces/skytnt/moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)"
171
-
172
- )
173
- app.queue(concurrency_count=3).launch(show_api=False)
 
1
  import json
2
  import os
3
  import re
 
4
  import librosa
5
  import numpy as np
6
  import torch
 
11
  from models import SynthesizerTrn
12
  from text import text_to_sequence, _clean_text
13
  from mel_processing import spectrogram_torch
14
+ from huggingface_hub import hf_hub_download
15
 
16
# Source repository for the model assets on the Hugging Face Hub.
REPO_ID = "Plana-Archive/Plana-TTS"
SUBFOLDER = "Prosekai-TTS/saved_model"

# Force CPU execution — free Hugging Face Spaces provide no GPU.
device = torch.device("cpu")
# Enforce input-length limits only when running inside a HF Space.
limitation = os.getenv("SYSTEM") == "spaces"
23
 
24
  def get_text(text, hps, is_phoneme):
25
  text_norm = text_to_sequence(text, hps.symbols, [] if is_phoneme else hps.data.text_cleaners)
 
28
  text_norm = LongTensor(text_norm)
29
  return text_norm
30
 
 
31
  def create_tts_fn(model, hps, speaker_ids):
32
  def tts_fn(text, speaker, speed, is_phoneme):
33
  if limitation:
 
35
  max_len = 500
36
  if is_phoneme:
37
  max_len *= 3
 
 
 
38
  if text_len > max_len:
39
  return "Error: Text is too long", None
40
 
41
  speaker_id = speaker_ids[speaker]
42
  stn_tst = get_text(text, hps, is_phoneme)
43
  with no_grad():
44
+ x_tst = stn_tst.unsqueeze(0).to(device)
45
+ x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
46
+ sid = LongTensor([speaker_id]).to(device)
47
+ audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667,
48
+ noise_scale_w=0.8, length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
49
  del stn_tst, x_tst, x_tst_lengths, sid
50
  return "Success", (hps.data.sampling_rate, audio)
 
51
  return tts_fn
52
 
 
 
 
 
53
def create_to_phoneme_fn(hps):
    """Build a closure that turns raw text into its phoneme representation.

    The returned function runs the configured text cleaners; empty input
    short-circuits to an empty string without invoking the cleaners.
    """
    def to_phoneme_fn(text):
        # Guard clause: nothing to clean for empty input.
        if text == "":
            return ""
        return _clean_text(text, hps.data.text_cleaners)
    return to_phoneme_fn
57
 
 
58
# CSS overrides injected into the Gradio Blocks app: styles the
# "Advanced Options" toggle button and hides the options panel by default.
# The string content is consumed verbatim by gr.Blocks(css=...) and must
# not be reformatted.
css = """
#advanced-btn { color: white; border-color: black; background: black; font-size: .7rem !important; border-radius: 14px !important; }
#advanced-options { display: none; margin-bottom: 20px; }
"""
62
 
63
if __name__ == '__main__':
    # Fetch model assets from the Hub; hf_hub_download caches locally,
    # so repeated launches reuse the downloaded files.
    print("[*] Downloading model assets from Hub...")
    config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json", subfolder=SUBFOLDER)
    model_path = hf_hub_download(repo_id=REPO_ID, filename="model.pth", subfolder=SUBFOLDER)
    cover_path = hf_hub_download(repo_id=REPO_ID, filename="cover.png", subfolder=SUBFOLDER)

    # Build the synthesizer from the downloaded hyper-parameters and weights.
    hps = utils.get_hparams_from_file(config_path)
    model = SynthesizerTrn(
        len(hps.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)

    utils.load_checkpoint(model_path, model, None)
    model.eval()

    # Keep only usable speakers; entries named "None" are placeholders.
    speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
    speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]

    # Single model entry: (title, cover, speakers, lang, example, symbols, tts_fn, phoneme_fn).
    model_entries = [('プロセカ TTS', cover_path, speakers, '日本語 (Japanese)', 'こんにちは。',
                      hps.symbols, create_tts_fn(model, hps, speaker_ids),
                      create_to_phoneme_fn(hps))]

    app = gr.Blocks(css=css)
    with app:
        gr.Markdown("# Project Sekai TTS Using VITS Model (CPU Mode)\n\n")
        with gr.Tabs():
            with gr.TabItem("TTS"):
                for idx, (title, cover, spks, lang, example_text, syms,
                          tts_fn, to_phoneme_fn) in enumerate(model_entries):
                    with gr.TabItem("Proseka"):
                        with gr.Column():
                            gr.Markdown(f"## {title}\n\n![cover](file/{cover})\n\nlang: {lang}")
                            tts_input1 = gr.TextArea(label="Text", value=example_text, elem_id=f"tts-input{idx}")
                            tts_input2 = gr.Dropdown(label="Speaker", choices=spks, type="index", value=spks[0])
                            tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.1, maximum=2, step=0.1)

                            with gr.Accordion(label="Advanced Options", open=False):
                                phoneme_input = gr.Checkbox(value=False, label="Phoneme input")
                                to_phoneme_btn = gr.Button("Convert text to phoneme")
                                phoneme_list = gr.Dataset(label="Phoneme list", components=[tts_input1],
                                                          samples=[[x] for x in syms])
                                phoneme_list_json = gr.Json(value=syms, visible=False)

                            tts_submit = gr.Button("Generate", variant="primary")
                            tts_output1 = gr.Textbox(label="Output Message")
                            tts_output2 = gr.Audio(label="Output Audio")

                            tts_submit.click(tts_fn,
                                             [tts_input1, tts_input2, tts_input3, phoneme_input],
                                             [tts_output1, tts_output2])
                            to_phoneme_btn.click(to_phoneme_fn, [tts_input1], [tts_input1])

        # NOTE(review): the rendered source lost indentation; this footer is
        # assumed to sit inside `with app:` as in the previous revision — confirm.
        gr.Markdown("Official User Page: [Plana-Archive](https://huggingface.co/Plana-Archive)")

    app.queue().launch(show_api=False)