elielsilva committed on
Commit
2f093db
·
verified ·
1 Parent(s): e7ee8d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -175
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import re
3
  import uuid
 
4
  import numpy as np
5
  import wave
6
  import gradio as gr
@@ -10,7 +11,7 @@ from deep_translator import GoogleTranslator
10
  from pydub import AudioSegment
11
  from pydub.silence import split_on_silence
12
 
13
- # --- Configurações Iniciais ---
14
 
15
  language_map_local = {
16
  "Brazilian Portuguese": "pt",
@@ -24,7 +25,6 @@ language_map_local = {
24
  "Mandarin Chinese": "zh-CN"
25
  }
26
 
27
- # Mapeamento do Idioma para o Prefixo da Voz (ex: Brazilian Portuguese -> 'p')
28
  language_map = {
29
  "Brazilian Portuguese": "p",
30
  "American English": "a",
@@ -38,232 +38,176 @@ language_map = {
38
  }
39
 
40
  last_used_language = "p"
41
- pipeline = None
42
-
43
- # Lista global para armazenar todas as vozes carregadas
44
  ALL_VOICES = []
45
 
46
- # --- Funções Auxiliares de Tradução e Texto ---
47
 
48
  def bulk_translate(text, target_language, chunk_size=500, MAX_ALLOWED_CHARACTERS=10000):
49
  if len(text) >= MAX_ALLOWED_CHARACTERS:
50
- gr.Warning("[WARNING] Text too long skipping translation.")
51
  return text
52
-
53
  lang_code = language_map_local.get(target_language)
54
  if not lang_code:
55
  return text
56
 
57
  sentences = re.split(r'(?<=[.!?])\s+', text)
58
- chunks = []
59
- current_chunk = ""
60
 
61
- for sentence in sentences:
62
- if len(current_chunk) + len(sentence) <= chunk_size:
63
- current_chunk += " " + sentence
64
  else:
65
- chunks.append(current_chunk.strip())
66
- current_chunk = sentence
67
 
68
- if current_chunk:
69
- chunks.append(current_chunk.strip())
70
 
71
  try:
72
- translated_chunks = [GoogleTranslator(target=lang_code).translate(chunk) for chunk in chunks]
73
- result = " ".join(translated_chunks)
74
- return result.strip()
75
  except Exception as e:
76
- gr.Warning(f"Translation failed: {e}")
77
  return text
78
 
79
  def clean_text(text):
80
- replacements = {
81
- "–": " ", "-": " ", "**": " ", "*": " ", "#": " "
82
- }
83
- for old, new in replacements.items():
84
- text = text.replace(old, new)
85
-
86
- emoji_pattern = re.compile(r'[^\w\s,.:;?!@\'"()-]', flags=re.UNICODE)
87
- text = emoji_pattern.sub(r'', text)
88
- text = re.sub(r'\s+', ' ', text).strip()
89
- return text
90
 
91
- # --- Gerenciamento de Arquivos e Pipeline ---
92
 
93
  def create_audio_dir():
94
- root_dir = os.getcwd()
95
- audio_dir = os.path.join(root_dir, "kokoro_audio")
96
- os.makedirs(audio_dir, exist_ok=True)
97
- return audio_dir
98
 
99
  temp_folder = create_audio_dir()
100
 
101
- def update_pipeline(Language):
102
  global pipeline, last_used_language
103
- new_lang = language_map.get(Language, "p")
104
-
105
- if new_lang != last_used_language or pipeline is None:
106
- try:
107
- pipeline = KPipeline(lang_code=new_lang)
108
- last_used_language = new_lang
109
- except Exception as e:
110
- gr.Warning(f"Error loading {Language}. Fallback to English.")
111
- pipeline = KPipeline(lang_code="a")
112
- last_used_language = "a"
113
 
114
  def get_voice_names(repo_id):
115
- """Obtém todas as vozes disponíveis."""
116
  try:
117
- return [os.path.splitext(file.replace("voices/", ""))[0] for file in list_repo_files(repo_id) if file.startswith("voices/")]
 
 
 
 
118
  except:
119
- # Fallback manual com algumas vozes conhecidas
120
- return ["pf_dora", "pm_alex","pm_santa", "af_bella", "af_sarah", "bf_isabella", "ff_siwis", "ef_dora", "jf_nezumi", "zf_xiaoni"]
121
 
122
  def filter_voices_by_language(language):
123
- """Filtra a lista global ALL_VOICES baseada no prefixo do idioma selecionado."""
124
- prefix = language_map.get(language, "a") # padrão 'a' se falhar
125
-
126
- # Filtra vozes que começam com o prefixo (ex: 'p' para 'pf_dora')
127
  filtered = [v for v in ALL_VOICES if v.startswith(prefix)]
128
-
129
- if not filtered:
130
- return gr.Dropdown(choices=ALL_VOICES, value=ALL_VOICES[0])
131
-
132
  return gr.Dropdown(choices=filtered, value=filtered[0])
133
 
134
  def tts_file_name(text, language):
135
- global temp_folder
136
- clean_t = re.sub(r'[^a-zA-Z\s]', '', text).lower().strip().replace(" ", "_")
137
- lang_clean = language.replace(" ", "_").strip()
138
- truncated_text = clean_t[:20] if len(clean_t) > 0 else lang_clean
139
- random_string = uuid.uuid4().hex[:8].upper()
140
- return f"{temp_folder}/{truncated_text}_{random_string}.wav"
141
-
142
- # --- Processamento de Áudio ---
143
-
144
- def remove_silence_function(file_path, minimum_silence=50):
145
- output_path = file_path.replace(".wav", "_no_silence.wav")
146
- sound = AudioSegment.from_file(file_path, format="wav")
147
- audio_chunks = split_on_silence(sound, min_silence_len=100, silence_thresh=-45, keep_silence=minimum_silence)
148
-
149
- combined = AudioSegment.empty()
150
- for chunk in audio_chunks:
151
- combined += chunk
152
- combined.export(output_path, format="wav")
153
- return output_path
154
-
155
- def generate_and_save_audio(text, Language, voice, speed, remove_silence, keep_silence_up_to):
 
 
 
 
 
 
 
 
 
 
 
 
156
  text = clean_text(text)
157
- update_pipeline(Language)
158
-
159
- # Gerar áudio
160
- generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
161
- save_path = tts_file_name(text, Language)
162
-
163
- with wave.open(save_path, 'wb') as wav_file:
164
- wav_file.setnchannels(1)
165
- wav_file.setsampwidth(2)
166
- wav_file.setframerate(24000)
167
-
168
- for i, result in enumerate(generator):
169
- audio = result.audio
170
- audio_np = audio.numpy()
171
- audio_int16 = (audio_np * 32767).astype(np.int16)
172
- wav_file.writeframes(audio_int16.tobytes())
173
 
174
  if remove_silence:
175
- keep_silence = int(keep_silence_up_to * 1000)
176
- new_wave_file = remove_silence_function(save_path, minimum_silence=keep_silence)
177
- return new_wave_file
178
-
179
- return save_path
180
 
181
- # --- API Principal para a UI ---
 
182
 
183
- def KOKORO_TTS_API(text, Language, voice, speed, translate_text, remove_silence):
184
- if not Language: Language = "Brazilian Portuguese"
185
- if not voice: voice = "pf_dora"
186
 
187
- if translate_text:
188
- text = bulk_translate(text, Language, chunk_size=500)
189
-
190
- save_path = generate_and_save_audio(
191
- text=text, Language=Language, voice=voice, speed=speed,
192
- remove_silence=remove_silence, keep_silence_up_to=0.05
193
- )
194
-
195
- return save_path, save_path
196
 
197
- # --- Interface Gradio ---
 
 
 
 
198
 
199
- def toggle_autoplay(autoplay):
200
- return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
201
 
202
  def ui():
203
  global ALL_VOICES
204
- lang_list = list(language_map.keys())
205
-
206
- # Carrega todas as vozes uma única vez
207
  ALL_VOICES = get_voice_names("hexgrad/Kokoro-82M")
208
-
209
- # Define valores iniciais para PT-BR
210
- initial_lang = "Brazilian Portuguese"
211
- initial_voices = [v for v in ALL_VOICES if v.startswith(language_map[initial_lang])]
212
- initial_voice_value = "pf_dora" if "pf_dora" in initial_voices else (initial_voices[0] if initial_voices else ALL_VOICES[0])
213
-
214
- dummy_examples = [
215
- ["Olá! Hoje é um ótimo dia para estudar e aprender coisas novas.", "Brazilian Portuguese", "pf_dora"],
216
- ["Olá! Hoje é um ótimo dia para estudar e aprender coisas novas.", "Brazilian Portuguese", "pm_alex"],
217
- ["Olá! Hoje é um ótimo dia para estudar e aprender coisas novas.", "Brazilian Portuguese", "pm_santa"],
218
- ]
219
 
220
  with gr.Blocks(title="Kokoro TTS") as demo:
221
- gr.Markdown("## Kokoro TTS (Audio Only)")
222
-
223
- with gr.Row():
224
- with gr.Column():
225
- text = gr.Textbox(label='📝 Texto de Entrada', lines=3, placeholder="Digite seu texto aqui...")
226
-
227
- with gr.Row():
228
- language_name = gr.Dropdown(lang_list, label="🌍 Selecionar Idioma", value=initial_lang)
229
-
230
- with gr.Row():
231
- # Começa preenchido apenas com vozes em Português
232
- voice_name = gr.Dropdown(initial_voices, label="🎙️ Escolher Voz", value=initial_voice_value)
233
-
234
- with gr.Row():
235
- generate_btn = gr.Button('🚀 Gerar Áudio', variant='primary')
236
-
237
- with gr.Accordion('🎛️ Configurações de Áudio', open=False):
238
- speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Velocidade')
239
- translate_text = gr.Checkbox(value=False, label='🌐 Traduzir texto para o idioma selecionado')
240
- remove_silence = gr.Checkbox(value=False, label='✂️ Remover Silêncio')
241
-
242
- with gr.Column():
243
- audio = gr.Audio(interactive=False, label='🔊 Áudio Gerado', autoplay=True)
244
- audio_file = gr.File(label='📥 Baixar Áudio')
245
-
246
- with gr.Row():
247
- autoplay = gr.Checkbox(value=True, label='▶️ Autoplay')
248
- autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
249
-
250
- # --- EVENTOS ---
251
- # Quando mudar o idioma, atualiza a lista de vozes
252
- language_name.change(filter_voices_by_language, inputs=[language_name], outputs=[voice_name])
253
-
254
- inputs = [text, language_name, voice_name, speed, translate_text, remove_silence]
255
- outputs = [audio, audio_file]
256
-
257
- text.submit(KOKORO_TTS_API, inputs=inputs, outputs=outputs)
258
- generate_btn.click(KOKORO_TTS_API, inputs=inputs, outputs=outputs)
259
-
260
- gr.Examples(examples=dummy_examples, inputs=[text, language_name, voice_name])
261
 
262
  return demo
263
 
 
 
264
  if __name__ == "__main__":
265
- print("Inicializando pipeline em Português...")
266
  update_pipeline("Brazilian Portuguese")
267
-
268
- demo = ui()
269
- demo.queue().launch(show_api=False)
 
1
  import os
2
  import re
3
  import uuid
4
+ import subprocess
5
  import numpy as np
6
  import wave
7
  import gradio as gr
 
11
  from pydub import AudioSegment
12
  from pydub.silence import split_on_silence
13
 
14
+ # ================= CONFIGURAÇÕES =================
15
 
16
  language_map_local = {
17
  "Brazilian Portuguese": "pt",
 
25
  "Mandarin Chinese": "zh-CN"
26
  }
27
 
 
28
  language_map = {
29
  "Brazilian Portuguese": "p",
30
  "American English": "a",
 
38
  }
39
 
40
  last_used_language = "p"
41
+ pipeline = None
 
 
42
  ALL_VOICES = []
43
 
44
+ # ================= TEXTO =================
45
 
46
  def bulk_translate(text, target_language, chunk_size=500, MAX_ALLOWED_CHARACTERS=10000):
47
  if len(text) >= MAX_ALLOWED_CHARACTERS:
48
+ gr.Warning("Texto muito longotradução ignorada.")
49
  return text
50
+
51
  lang_code = language_map_local.get(target_language)
52
  if not lang_code:
53
  return text
54
 
55
  sentences = re.split(r'(?<=[.!?])\s+', text)
56
+ chunks, current = [], ""
 
57
 
58
+ for s in sentences:
59
+ if len(current) + len(s) <= chunk_size:
60
+ current += " " + s
61
  else:
62
+ chunks.append(current.strip())
63
+ current = s
64
 
65
+ if current:
66
+ chunks.append(current.strip())
67
 
68
  try:
69
+ translated = [GoogleTranslator(target=lang_code).translate(c) for c in chunks]
70
+ return " ".join(translated)
 
71
  except Exception as e:
72
+ gr.Warning(f"Erro na tradução: {e}")
73
  return text
74
 
75
  def clean_text(text):
76
+ text = re.sub(r'[–\-*#]', ' ', text)
77
+ text = re.sub(r'[^\w\s,.:;?!@\'"()-]', '', text)
78
+ return re.sub(r'\s+', ' ', text).strip()
 
 
 
 
 
 
 
79
 
80
+ # ================= PIPELINE =================
81
 
82
  def create_audio_dir():
83
+ path = os.path.join(os.getcwd(), "kokoro_audio")
84
+ os.makedirs(path, exist_ok=True)
85
+ return path
 
86
 
87
  temp_folder = create_audio_dir()
88
 
89
def update_pipeline(language):
    """Make the global ``pipeline`` match ``language``.

    ``language`` is looked up in ``language_map`` (unknown names map to
    ``"p"``). A new ``KPipeline`` is built only when none exists yet or the
    resolved code differs from ``last_used_language``; both globals are then
    updated.
    """
    global pipeline, last_used_language
    lang = language_map.get(language, "p")

    # Nothing to do when a pipeline for this language code is already loaded.
    if pipeline is not None and lang == last_used_language:
        return

    pipeline = KPipeline(lang_code=lang)
    last_used_language = lang
 
 
 
 
 
 
95
 
96
  def get_voice_names(repo_id):
 
97
  try:
98
+ return [
99
+ os.path.splitext(f.replace("voices/", ""))[0]
100
+ for f in list_repo_files(repo_id)
101
+ if f.startswith("voices/")
102
+ ]
103
  except:
104
+ return ["pf_dora", "pm_alex", "pm_santa"]
 
105
 
106
  def filter_voices_by_language(language):
107
+ prefix = language_map.get(language, "p")
 
 
 
108
  filtered = [v for v in ALL_VOICES if v.startswith(prefix)]
 
 
 
 
109
  return gr.Dropdown(choices=filtered, value=filtered[0])
110
 
111
  def tts_file_name(text, language):
112
+ clean = re.sub(r'[^a-zA-Z]', '', text).lower()[:20]
113
+ uid = uuid.uuid4().hex[:8]
114
+ return f"{temp_folder}/{clean}_{uid}.wav"
115
+
116
+ # ================= ÁUDIO =================
117
+
118
def remove_silence_function(path, keep_ms):
    """Write a copy of the WAV at ``path`` with silent stretches removed.

    The audio is split with ``split_on_silence`` (``min_silence_len=100``,
    ``silence_thresh=-45``, ``keep_silence=keep_ms``) and the resulting
    pieces are concatenated in order.

    Returns:
        Path of the exported file: ``path`` with ``.wav`` replaced by
        ``_nosil.wav``.
    """
    source = AudioSegment.from_wav(path)
    voiced_parts = split_on_silence(
        source,
        min_silence_len=100,
        silence_thresh=-45,
        keep_silence=keep_ms,
    )
    # Concatenate left to right, starting from an empty segment.
    merged = sum(voiced_parts, AudioSegment.empty())
    target = path.replace(".wav", "_nosil.wav")
    merged.export(target, format="wav")
    return target
127
+
128
def apply_ffmpeg_rubberband(input_wav, pitch=1.09):
    """Run ffmpeg's ``rubberband`` filter over ``input_wav``.

    The filter is invoked as ``rubberband=pitch=<pitch>:formant=preserved``
    and the result is written next to the input with ``_rb`` appended to the
    file stem.

    Args:
        input_wav: Path of the audio file to process.
        pitch: Value passed to the filter's ``pitch`` option.

    Returns:
        The output path on success; ``input_wav`` unchanged when ffmpeg is
        missing or exits with an error (a Gradio warning is shown).
    """
    # splitext guarantees output != input even if the name lacks ".wav".
    root, ext = os.path.splitext(input_wav)
    output_wav = f"{root}_rb{ext}"
    cmd = [
        "ffmpeg", "-y",
        "-i", input_wav,
        "-af", f"rubberband=pitch={pitch}:formant=preserved",
        output_wav,
    ]
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: ffmpeg ran but failed; OSError: it could not be
        # started (e.g. not installed). Anything else is a real bug.
        gr.Warning("FFmpeg Rubberband falhou")
        return input_wav
    return output_wav
142
+
143
def generate_and_save_audio(text, language, voice, speed, remove_silence, use_ffmpeg):
    """Synthesize ``text`` to a WAV file and return the final file path.

    Steps: select the pipeline for ``language``, clean the text, stream the
    pipeline's audio segments into a mono 16-bit 24000 Hz WAV, then
    optionally strip silence and apply the ffmpeg rubberband filter.

    Args:
        text: Text to speak.
        language: Display name of the language (key of ``language_map``).
        voice: Voice name passed to the pipeline.
        speed: Speed value passed to the pipeline.
        remove_silence: When truthy, run ``remove_silence_function``.
        use_ffmpeg: When truthy, run ``apply_ffmpeg_rubberband``.

    Returns:
        Path of the last file produced by the enabled steps.
    """
    update_pipeline(language)
    text = clean_text(text)

    generator = pipeline(text, voice=voice, speed=speed)
    path = tts_file_name(text, language)

    with wave.open(path, "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(24000)
        for r in generator:
            # NOTE(review): assumes a segment may carry no audio; skip rather
            # than crash on ``None.numpy()`` - confirm against the pipeline.
            if r.audio is None:
                continue
            # Clip before scaling: without it, samples outside [-1, 1]
            # wrap around when cast to int16.
            samples = np.clip(r.audio.numpy(), -1.0, 1.0)
            w.writeframes((samples * 32767).astype(np.int16).tobytes())

    final = path

    if remove_silence:
        final = remove_silence_function(final, keep_ms=50)

    if use_ffmpeg:
        final = apply_ffmpeg_rubberband(final)

    return final
 
 
167
 
168
+ # ================= API =================
 
 
 
 
 
 
 
 
169
 
170
def KOKORO_TTS_API(text, language, voice, speed, translate, remove_silence, use_ffmpeg):
    """Entry point wired to the UI: optionally translate, then synthesize.

    Returns:
        The generated file path twice, one value per UI output.
    """
    spoken_text = bulk_translate(text, language) if translate else text
    wav_path = generate_and_save_audio(
        spoken_text, language, voice, speed, remove_silence, use_ffmpeg
    )
    return wav_path, wav_path
175
 
176
+ # ================= UI =================
 
177
 
178
  def ui():
179
  global ALL_VOICES
 
 
 
180
  ALL_VOICES = get_voice_names("hexgrad/Kokoro-82M")
 
 
 
 
 
 
 
 
 
 
 
181
 
182
  with gr.Blocks(title="Kokoro TTS") as demo:
183
+ gr.Markdown("## Kokoro TTS + FFmpeg Rubberband")
184
+
185
+ text = gr.Textbox(lines=3, label="Texto")
186
+ language = gr.Dropdown(list(language_map.keys()), value="Brazilian Portuguese")
187
+ voice = gr.Dropdown([v for v in ALL_VOICES if v.startswith("p")], value="pf_dora")
188
+ speed = gr.Slider(0.5, 2, value=1, step=0.1)
189
+
190
+ with gr.Accordion("🎛️ Áudio", open=False):
191
+ translate = gr.Checkbox(label="Traduzir texto")
192
+ remove_silence = gr.Checkbox(label="Remover silêncio")
193
+ use_ffmpeg = gr.Checkbox(label="FFmpeg Rubberband (Pitch + Formant)")
194
+
195
+ btn = gr.Button("Gerar")
196
+ audio = gr.Audio()
197
+ file = gr.File()
198
+
199
+ language.change(filter_voices_by_language, inputs=language, outputs=voice)
200
+
201
+ btn.click(
202
+ KOKORO_TTS_API,
203
+ inputs=[text, language, voice, speed, translate, remove_silence, use_ffmpeg],
204
+ outputs=[audio, file]
205
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  return demo
208
 
209
+ # ================= MAIN =================
210
+
211
  if __name__ == "__main__":
 
212
  update_pipeline("Brazilian Portuguese")
213
+ ui().queue().launch()