# tts_PTBR / app.py — Hugging Face Space by elielsilva ("Update app.py", commit c1758a5, verified)
import os
import re
import uuid
import subprocess
import numpy as np
import wave
import gradio as gr
from huggingface_hub import list_repo_files
from kokoro import KPipeline
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from pydub.silence import split_on_silence
# --- Configurações Iniciais ---
# UI language name -> language code used as the translation target
# (looked up by bulk_translate before calling GoogleTranslator).
language_map_local = {
    "Brazilian Portuguese": "pt",
    "American English": "en",
    "British English": "en",
    "Hindi": "hi",
    "Spanish": "es",
    "French": "fr",
    "Italian": "it",
    "Japanese": "ja",
    "Mandarin Chinese": "zh-CN",
}
# Named pitch/speed presets offered in the UI. A value of None means
# "do not override"; the "Manual" preset therefore leaves the slider values
# untouched.
VOICE_PRESETS = {
    "Manual": {"pitch": None, "speed": None},
    "Natural (Padrão)": {"pitch": 1.00, "speed": 1.00},
    "Levemente Jovem": {"pitch": 1.06, "speed": 1.02},
    "Jovem / ElevenLabs-like": {"pitch": 1.09, "speed": 1.05},
    "Grave / Narrador": {"pitch": 0.92, "speed": 0.95},
    "Muito Grave": {"pitch": 0.88, "speed": 0.92},
    "Rápido Comercial": {"pitch": 1.03, "speed": 1.12},
}
# UI language name -> single-letter code (e.g. Brazilian Portuguese -> 'p').
# The same letter is used as the KPipeline lang_code and as the prefix that
# voice names of that language start with.
language_map = {
    "Brazilian Portuguese": "p",
    "American English": "a",
    "British English": "b",
    "Hindi": "h",
    "Spanish": "e",
    "French": "f",
    "Italian": "i",
    "Japanese": "j",
    "Mandarin Chinese": "z",
}

# Lazily created pipeline and the language code it was created for.
pipeline = None
last_used_language = "p"

# Global list holding every voice name once it has been loaded.
ALL_VOICES = []
# --- Funções Auxiliares de Tradução e Texto ---
def bulk_translate(text, target_language, chunk_size=500, MAX_ALLOWED_CHARACTERS=10000):
    """Translate *text* into *target_language*, sentence-aligned chunk by chunk.

    The text is split after sentence punctuation (., !, ?) and the sentences
    are packed into chunks of roughly ``chunk_size`` characters; each chunk is
    translated separately and the results are joined with single spaces.

    Args:
        text: Text to translate.
        target_language: UI language name; must be a key of
            ``language_map_local``.
        chunk_size: Soft upper bound on the size of one chunk. A single
            sentence longer than this is still sent as one chunk.
        MAX_ALLOWED_CHARACTERS: Texts at least this long are not translated.

    Returns:
        The translated text, or the original ``text`` unchanged when it is too
        long, the language is unknown, there is nothing to translate, or the
        translation fails (a Gradio warning is shown for the too-long and the
        failure cases).
    """
    if len(text) >= MAX_ALLOWED_CHARACTERS:
        gr.Warning("[WARNING] Text too long — skipping translation.")
        return text
    lang_code = language_map_local.get(target_language)
    if not lang_code:
        return text

    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += " " + sentence
            continue
        # Flush only real content: when the very first sentence is already
        # longer than chunk_size the accumulator is still empty, and an empty
        # chunk must not be handed to the translator.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = sentence
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    if not chunks:
        # Empty / whitespace-only input: nothing to translate.
        return text

    try:
        translator = GoogleTranslator(target=lang_code)
        translated_chunks = [translator.translate(chunk) for chunk in chunks]
    except Exception as e:
        # Best effort: any translator/network failure falls back to the
        # untranslated text instead of aborting the TTS request.
        gr.Warning(f"Translation failed: {e}")
        return text
    # Skip empty/None results so the join cannot fail on them.
    return " ".join(part for part in translated_chunks if part).strip()
def clean_text(text):
    """Normalize *text* before it is sent to the TTS pipeline.

    Dashes and markdown markers become spaces, every character that is not a
    word character, whitespace or basic punctuation is dropped (this removes
    emoji), and whitespace runs are collapsed to a single space.
    """
    # Same replacement order as a dict literal would give: "**" before "*".
    for marker in ("–", "-", "**", "*", "#"):
        text = text.replace(marker, " ")
    without_symbols = re.sub(r'[^\w\s,.:;?!@\'"()-]', r'', text, flags=re.UNICODE)
    collapsed = re.sub(r'\s+', ' ', without_symbols)
    return collapsed.strip()
# --- Gerenciamento de Arquivos e Pipeline ---
def create_audio_dir():
    """Make sure ``kokoro_audio`` exists in the current working directory.

    Returns:
        The path of that directory as a string.
    """
    audio_dir = os.path.join(os.getcwd(), "kokoro_audio")
    os.makedirs(audio_dir, exist_ok=True)
    return audio_dir


# Created at import time; generated audio files are written here.
temp_folder = create_audio_dir()
def update_pipeline(Language):
    """Make the global ``pipeline`` match the requested UI language.

    A new ``KPipeline`` is created only when no pipeline exists yet or the
    language code differs from the one used last. If creation fails, a Gradio
    warning is shown and an American English pipeline is loaded instead.

    Args:
        Language: UI language name; names missing from ``language_map`` map
            to ``"p"``.
    """
    global pipeline, last_used_language
    lang_code = language_map.get(Language, "p")

    # Nothing to do when the current pipeline already serves this language.
    if pipeline is not None and lang_code == last_used_language:
        return

    try:
        pipeline = KPipeline(lang_code=lang_code)
        last_used_language = lang_code
    except Exception:
        gr.Warning(f"Error loading {Language}. Fallback to English.")
        pipeline = KPipeline(lang_code="a")
        last_used_language = "a"
def get_voice_names(repo_id):
    """Return the names of all voices available in *repo_id*.

    A voice is any repository file under ``voices/``; its name is the file
    path with that prefix and the file extension removed.

    Args:
        repo_id: Hugging Face repository id passed to ``list_repo_files``.

    Returns:
        A list of voice names. If the repository listing cannot be obtained,
        a short hard-coded list of known voices is returned instead.
    """
    prefix = "voices/"
    try:
        repo_files = list_repo_files(repo_id)
    except Exception:
        # Best-effort fallback with a few known voices. `Exception` (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return ["pf_dora", "pm_alex","pm_santa", "af_bella", "af_sarah", "bf_isabella", "ff_siwis", "ef_dora", "jf_nezumi", "zf_xiaoni"]
    # Slice off only the leading prefix; str.replace would also remove any
    # later "voices/" occurrence inside the path.
    return [
        os.path.splitext(path[len(prefix):])[0]
        for path in repo_files
        if path.startswith(prefix)
    ]
def filter_voices_by_language(language):
    """Build the voice dropdown update for the selected language.

    Voices are taken from the global ``ALL_VOICES`` list and kept when their
    name starts with the language's prefix from ``language_map`` (e.g. 'p'
    for 'pf_dora'). When no voice matches, every voice is offered instead.

    Args:
        language: UI language name; unknown names use the prefix ``"a"``.

    Returns:
        A ``gr.Dropdown`` whose choices are the selected voices and whose
        value is the first of them (``None`` when there are no voices at all).
    """
    prefix = language_map.get(language, "a")
    choices = [v for v in ALL_VOICES if v.startswith(prefix)] or list(ALL_VOICES)
    # ALL_VOICES may itself be empty; indexing [0] unconditionally would raise
    # IndexError inside the change handler.
    selected = choices[0] if choices else None
    return gr.Dropdown(choices=choices, value=selected)
def tts_file_name(text, language):
    """Build a unique ``.wav`` path inside ``temp_folder`` for a TTS request.

    The file name is made of up to 20 characters derived from the ASCII
    letters of *text* (lower-cased, words joined by underscores) followed by
    a random 8-character hex suffix. When *text* contains no ASCII letters
    the language name is used instead.

    Args:
        text: The text being synthesized.
        language: UI language name, used as fallback for the name stem.

    Returns:
        The output file path as a string.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    # split()/join turns every whitespace run - including tabs and newlines,
    # which must not end up in a file name - into a single underscore.
    stem = "_".join(letters_only.split())[:20]
    if not stem:
        stem = language.replace(" ", "_").strip()
    random_string = uuid.uuid4().hex[:8].upper()
    return os.path.join(temp_folder, f"{stem}_{random_string}.wav")
# --- Processamento de Áudio ---
def remove_silence_function(file_path,minimum_silence=50):
    """Write a copy of a WAV file with its silent stretches removed.

    Args:
        file_path: Path of the input WAV file.
        minimum_silence: Passed to ``split_on_silence`` as ``keep_silence``
            (amount of silence, in ms, kept around each chunk).

    Returns:
        Path of the new ``*_no_silence.wav`` file, or ``file_path`` itself
        when no non-silent chunk was found.
    """
    # Derive the output name from the extension only; str.replace would
    # rewrite every ".wav" in the path and, for a path without ".wav", make
    # the output overwrite the input.
    root, _ext = os.path.splitext(file_path)
    output_path = f"{root}_no_silence.wav"
    audio_format = "wav"
    # Read the audio file and split it into non-silent chunks
    sound = AudioSegment.from_file(file_path, format=audio_format)
    audio_chunks = split_on_silence(sound,
                                    min_silence_len=100,
                                    silence_thresh=-45,
                                    keep_silence=minimum_silence)
    if not audio_chunks:
        # Nothing to stitch together: keep the original audio rather than
        # exporting an empty segment.
        return file_path
    # Put the chunks back together
    combined = AudioSegment.empty()
    for chunk in audio_chunks:
        combined += chunk
    combined.export(output_path, format=audio_format)
    return output_path
def apply_ffmpeg_audio_fx(input_wav, pitch=1.09, loudnorm=True):
    """Run the audio through ffmpeg's rubberband (and optionally loudnorm) filter.

    Args:
        input_wav: Path of the input WAV file.
        pitch: Value for the rubberband ``pitch`` option; formants are
            preserved.
        loudnorm: When true, append ``loudnorm=I=-16:TP=-1.5:LRA=11`` to the
            filter chain.

    Returns:
        Path of the processed ``*_fx.wav`` file, or ``input_wav`` unchanged
        when ffmpeg could not be run or exited with an error (a Gradio warning
        is shown in that case).
    """
    # Build the output name from the extension only, so it can never equal
    # the input path (ffmpeg must not read and write the same file).
    root, _ext = os.path.splitext(input_wav)
    output_wav = f"{root}_fx.wav"

    # Pitch + formant
    filters = [f"rubberband=pitch={pitch}:formant=preserved"]
    # Loudness normalization
    if loudnorm:
        filters.append("loudnorm=I=-16:TP=-1.5:LRA=11")

    cmd = [
        "ffmpeg", "-y",
        "-i", input_wav,
        "-af", ",".join(filters),
        output_wav,
    ]
    try:
        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # Non-zero exit, missing ffmpeg binary or invalid arguments: fall
        # back to the unprocessed audio.
        gr.Warning(f"FFmpeg FX falhou: {e}")
        return input_wav
    return output_wav
def generate_and_save_audio(text, Language, voice, speed, remove_silence, keep_silence_up_to,use_ffmpeg,pitch, use_loudnorm):
    """Synthesize *text* to a WAV file and apply the optional post-processing.

    Args:
        text: Raw input text; it is passed through ``clean_text`` first.
        Language: UI language name used to select the pipeline.
        voice: Voice name passed to the pipeline.
        speed: Speed factor passed to the pipeline.
        remove_silence: When true, run ``remove_silence_function``.
        keep_silence_up_to: Silence to keep, in seconds (converted to ms).
        use_ffmpeg: When true, run ``apply_ffmpeg_audio_fx``.
        pitch: Pitch value forwarded to ``apply_ffmpeg_audio_fx``.
        use_loudnorm: Loudnorm flag forwarded to ``apply_ffmpeg_audio_fx``.

    Returns:
        Path of the final audio file.
    """
    text = clean_text(text)
    update_pipeline(Language)
    # Generate audio
    # NOTE(review): clean_text collapses all whitespace to single spaces, so
    # split_pattern=r'\n+' will not find newlines in `text` here - confirm
    # whether that is intended.
    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
    save_path = tts_file_name(text, Language)
    with wave.open(save_path, 'wb') as wav_file:
        # Mono, 16-bit samples, 24 kHz
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(24000)
        for result in generator:
            audio = result.audio
            if audio is None:
                # Defensive: skip a segment that produced no audio.
                continue
            # Clip before scaling: a float sample outside [-1, 1] would wrap
            # around when cast to int16 and produce loud clicks.
            audio_np = np.clip(audio.numpy(), -1.0, 1.0)
            audio_int16 = (audio_np * 32767).astype(np.int16)
            wav_file.writeframes(audio_int16.tobytes())

    final = save_path
    if remove_silence:
        keep_silence = int(keep_silence_up_to * 1000)
        final = remove_silence_function(final, minimum_silence=keep_silence)
    if use_ffmpeg:
        final = apply_ffmpeg_audio_fx(
            final,
            pitch=pitch,
            loudnorm=use_loudnorm,
        )
    return final
# --- API Principal para a UI ---
def KOKORO_TTS_API(text, Language, voice, speed, translate_text, remove_silence,use_ffmpeg, pitch, preset, use_loudnorm):
    """Entry point wired to the UI: turn the form values into an audio file.

    Args:
        text: Text to synthesize.
        Language: UI language name; defaults to "Brazilian Portuguese".
        voice: Voice name; defaults to "pf_dora".
        speed: Speed factor (overridden by a non-"Manual" preset).
        translate_text: When true, translate *text* into *Language* first.
        remove_silence: When true, strip silence from the result.
        use_ffmpeg: When true, apply the ffmpeg pitch/loudness effects.
        pitch: Pitch value (overridden by a non-"Manual" preset).
        preset: Key of ``VOICE_PRESETS``.
        use_loudnorm: Loudnorm flag for the ffmpeg step.

    Returns:
        ``(path, path)`` - the same file path for the audio player and the
        download component - or ``(None, None)`` when there is no text.
    """
    # Empty input: None would crash during text cleaning and blank text would
    # only yield an empty WAV file, so stop early with a visible warning.
    if not text or not text.strip():
        gr.Warning("Texto vazio: digite algo para gerar o áudio.")
        return None, None

    if not Language:
        Language = "Brazilian Portuguese"
    if not voice:
        voice = "pf_dora"

    if translate_text:
        text = bulk_translate(text, Language, chunk_size=500)

    # A named preset overrides the manual pitch/speed values.
    if preset in VOICE_PRESETS and preset != "Manual":
        preset_cfg = VOICE_PRESETS[preset]
        if preset_cfg["pitch"] is not None:
            pitch = preset_cfg["pitch"]
        if preset_cfg["speed"] is not None:
            speed = preset_cfg["speed"]

    save_path = generate_and_save_audio(
        text=text, Language=Language, voice=voice, speed=speed,
        remove_silence=remove_silence, keep_silence_up_to=0.05,
        use_ffmpeg=use_ffmpeg, pitch=pitch, use_loudnorm=use_loudnorm)
    return save_path, save_path
# --- Interface Gradio ---
def toggle_autoplay(autoplay):
    """Return the audio-player update that switches autoplay on or off.

    Args:
        autoplay: New autoplay state taken from the checkbox.

    Returns:
        A non-interactive ``gr.Audio`` carrying the requested autoplay flag.
    """
    # Use the same label the player is created with in ui(); the previous
    # 'Output Audio' label renamed the player whenever autoplay was toggled.
    return gr.Audio(interactive=False, label='🔊 Áudio Gerado', autoplay=autoplay)
def ui():
    """Build the Gradio Blocks interface and return it.

    Loads the voice list into the global ``ALL_VOICES``, creates the input
    widgets (text, language, voice, audio settings) and the output widgets
    (audio player, download file), and wires the events to
    ``KOKORO_TTS_API``, ``filter_voices_by_language`` and ``toggle_autoplay``.

    Returns:
        The ``gr.Blocks`` instance (not yet launched).
    """
    # NOTE(review): the nesting of the `with` blocks below was reconstructed
    # from the statement order - confirm against the intended layout.
    global ALL_VOICES
    lang_list = list(language_map.keys())
    # Load all voices a single time
    ALL_VOICES = get_voice_names("hexgrad/Kokoro-82M")
    # Initial values for PT-BR
    initial_lang = "Brazilian Portuguese"
    initial_voices = [v for v in ALL_VOICES if v.startswith(language_map[initial_lang])]
    # Prefer "pf_dora", then the first Portuguese voice, then the first voice
    # of any language. NOTE: ALL_VOICES[0] raises IndexError if the voice
    # list is empty.
    initial_voice_value = "pf_dora" if "pf_dora" in initial_voices else (initial_voices[0] if initial_voices else ALL_VOICES[0])
    # Example rows: [text, language, voice]
    dummy_examples = [
        ["Olá! Hoje é um ótimo dia para estudar e aprender coisas novas.", "Brazilian Portuguese", "pf_dora"],
        ["Olá! Hoje é um ótimo dia para estudar e aprender coisas novas.", "Brazilian Portuguese", "pm_alex"],
        ["Olá! Hoje é um ótimo dia para estudar e aprender coisas novas.", "Brazilian Portuguese", "pm_santa"],
    ]
    with gr.Blocks(title="Kokoro TTS") as demo:
        gr.Markdown("## Kokoro TTS (Audio Only)")
        with gr.Row():
            # Input widgets
            with gr.Column():
                text = gr.Textbox(label='📝 Texto de Entrada', lines=3, placeholder="Digite seu texto aqui...")
                with gr.Row():
                    language_name = gr.Dropdown(lang_list, label="🌍 Selecionar Idioma", value=initial_lang)
                with gr.Row():
                    # Starts out filled with Portuguese voices only
                    voice_name = gr.Dropdown(initial_voices, label="🎙️ Escolher Voz", value=initial_voice_value)
                with gr.Row():
                    generate_btn = gr.Button('🚀 Gerar Áudio', variant='primary')
                with gr.Accordion('🎛️ Configurações de Áudio', open=False):
                    speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Velocidade')
                    translate_text = gr.Checkbox(value=False, label='🌐 Traduzir texto para o idioma selecionado')
                    remove_silence = gr.Checkbox(value=False, label='✂️ Remover Silêncio')
                    use_ffmpeg = gr.Checkbox(value=False,label="🎵 Aplicar FFmpeg (Rubberband Pitch + Formant)")
                    preset = gr.Dropdown(
                        choices=list(VOICE_PRESETS.keys()),
                        value="Natural (Padrão)",
                        label="🎙️ Preset de Voz (Estilo ElevenLabs)"
                    )
                    pitch = gr.Slider(minimum=0.85,maximum=1.20,value=1.09, step=0.01,label="🎵 Pitch (Rubberband – Formant Preserved)")
                    use_loudnorm = gr.Checkbox(
                        value=True,
                        label="🔊 Loudnorm (Volume profissional / ElevenLabs)"
                    )
            # Output widgets
            with gr.Column():
                audio = gr.Audio(interactive=False, label='🔊 Áudio Gerado', autoplay=True)
                audio_file = gr.File(label='📥 Baixar Áudio')
                with gr.Row():
                    autoplay = gr.Checkbox(value=True, label='▶️ Autoplay')
                    autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
        # --- EVENTS ---
        # When the language changes, refresh the list of voices
        language_name.change(filter_voices_by_language, inputs=[language_name], outputs=[voice_name])
        # Order must match the parameter order of KOKORO_TTS_API
        inputs = [text, language_name, voice_name, speed, translate_text, remove_silence,use_ffmpeg,pitch,preset,use_loudnorm]
        outputs = [audio, audio_file]
        text.submit(KOKORO_TTS_API, inputs=inputs, outputs=outputs)
        generate_btn.click(KOKORO_TTS_API, inputs=inputs, outputs=outputs)
        gr.Examples(examples=dummy_examples, inputs=[text, language_name, voice_name])
    return demo
if __name__ == "__main__":
    # Create the Portuguese pipeline up front, before the UI is built.
    print("Inicializando pipeline em Português...")
    update_pipeline("Brazilian Portuguese")

    # Build the interface, enable the request queue and start the server
    # with the API page hidden.
    demo = ui()
    queued_demo = demo.queue()
    queued_demo.launch(show_api=False)