Spaces:
Running
Running
| import os | |
| import re | |
| import uuid | |
| import subprocess | |
| import numpy as np | |
| import wave | |
| import gradio as gr | |
| from huggingface_hub import list_repo_files | |
| from kokoro import KPipeline | |
| from deep_translator import GoogleTranslator | |
| from pydub import AudioSegment | |
| from pydub.silence import split_on_silence | |
# --- Initial configuration ---
# One row per supported language:
# (name shown in the UI, translation target code, voice-name prefix).
_LANGUAGES = (
    ("Brazilian Portuguese", "pt", "p"),
    ("American English", "en", "a"),
    ("British English", "en", "b"),
    ("Hindi", "hi", "h"),
    ("Spanish", "es", "e"),
    ("French", "fr", "f"),
    ("Italian", "it", "i"),
    ("Japanese", "ja", "j"),
    ("Mandarin Chinese", "zh-CN", "z"),
)

# UI language name -> language code used as the translation target.
language_map_local = {name: code for name, code, _ in _LANGUAGES}

# Named pitch/speed presets. "Manual" carries no values (both None).
VOICE_PRESETS = {
    preset_name: {"pitch": preset_pitch, "speed": preset_speed}
    for preset_name, preset_pitch, preset_speed in (
        ("Manual", None, None),
        ("Natural (Padrão)", 1.00, 1.00),
        ("Levemente Jovem", 1.06, 1.02),
        ("Jovem / ElevenLabs-like", 1.09, 1.05),
        ("Grave / Narrador", 0.92, 0.95),
        ("Muito Grave", 0.88, 0.92),
        ("Rápido Comercial", 1.03, 1.12),
    )
}

# UI language name -> voice-name prefix (e.g. Brazilian Portuguese -> 'p'),
# also used as the pipeline language code.
language_map = {name: prefix for name, _, prefix in _LANGUAGES}

# Language code of the currently loaded pipeline, and the pipeline itself
# (None until the first load).
last_used_language = "p"
pipeline = None

# Global list holding every voice name that has been loaded.
ALL_VOICES = []
| # --- Funções Auxiliares de Tradução e Texto --- | |
def _chunk_sentences(text, chunk_size):
    """Group the sentences of ``text`` into chunks of at most ``chunk_size`` characters.

    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace.
    A single sentence longer than ``chunk_size`` is kept whole as its own
    chunk. Empty chunks are never produced.
    """
    chunks = []
    current = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Measure the chunk as it will actually be sent, joining space included.
        candidate = f"{current} {sentence}" if current else sentence
        if not current or len(candidate) <= chunk_size:
            current = candidate
        else:
            chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks


def bulk_translate(text, target_language, chunk_size=500, MAX_ALLOWED_CHARACTERS=10000):
    """Translate ``text`` into ``target_language`` in sentence-aligned chunks.

    Args:
        text: Text to translate.
        target_language: UI language name; looked up in ``language_map_local``.
        chunk_size: Maximum number of characters per translation request.
        MAX_ALLOWED_CHARACTERS: Texts of this length or longer are not
            translated at all.

    Returns:
        The translated text. The original ``text`` is returned unchanged when
        it is too long, the language is unknown, there is nothing to
        translate, or the translation fails (a Gradio warning is shown on
        the too-long and failure paths).
    """
    if len(text) >= MAX_ALLOWED_CHARACTERS:
        gr.Warning("[WARNING] Text too long — skipping translation.")
        return text
    lang_code = language_map_local.get(target_language)
    if not lang_code:
        return text
    chunks = _chunk_sentences(text, chunk_size)
    if not chunks:
        # Blank input: nothing worth sending to the translator.
        return text
    try:
        # One translator instance is enough for every chunk.
        translator = GoogleTranslator(target=lang_code)
        translated_chunks = [translator.translate(chunk) for chunk in chunks]
        return " ".join(translated_chunks).strip()
    except Exception as e:
        # Best-effort feature: on any translator failure keep the original text.
        gr.Warning(f"Translation failed: {e}")
        return text
def clean_text(text):
    """Normalise raw input text before it is sent to the TTS pipeline.

    Dashes and Markdown markers are turned into spaces, every character that
    is not a word character, whitespace or basic punctuation is dropped, and
    whitespace runs are collapsed to a single space.
    """
    # Same markers, same order as before: en dash, hyphen, bold, star, hash.
    for marker in ("–", "-", "**", "*", "#"):
        text = text.replace(marker, " ")
    disallowed = re.compile(r'[^\w\s,.:;?!@\'"()-]', flags=re.UNICODE)
    without_symbols = disallowed.sub(r'', text)
    return re.sub(r'\s+', ' ', without_symbols).strip()
| # --- Gerenciamento de Arquivos e Pipeline --- | |
def create_audio_dir():
    """Make sure ``kokoro_audio`` exists under the current working directory.

    Returns:
        The path of that directory.
    """
    target = os.path.join(os.getcwd(), "kokoro_audio")
    os.makedirs(target, exist_ok=True)
    return target


# Output folder for generated audio, created when the module is loaded.
temp_folder = create_audio_dir()
def update_pipeline(Language):
    """Load the Kokoro pipeline for ``Language`` unless it is already active.

    ``Language`` is a UI language name; names missing from ``language_map``
    resolve to the code ``"p"``. When the requested pipeline cannot be
    created, a Gradio warning is shown and the ``"a"`` pipeline is loaded
    instead. Updates the module globals ``pipeline`` and
    ``last_used_language``.
    """
    global pipeline, last_used_language
    requested = language_map.get(Language, "p")
    # Nothing to do when a pipeline exists and already matches the request.
    if pipeline is not None and requested == last_used_language:
        return
    try:
        pipeline = KPipeline(lang_code=requested)
        last_used_language = requested
    except Exception:
        gr.Warning(f"Error loading {Language}. Fallback to English.")
        pipeline = KPipeline(lang_code="a")
        last_used_language = "a"
def get_voice_names(repo_id):
    """Return every voice name available in the given repository.

    A voice is any repository file under ``voices/``; its name is the file
    path with that leading folder and the file extension removed.

    Args:
        repo_id: Repository identifier passed to ``list_repo_files``.

    Returns:
        The list of voice names, or a small built-in list of known voices
        when the repository listing fails for any reason.
    """
    prefix = "voices/"
    try:
        return [
            # Strip only the leading folder, not later occurrences in the path.
            os.path.splitext(file[len(prefix):])[0]
            for file in list_repo_files(repo_id)
            if file.startswith(prefix)
        ]
    except Exception:
        # Manual fallback with some known voices. ``Exception`` (rather than
        # a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        return ["pf_dora", "pm_alex", "pm_santa", "af_bella", "af_sarah", "bf_isabella", "ff_siwis", "ef_dora", "jf_nezumi", "zf_xiaoni"]
def filter_voices_by_language(language):
    """Return a voice dropdown restricted to the selected language.

    The global ``ALL_VOICES`` list is filtered by the voice prefix of
    ``language`` (``"a"`` when the language is unknown). If no voice
    matches, every voice is offered instead.

    Returns:
        A ``gr.Dropdown`` whose value is the first available choice, or
        ``None`` when there are no voices at all.
    """
    prefix = language_map.get(language, "a")  # defaults to 'a' on a miss
    # Keep the voices whose name starts with the prefix (e.g. 'p' for 'pf_dora').
    filtered = [v for v in ALL_VOICES if v.startswith(prefix)]
    choices = filtered or ALL_VOICES
    # An empty voice list must not raise IndexError inside a UI callback.
    return gr.Dropdown(choices=choices, value=choices[0] if choices else None)
def tts_file_name(text, language):
    """Build a unique ``.wav`` path inside ``temp_folder`` for a synthesis run.

    The file stem is made of up to 20 characters of the ASCII letters of
    ``text`` (lower-cased, words joined by ``_``), falling back to the
    language name when the text has no ASCII letters, followed by eight
    random hex characters.

    Args:
        text: The text being synthesised.
        language: UI language name, used only as the fallback stem.

    Returns:
        The path of the file to write.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    # split() treats tabs/newlines like spaces and collapses runs, so no
    # whitespace character can leak into the file name.
    slug = "_".join(letters_only.split())
    lang_clean = "_".join(language.split())
    # Truncation may cut right after a separator; drop it.
    stem = slug[:20].rstrip("_") or lang_clean
    random_string = uuid.uuid4().hex[:8].upper()
    return os.path.join(temp_folder, f"{stem}_{random_string}.wav")
| # --- Processamento de Áudio --- | |
def remove_silence_function(file_path, minimum_silence=50):
    """Write a copy of a WAV file with its silent stretches shortened.

    The audio is split with ``split_on_silence`` (``min_silence_len=100``,
    ``silence_thresh=-45``) and the resulting chunks are joined back
    together.

    Args:
        file_path: Path of the input WAV file.
        minimum_silence: Passed to ``split_on_silence`` as ``keep_silence``.

    Returns:
        The path of the new ``*_no_silence.wav`` file, or ``file_path``
        itself when splitting produced no chunks.
    """
    # Derive the output name from the stem only: replacing ".wav" anywhere in
    # the path could alter a directory name, and a path without that suffix
    # would make the output overwrite the input.
    stem, _ = os.path.splitext(file_path)
    output_path = f"{stem}_no_silence.wav"
    audio_format = "wav"
    # Read the audio file and split it into chunks.
    sound = AudioSegment.from_file(file_path, format=audio_format)
    audio_chunks = split_on_silence(sound,
                                    min_silence_len=100,
                                    silence_thresh=-45,
                                    keep_silence=minimum_silence)
    if not audio_chunks:
        # Nothing to join; keep the original instead of exporting an empty file.
        return file_path
    # Put the file back together.
    combined = AudioSegment.empty()
    for chunk in audio_chunks:
        combined += chunk
    combined.export(output_path, format=audio_format)
    return output_path
def apply_ffmpeg_audio_fx(input_wav, pitch=1.09, loudnorm=True):
    """Run the audio through an FFmpeg filter chain (pitch, optional loudnorm).

    Args:
        input_wav: Path of the WAV file to process.
        pitch: Value passed to the ``rubberband`` filter's ``pitch`` option
            (with ``formant=preserved``).
        loudnorm: When true, append ``loudnorm=I=-16:TP=-1.5:LRA=11``.

    Returns:
        The path of the new ``*_fx.wav`` file, or ``input_wav`` unchanged
        when FFmpeg cannot be run or exits with an error (a Gradio warning
        is shown in that case).
    """
    # Build the output name from the stem so that only the final suffix is
    # affected and the output can never coincide with the input.
    stem, _ = os.path.splitext(input_wav)
    output_wav = f"{stem}_fx.wav"
    # Pitch + formant
    filters = [f"rubberband=pitch={pitch}:formant=preserved"]
    # Loudness normalisation
    # NOTE(review): loudnorm is reported to change the output sample rate
    # unless one is requested explicitly; confirm against the FFmpeg docs.
    if loudnorm:
        filters.append("loudnorm=I=-16:TP=-1.5:LRA=11")
    cmd = [
        "ffmpeg", "-y",
        "-i", input_wav,
        "-af", ",".join(filters),
        output_wav,
    ]
    try:
        # Capture stderr (as bytes) so a failure can be reported meaningfully.
        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        stderr_lines = (e.stderr or b"").decode(errors="replace").strip().splitlines()
        # The last stderr line usually carries the actual FFmpeg error.
        detail = stderr_lines[-1] if stderr_lines else e
        gr.Warning(f"FFmpeg FX falhou: {detail}")
        return input_wav
    except OSError as e:
        # e.g. the ffmpeg executable is not installed.
        gr.Warning(f"FFmpeg FX falhou: {e}")
        return input_wav
    return output_wav
def generate_and_save_audio(text, Language, voice, speed, remove_silence, keep_silence_up_to, use_ffmpeg, pitch, use_loudnorm):
    """Synthesise ``text`` to a WAV file and apply the optional post-processing.

    Args:
        text: Text to speak; it is passed through ``clean_text`` first.
        Language: UI language name used to select the pipeline.
        voice: Voice name passed to the pipeline.
        speed: Speed passed to the pipeline.
        remove_silence: When true, run ``remove_silence_function`` on the result.
        keep_silence_up_to: Silence to keep, in seconds (converted to ms).
        use_ffmpeg: When true, run ``apply_ffmpeg_audio_fx`` on the result.
        pitch: Pitch forwarded to ``apply_ffmpeg_audio_fx``.
        use_loudnorm: Loudnorm flag forwarded to ``apply_ffmpeg_audio_fx``.

    Returns:
        The path of the final audio file.
    """
    text = clean_text(text)
    update_pipeline(Language)
    # Generate the audio.
    # NOTE(review): clean_text collapses every whitespace run (newlines
    # included) into one space, so this split pattern cannot match here.
    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
    save_path = tts_file_name(text, Language)
    with wave.open(save_path, 'wb') as wav_file:
        # Mono, 16-bit, 24 kHz
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(24000)
        for result in generator:
            audio = result.audio
            if audio is None:
                # NOTE(review): defensive guard; confirm whether the pipeline
                # can yield results without audio.
                continue
            # Clip before scaling: a sample outside [-1, 1] would overflow
            # int16 and wrap around.
            samples = np.clip(audio.numpy(), -1.0, 1.0)
            wav_file.writeframes((samples * 32767).astype(np.int16).tobytes())
    final = save_path
    if remove_silence:
        keep_silence = int(keep_silence_up_to * 1000)
        final = remove_silence_function(final, minimum_silence=keep_silence)
    if use_ffmpeg:
        final = apply_ffmpeg_audio_fx(
            final,
            pitch=pitch,
            loudnorm=use_loudnorm,
        )
    return final
| # --- API Principal para a UI --- | |
def KOKORO_TTS_API(text, Language, voice, speed, translate_text, remove_silence, use_ffmpeg, pitch, preset, use_loudnorm):
    """Entry point wired to the UI: translate if asked, apply the preset, synthesise.

    A missing language defaults to "Brazilian Portuguese" and a missing
    voice to "pf_dora". Any preset other than "Manual" overrides ``pitch``
    and ``speed`` with the values it defines.

    Returns:
        The generated file path twice: once for the audio player and once
        for the download component.
    """
    Language = Language or "Brazilian Portuguese"
    voice = voice or "pf_dora"

    if translate_text:
        text = bulk_translate(text, Language, chunk_size=500)

    # "Manual" (or an unknown preset) keeps the values coming from the sliders.
    chosen = VOICE_PRESETS.get(preset) if preset != "Manual" else None
    if chosen is not None:
        pitch = pitch if chosen["pitch"] is None else chosen["pitch"]
        speed = speed if chosen["speed"] is None else chosen["speed"]

    audio_path = generate_and_save_audio(
        text=text,
        Language=Language,
        voice=voice,
        speed=speed,
        remove_silence=remove_silence,
        keep_silence_up_to=0.05,
        use_ffmpeg=use_ffmpeg,
        pitch=pitch,
        use_loudnorm=use_loudnorm,
    )
    return audio_path, audio_path
| # --- Interface Gradio --- | |
def toggle_autoplay(autoplay):
    """Return the output audio component with autoplay switched on or off.

    The label repeats the one the player is created with in ``ui()``, so
    toggling autoplay no longer renames the player.
    """
    return gr.Audio(interactive=False, label='🔊 Áudio Gerado', autoplay=autoplay)
def ui():
    """Build the Gradio Blocks application and return it without launching it.

    Loads the voice list once into the global ``ALL_VOICES``, pre-selects
    Brazilian Portuguese, lays out the input and output columns, and wires
    the language dropdown, the text box and the button to their handlers.
    """
    global ALL_VOICES
    lang_list = list(language_map)
    # Load every voice a single time.
    ALL_VOICES = get_voice_names("hexgrad/Kokoro-82M")
    # Initial values for PT-BR.
    initial_lang = "Brazilian Portuguese"
    pt_voices = [v for v in ALL_VOICES if v.startswith(language_map[initial_lang])]
    # Same fallback as filter_voices_by_language: offer every voice when none
    # matches, so the initial value is always one of the choices.
    initial_voices = pt_voices or ALL_VOICES
    if "pf_dora" in initial_voices:
        initial_voice_value = "pf_dora"
    else:
        # An empty voice list must not raise IndexError at start-up.
        initial_voice_value = initial_voices[0] if initial_voices else None
    dummy_examples = [
        ["Olá! Hoje é um ótimo dia para estudar e aprender coisas novas.", "Brazilian Portuguese", "pf_dora"],
        ["Olá! Hoje é um ótimo dia para estudar e aprender coisas novas.", "Brazilian Portuguese", "pm_alex"],
        ["Olá! Hoje é um ótimo dia para estudar e aprender coisas novas.", "Brazilian Portuguese", "pm_santa"],
    ]
    with gr.Blocks(title="Kokoro TTS") as demo:
        gr.Markdown("## Kokoro TTS (Audio Only)")
        with gr.Row():
            with gr.Column():
                text = gr.Textbox(label='📝 Texto de Entrada', lines=3, placeholder="Digite seu texto aqui...")
                with gr.Row():
                    language_name = gr.Dropdown(lang_list, label="🌍 Selecionar Idioma", value=initial_lang)
                with gr.Row():
                    # Starts out filled with the Portuguese voices only.
                    voice_name = gr.Dropdown(initial_voices, label="🎙️ Escolher Voz", value=initial_voice_value)
                with gr.Row():
                    generate_btn = gr.Button('🚀 Gerar Áudio', variant='primary')
                with gr.Accordion('🎛️ Configurações de Áudio', open=False):
                    speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Velocidade')
                    translate_text = gr.Checkbox(value=False, label='🌐 Traduzir texto para o idioma selecionado')
                    remove_silence = gr.Checkbox(value=False, label='✂️ Remover Silêncio')
                    use_ffmpeg = gr.Checkbox(value=False, label="🎵 Aplicar FFmpeg (Rubberband Pitch + Formant)")
                    preset = gr.Dropdown(
                        choices=list(VOICE_PRESETS.keys()),
                        value="Natural (Padrão)",
                        label="🎙️ Preset de Voz (Estilo ElevenLabs)"
                    )
                    pitch = gr.Slider(minimum=0.85, maximum=1.20, value=1.09, step=0.01, label="🎵 Pitch (Rubberband – Formant Preserved)")
                    use_loudnorm = gr.Checkbox(
                        value=True,
                        label="🔊 Loudnorm (Volume profissional / ElevenLabs)"
                    )
            with gr.Column():
                audio = gr.Audio(interactive=False, label='🔊 Áudio Gerado', autoplay=True)
                audio_file = gr.File(label='📥 Baixar Áudio')
                with gr.Row():
                    autoplay = gr.Checkbox(value=True, label='▶️ Autoplay')
                    autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
        # --- EVENTS ---
        # When the language changes, refresh the list of voices.
        language_name.change(filter_voices_by_language, inputs=[language_name], outputs=[voice_name])
        # Order must match the parameter order of KOKORO_TTS_API.
        inputs = [text, language_name, voice_name, speed, translate_text, remove_silence, use_ffmpeg, pitch, preset, use_loudnorm]
        outputs = [audio, audio_file]
        text.submit(KOKORO_TTS_API, inputs=inputs, outputs=outputs)
        generate_btn.click(KOKORO_TTS_API, inputs=inputs, outputs=outputs)
        gr.Examples(examples=dummy_examples, inputs=[text, language_name, voice_name])
    return demo
if __name__ == "__main__":
    # Script entry point: load the pipeline for the UI's default language
    # before building the interface, then start the app.
    print("Inicializando pipeline em Português...")
    update_pipeline("Brazilian Portuguese")
    demo = ui()
    # Enable the request queue and launch with show_api=False.
    demo.queue().launch(show_api=False)