# NOTE: Hugging Face Spaces page residue captured during extraction
# (Space status at scrape time: Paused) — not part of the program source.
| import re | |
| import tempfile | |
| import gradio as gr | |
| import numpy as np | |
| import soundfile as sf | |
| import torchaudio | |
| from cached_path import cached_path | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from num2words import num2words | |
# Detect whether we are running inside a Hugging Face Space: the `spaces`
# package is only available there.
try:
    import spaces
except ImportError:
    USING_SPACES = False
else:
    USING_SPACES = True


def gpu_decorator(func):
    """Wrap ``func`` with ``spaces.GPU`` on HF Spaces; otherwise return it unchanged."""
    return spaces.GPU(func) if USING_SPACES else func
| from f5_tts.model import DiT, UNetT | |
| from f5_tts.infer.utils_infer import ( | |
| load_vocoder, | |
| load_model, | |
| preprocess_ref_audio_text, | |
| infer_process, | |
| remove_silence_for_generated_wav, | |
| save_spectrogram, | |
| ) | |
# Neural vocoder that converts generated mel-spectrograms into waveforms.
vocoder = load_vocoder()

# Load models
# Architecture hyper-parameters for the F5-TTS DiT backbone.
F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
# EMA weights of the Spanish fine-tune, downloaded (and cached) from the HF Hub.
F5TTS_ema_model = load_model(
    DiT, F5TTS_model_cfg, str(cached_path("hf://jpgallegoar/F5-Spanish/model_1200000.safetensors"))
)

# Chat LLM state; None here — presumably lazily initialized by a chat tab
# outside this view (TODO confirm against the full file).
chat_model_state = None
chat_tokenizer_state = None
def generate_response(messages, model, tokenizer):
    """Generate a chat reply with a Qwen-style causal LM.

    Args:
        messages: Chat history in the ``[{"role": ..., "content": ...}]``
            format expected by ``tokenizer.apply_chat_template``.
        model: A Hugging Face causal LM already placed on its target device.
        tokenizer: The tokenizer matching ``model``.

    Returns:
        The decoded assistant reply with special tokens stripped.
    """
    # Render the conversation into the model's prompt format, appending the
    # generation prompt so the model continues as the assistant.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
        # BUGFIX: `temperature`/`top_p` are silently ignored by transformers
        # unless sampling is enabled — without this the call degenerated to
        # greedy decoding.
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
    )
    # Strip the prompt tokens so only the newly generated continuation is decoded.
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
def traducir_numero_a_texto(texto):
    """Spell out every standalone integer in ``texto`` as Spanish words.

    A space is first inserted at letter/digit boundaries (in both directions)
    so tokens like ``"tengo3gatos"`` become ``"tengo 3 gatos"`` before the
    digit runs are converted with ``num2words``.
    """
    # Separate digits glued to letters on either side.
    con_espacios = re.sub(r'([A-Za-z])(\d)', r'\1 \2', texto)
    con_espacios = re.sub(r'(\d)([A-Za-z])', r'\1 \2', con_espacios)

    def _a_palabras(coincidencia):
        # Convert a single matched digit run to its Spanish wording.
        return num2words(int(coincidencia.group()), lang='es')

    return re.sub(r'\b\d+\b', _a_palabras, con_espacios)
def infer(
    ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
):
    """Synthesize ``gen_text`` in the voice of the reference audio.

    Args:
        ref_audio_orig: Path to the reference audio file (voice to clone).
        ref_text: Transcript of the reference audio; blank triggers automatic
            transcription inside ``preprocess_ref_audio_text``.
        gen_text: Text to synthesize.
        model: Model name selected in the UI. NOTE(review): currently ignored —
            the single Spanish F5-TTS checkpoint is always used (see below).
        remove_silence: When True, post-process the waveform to trim silences.
        cross_fade_duration: Cross-fade length in seconds between audio chunks.
        speed: Speed factor forwarded to the inference pipeline.
        show_info: Callback used to surface status messages in the Gradio UI.

    Returns:
        Tuple of ``((sample_rate, waveform), spectrogram_png_path)`` suitable
        for ``gr.Audio`` and ``gr.Image`` outputs.
    """
    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
    # The `model` argument is not consulted: only one checkpoint is loaded.
    ema_model = F5TTS_ema_model
    # Normalize the prompt text: pad with a leading space and a trailing ". ",
    # then lowercase and spell out digits in Spanish words.
    if not gen_text.startswith(" "):
        gen_text = " " + gen_text
    if not gen_text.endswith(". "):
        gen_text += ". "
    gen_text = gen_text.lower()
    gen_text = traducir_numero_a_texto(gen_text)
    final_wave, final_sample_rate, combined_spectrogram = infer_process(
        ref_audio,
        ref_text,
        gen_text,
        ema_model,
        vocoder,
        cross_fade_duration=cross_fade_duration,
        speed=speed,
        show_info=show_info,
        progress=gr.Progress(),
    )
    # Remove silences
    if remove_silence:
        # Round-trip through a temp WAV file because the silence remover
        # operates on files, then reload the trimmed audio.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            sf.write(f.name, final_wave, final_sample_rate)
            remove_silence_for_generated_wav(f.name)
            final_wave, _ = torchaudio.load(f.name)
        final_wave = final_wave.squeeze().cpu().numpy()
    # Save spectrogram
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
        spectrogram_path = tmp_spectrogram.name
    save_spectrogram(combined_spectrogram, spectrogram_path)
    return (final_sample_rate, final_wave), spectrogram_path
# Batch TTS tab: reference audio + text in, synthesized audio + spectrogram out.
# (UI labels are intentionally in Spanish for the target audience.)
with gr.Blocks() as app_tts:
    gr.Markdown("# TTS por Lotes")
    ref_audio_input = gr.Audio(label="Audio de Referencia", type="filepath")
    gen_text_input = gr.Textbox(label="Texto para Generar", lines=10)
    model_choice = gr.Radio(choices=["F5-TTS"], label="Seleccionar Modelo TTS", value="F5-TTS")
    generate_btn = gr.Button("Sintetizar", variant="primary")
    with gr.Accordion("Configuraciones Avanzadas", open=False):
        # Optional manual transcript; blank means auto-transcription.
        ref_text_input = gr.Textbox(
            label="Texto de Referencia",
            info="Deja en blanco para transcribir automáticamente el audio de referencia. Si ingresas texto, sobrescribirá la transcripción automática.",
            lines=2,
        )
        # Experimental post-processing toggle (see `infer`).
        remove_silence = gr.Checkbox(
            label="Eliminar Silencios",
            info="El modelo tiende a producir silencios, especialmente en audios más largos. Podemos eliminar manualmente los silencios si es necesario. Ten en cuenta que esta es una característica experimental y puede producir resultados extraños. Esto también aumentará el tiempo de generación.",
            value=False,
        )
        speed_slider = gr.Slider(
            label="Velocidad",
            minimum=0.3,
            maximum=2.0,
            value=1.0,
            step=0.1,
            info="Ajusta la velocidad del audio.",
        )
        cross_fade_duration_slider = gr.Slider(
            label="Duración del Cross-Fade (s)",
            minimum=0.0,
            maximum=1.0,
            value=0.15,
            step=0.01,
            info="Establece la duración del cross-fade entre clips de audio.",
        )
    audio_output = gr.Audio(label="Audio Sintetizado")
    spectrogram_output = gr.Image(label="Espectrograma")
    # Wire the button to `infer`; inputs are passed positionally, matching
    # (ref_audio_orig, ref_text, gen_text, model, remove_silence,
    #  cross_fade_duration, speed).
    generate_btn.click(
        infer,
        inputs=[
            ref_audio_input,
            ref_text_input,
            gen_text_input,
            model_choice,
            remove_silence,
            cross_fade_duration_slider,
            speed_slider,
        ],
        outputs=[audio_output, spectrogram_output],
    )
# Top-level app: a tabbed interface currently exposing only the TTS tab.
with gr.Blocks() as app:
    gr.Markdown(
        """
# Spanish-F5

Esta es una interfaz web para F5 TTS, con un finetuning para poder hablar en castellano.
"""
    )
    gr.TabbedInterface(
        [app_tts],
        ["TTS"],
    )

if __name__ == "__main__":
    # Enable request queuing (required for gr.Progress) and start the server.
    app.queue().launch()