"""Gradio demo app for Pocket-TTS: preset voices and reference-audio voice cloning."""

import gradio as gr
import os
import re
import uuid
import scipy.io.wavfile
import torch
from pocket_tts import TTSModel

#for voice clone
from huggingface_hub import login

hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)

print("Loading TTS Model...")
# Bind the name up front so a failed load leaves a well-defined ``None``
# instead of an unbound global that would surface later as a NameError.
tts_model = None
try:
    tts_model = TTSModel.load_model()
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")


def get_tts_file_name(text, language="en"):
    """Build a unique ``.wav`` path under ``./ai_tts_voice/`` for *text*.

    The directory is created if it does not exist. The file name is made of
    a slug derived from the ASCII letters of *text* (at most 20 characters,
    ``"audio"`` when nothing usable remains), the lower-cased *language*
    tag and an 8-character random hex id.

    Args:
        text: Source text used to derive the readable part of the name.
            ``None`` or an empty string is accepted.
        language: Language tag embedded in the file name.

    Returns:
        The path of the file to write, as a string.
    """
    temp_audio_dir = "./ai_tts_voice/"
    os.makedirs(temp_audio_dir, exist_ok=True)

    # Keep ASCII letters and whitespace only, then turn every whitespace run
    # (spaces, tabs, newlines from the multi-line textbox) into a single "_"
    # so no control character can end up in the file name.
    clean = re.sub(r"[^a-zA-Z\s]", "", text or "")
    clean = re.sub(r"\s+", "_", clean.lower().strip())[:20] or "audio"

    uid = uuid.uuid4().hex[:8].upper()
    language = language.lower().strip()
    return os.path.join(temp_audio_dir, f"{clean}_{language}_{uid}.wav")


DEFAULT_VOICES = [
    "alba",
    "marius",
    "javert",
    "jean",
    "fantine",
    "cosette",
    "eponine",
    "azelma",
]


def generate_speech(text, mode, preset_voice, clone_audio_path):
    """Synthesize *text* to a WAV file and return the file path.

    Args:
        text: Text to speak. Must contain at least one non-whitespace
            character.
        mode: ``"Default Voices"`` to use *preset_voice*; any other value
            selects voice cloning from *clone_audio_path*.
        preset_voice: Voice name passed to the model in preset mode.
        clone_audio_path: Path (or URL) of the reference audio used in
            cloning mode.

    Returns:
        Path of the generated ``.wav`` file.

    Raises:
        gr.Error: If the input is missing, the model is not loaded, the
            voice prompt cannot be prepared, or generation fails.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text to generate speech.")

    if tts_model is None:
        raise gr.Error("TTS model is not loaded. Check the server logs and restart the app.")

    if mode == "Default Voices":
        # The dropdown value is None after the voice-clone example is loaded.
        if not preset_voice:
            raise gr.Error("Please select a voice.")
        print(f"Using preset voice: {preset_voice}")
        try:
            state = tts_model.get_state_for_audio_prompt(preset_voice)
        except Exception as e:
            error_msg = f"Error loading preset voice: {str(e)}"
            print(error_msg)
            raise gr.Error(error_msg) from e
    else:
        if not clone_audio_path:
            raise gr.Error("Please upload a reference audio file for cloning.")
        print(f"Cloning voice from: {clone_audio_path}")
        try:
            state = tts_model.get_state_for_audio_prompt(clone_audio_path)
        except Exception as e:
            error_msg = f"Error loading reference audio: {str(e)}. Please upload a valid WAV file."
            print(error_msg)
            raise gr.Error(error_msg) from e

    try:
        audio_tensor = tts_model.generate_audio(state, text)
        output_filename = get_tts_file_name(text)
        # detach()/cpu() make the conversion safe even if the tensor tracks
        # gradients or does not live on the CPU.
        scipy.io.wavfile.write(
            output_filename,
            tts_model.sample_rate,
            audio_tensor.detach().cpu().numpy(),
        )
        return output_filename
    except Exception as e:
        raise gr.Error(f"Generation failed: {str(e)}") from e


def toggle_inputs(mode):
    """Return visibility updates for (voice dropdown, reference-audio upload).

    The dropdown is shown only in ``"Default Voices"`` mode; the upload
    widget is shown in every other mode.
    """
    use_preset = mode == "Default Voices"
    return gr.update(visible=use_preset), gr.update(visible=not use_preset)


CUSTOM_CSS = """
.gradio-container { font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, sans-serif; }
.header-container { text-align: center; margin-bottom: 20px; }
.logo-img { margin: 0 auto; display: block; max-width: 100%; transition: transform 0.2s; }
.logo-img:hover { transform: scale(1.02); opacity: 0.9; }
.links-container a { text-decoration: none; color: #4a90e2; font-weight: 500; }
.links-container a:hover { text-decoration: underline; }
"""

HEADER_HTML = """

Note: This is not an official demo from Kyutai Labs

"""

# NOTE(review): the widget nesting below was reconstructed from a
# whitespace-collapsed source; confirm the layout (button outside the Group,
# examples in the right-hand column) matches the intended design.
with gr.Blocks(theme='JohnSmith9982/small_and_pretty', css=CUSTOM_CSS) as demo:
    gr.HTML(HEADER_HTML)

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text Input",
                placeholder="Hi, how are you?",
                lines=3,
                value="Hi, how are you?"
            )
            mode_radio = gr.Radio(
                choices=["Default Voices", "Voice Clone"],
                value="Default Voices",
                label="TTS Mode"
            )
            with gr.Group():
                dropdown_input = gr.Dropdown(
                    choices=DEFAULT_VOICES,
                    value="alba",
                    label="Select Voice",
                    visible=True
                )
                audio_upload = gr.Audio(
                    label="Upload Reference Audio (WAV recommended)",
                    type="filepath",
                    visible=False
                )
            generate_btn = gr.Button("Generate Audio", variant="primary")
            example_audio_url = "https://huggingface.co/kyutai/tts-voices/resolve/main/alba-mackenna/casual.wav"

        with gr.Column():
            output_audio = gr.Audio(label="Generated Speech", type="filepath")
            gr.Examples(
                examples=[
                    ["Hello, I am Fantine. Nice to meet you.", "Default Voices", "fantine", None],
                    ["I am Cosette, and the weather is lovely.", "Default Voices", "cosette", None],
                    ["Hey there, Eponine here.", "Default Voices", "eponine", None],
                    ["Greetings from Azelma.", "Default Voices", "azelma", None],
                    ["This is a voice cloning test using the uploaded reference audio.", "Voice Clone", None, example_audio_url],
                ],
                inputs=[text_input, mode_radio, dropdown_input, audio_upload],
                label="Click on an Example to Try"
            )

    # Show the voice dropdown or the upload widget depending on the mode.
    mode_radio.change(
        fn=toggle_inputs,
        inputs=[mode_radio],
        outputs=[dropdown_input, audio_upload]
    )

    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, mode_radio, dropdown_input, audio_upload],
        outputs=[output_audio]
    )

if __name__ == "__main__":
    demo.queue().launch(share=False, debug=False)