# Pocket TTS — Gradio demo (preset voices + voice cloning), deployable as a Hugging Face Space.
| import gradio as gr | |
| import os | |
| import re | |
| import uuid | |
| import scipy.io.wavfile | |
| import torch | |
| from pocket_tts import TTSModel | |
| #for voice clone | |
| from huggingface_hub import login | |
# Authenticate with the Hugging Face Hub when a token is provided (needed to
# pull gated model weights); skip silently otherwise.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)

print("Loading TTS Model...")
try:
    tts_model = TTSModel.load_model()
    print("Model loaded successfully.")
except Exception as e:
    # Fail fast: the original code swallowed this error and continued, which
    # left `tts_model` undefined and produced a confusing NameError on the
    # first generation request instead of surfacing the real load failure.
    print(f"Error loading model: {e}")
    raise
def get_tts_file_name(text, language="en"):
    """Build a unique .wav output path under ./ai_tts_voice/ from the text.

    The filename is `<slug>_<language>_<random-id>.wav`, where the slug is the
    first 20 characters of the text with everything but letters removed and
    spaces turned into underscores ("audio" when nothing survives).
    """
    out_dir = "./ai_tts_voice/"
    os.makedirs(out_dir, exist_ok=True)
    # Keep only letters and whitespace, then slug-ify and truncate.
    slug = re.sub(r'[^a-zA-Z\s]', '', text or "")
    slug = slug.lower().strip().replace(" ", "_")[:20]
    if not slug:
        slug = "audio"
    token = uuid.uuid4().hex[:8].upper()
    lang = language.lower().strip()
    return os.path.join(out_dir, f"{slug}_{lang}_{token}.wav")
# Preset voice names bundled with pocket-tts (Les Misérables-themed), offered
# in the "Default Voices" dropdown and passed to get_state_for_audio_prompt.
DEFAULT_VOICES = [
    "alba", "marius", "javert", "jean",
    "fantine", "cosette", "eponine", "azelma"
]
def generate_speech(text, mode, preset_voice, clone_audio_path):
    """Synthesize `text` and return the path of the generated .wav file.

    Args:
        text: Text to speak; must be non-empty.
        mode: "Default Voices" to use `preset_voice`, anything else to clone
            the voice from `clone_audio_path`.
        preset_voice: Name of a bundled voice (used only in preset mode).
        clone_audio_path: Path to a reference audio file (used only in clone mode).

    Raises:
        gr.Error: with a user-facing message on missing input, a bad reference
            file, or a generation failure.
    """
    if not text:
        raise gr.Error("Please enter text to generate speech.")
    state = None
    if mode == "Default Voices":
        print(f"Using preset voice: {preset_voice}")
        state = tts_model.get_state_for_audio_prompt(preset_voice)
    else:
        if not clone_audio_path:
            raise gr.Error("Please upload a reference audio file for cloning.")
        print(f"Cloning voice from: {clone_audio_path}")
        try:
            state = tts_model.get_state_for_audio_prompt(clone_audio_path)
        except Exception as e:
            error_msg = f"Error loading reference audio: {str(e)}. Please upload a valid WAV file."
            print(error_msg)
            raise gr.Error(error_msg)
    try:
        audio_tensor = tts_model.generate_audio(state, text)
        output_filename = get_tts_file_name(text)
        # .detach().cpu() makes .numpy() safe for CUDA and grad-tracking
        # tensors; a bare .numpy() raises in either case.
        scipy.io.wavfile.write(
            output_filename,
            tts_model.sample_rate,
            audio_tensor.detach().cpu().numpy(),
        )
        return output_filename
    except Exception as e:
        raise gr.Error(f"Generation failed: {str(e)}")
def toggle_inputs(mode):
    """Toggle which voice input is visible for the chosen TTS mode.

    Returns a pair of gr.update objects: (preset dropdown visibility,
    clone-audio uploader visibility) — exactly one is visible at a time.
    """
    use_presets = mode == "Default Voices"
    return gr.update(visible=use_presets), gr.update(visible=not use_presets)
# App-wide CSS: system font stack, centered header, a subtle hover zoom on the
# logo, and styled links in the header link row. Class names match HEADER_HTML.
CUSTOM_CSS = """
.gradio-container {
font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, sans-serif;
}
.header-container {
text-align: center;
margin-bottom: 20px;
}
.logo-img {
margin: 0 auto;
display: block;
max-width: 100%;
transition: transform 0.2s;
}
.logo-img:hover {
transform: scale(1.02);
opacity: 0.9;
}
.links-container a {
text-decoration: none;
color: #4a90e2;
font-weight: 500;
}
.links-container a:hover {
text-decoration: underline;
}
"""
# Static header markup rendered at the top of the app: Kyutai logo linking to
# the project page, links to GitHub / the model card / a Colab badge, and a
# disclaimer that this is an unofficial demo.
HEADER_HTML = """
<div class="header-container" style="text-align:center;">
<a href="https://kyutai.org/tts" target="_blank" title="Visit Kyutai TTS">
<img src="https://raw.githubusercontent.com/kyutai-labs/pocket-tts/refs/heads/main/docs/logo.png"
class="logo-img" width="200">
</a>
<div class="links-container"
style="
margin-top: 18px;
display: flex;
justify-content: center;
align-items: center;
gap: 14px;
flex-wrap: wrap;
">
<a href="https://github.com/kyutai-labs/pocket-tts"
target="_blank"
style="text-decoration:none;">
🐱 GitHub Repository
</a>
<span style="color: gray;">|</span>
<a href="https://huggingface.co/kyutai/pocket-tts"
target="_blank"
style="text-decoration:none;">
🤗 Hugging Face Model Card
</a>
<span style="color: gray;">|</span>
<a href="https://colab.research.google.com/github/NeuralFalconYT/Voice-Clone/blob/main/Pocket_TTS_Colab.ipynb"
target="_blank"
style="
display: inline-flex;
align-items: center;
">
<img src="https://colab.research.google.com/assets/colab-badge.svg"
alt="Open in Colab"
height="26">
</a>
</div>
<p style="font-size: 0.8em; color: gray; margin-top: 10px;">
<i>Note: This is not an official demo from Kyutai Labs</i>
</p>
</div>
"""
# --- UI layout and event wiring ---------------------------------------------
# NOTE(review): original indentation was lost in extraction; nesting below is
# reconstructed from Gradio layout semantics — confirm against the live Space.
with gr.Blocks(theme='JohnSmith9982/small_and_pretty', css=CUSTOM_CSS) as demo:
    gr.HTML(HEADER_HTML)
    with gr.Row():
        with gr.Column():
            # Left column: text to speak, mode selector, and the two
            # mode-dependent voice inputs (only one visible at a time).
            text_input = gr.Textbox(
                label="Text Input",
                placeholder="Hi, how are you?",
                lines=3,
                value="Hi, how are you?"
            )
            mode_radio = gr.Radio(
                choices=["Default Voices", "Voice Clone"],
                value="Default Voices",
                label="TTS Mode"
            )
            with gr.Group():
                # Visibility of this pair is flipped by toggle_inputs
                # whenever mode_radio changes.
                dropdown_input = gr.Dropdown(
                    choices=DEFAULT_VOICES,
                    value="alba",
                    label="Select Voice",
                    visible=True
                )
                audio_upload = gr.Audio(
                    label="Upload Reference Audio (WAV recommended)",
                    type="filepath",
                    visible=False
                )
            generate_btn = gr.Button("Generate Audio", variant="primary")
            # Remote sample referenced by the voice-clone example row below.
            example_audio_url = "https://huggingface.co/kyutai/tts-voices/resolve/main/alba-mackenna/casual.wav"
        with gr.Column():
            # Right column: playback of the generated file.
            output_audio = gr.Audio(label="Generated Speech", type="filepath")
    # Clickable examples that pre-fill all four inputs.
    gr.Examples(
        examples=[
            ["Hello, I am Fantine. Nice to meet you.", "Default Voices", "fantine", None],
            ["I am Cosette, and the weather is lovely.", "Default Voices", "cosette", None],
            ["Hey there, Eponine here.", "Default Voices", "eponine", None],
            ["Greetings from Azelma.", "Default Voices", "azelma", None],
            ["This is a voice cloning test using the uploaded reference audio.", "Voice Clone", None, example_audio_url],
        ],
        inputs=[text_input, mode_radio, dropdown_input, audio_upload],
        label="Click on an Example to Try"
    )
    # Swap preset-dropdown / upload widget when the mode changes.
    mode_radio.change(
        fn=toggle_inputs,
        inputs=[mode_radio],
        outputs=[dropdown_input, audio_upload]
    )
    # Run synthesis and send the resulting file path to the output player.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, mode_radio, dropdown_input, audio_upload],
        outputs=[output_audio]
    )

if __name__ == "__main__":
    # queue() serializes generation requests so concurrent users don't
    # contend for the single loaded model.
    demo.queue().launch(share=False, debug=False)