Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import numpy as np | |
| import os | |
| from pydub import AudioSegment | |
| from pocket_tts import TTSModel | |
# 1. Load the model
# The model is loaded once, at import time, and shared by every request
# through the module-level name `tts`.
# Original author's note: an HF_TOKEN secret must be configured in the Space
# for voice cloning to work (not verifiable from this file).
print("Loading Pocket-TTS model...")
tts = TTSModel.load_model()
print("Model loaded successfully.")
# 2. Define Presets (Simple Strings Only)
# Maps the label shown in the dropdown to the internal voice ID string.
# Only plain IDs are stored here (no URLs); the original author did this to
# avoid 404 errors when resolving preset voices.
PRESET_VOICES = {
    # Presets whose label carries extra descriptive text.
    "Alba (American English)": "alba",
    "Marius (French)": "marius",
    "Jean (Narrator)": "jean",
    # Presets whose ID is simply the lower-cased label.
    **{
        label: label.lower()
        for label in ("Fantine", "Javert", "Cosette", "Eponine", "Azelma")
    },
}
def preprocess_audio(filepath):
    """Convert an audio file to a mono, 16-bit PCM WAV and return its path.

    The input is decoded with pydub (``AudioSegment.from_file``), down-mixed
    to one channel, forced to a 2-byte sample width and exported as WAV to
    ``filepath + "_fixed.wav"``. The sample rate is NOT changed here.

    Args:
        filepath: Path of the uploaded audio file (any format pydub can read).

    Returns:
        Path of the newly written WAV file. The caller owns this file and is
        responsible for deleting it.

    Raises:
        gr.Error: If decoding or exporting fails; the underlying exception is
            attached as ``__cause__``.
    """
    print(f"Converting file: {filepath}")
    try:
        audio = AudioSegment.from_file(filepath)
        # Enforce standard WAV settings; the original author did this to
        # prevent "RIFF id" errors when the model reads the prompt.
        audio = audio.set_channels(1).set_sample_width(2)
        output_path = filepath + "_fixed.wav"
        audio.export(output_path, format="wav")
    except Exception as e:
        # Chain the cause so the real decoder/ffmpeg failure stays visible in
        # the logs instead of being replaced by the user-facing message.
        raise gr.Error(
            f"Audio conversion failed. Make sure ffmpeg is installed in packages.txt. Error: {e}"
        ) from e
    print(f"Converted to: {output_path}")
    return output_path
def generate_speech(text, voice_choice, custom_voice_file):
    """Synthesize ``text`` with a cloned voice or a built-in preset.

    Args:
        text: Text to speak. ``None``, empty or whitespace-only is rejected.
        voice_choice: Dropdown label; must be a key of ``PRESET_VOICES``.
            Ignored when ``custom_voice_file`` is provided.
        custom_voice_file: Path of an uploaded reference recording, or
            ``None`` to use the preset voice.

    Returns:
        A ``(tts.sample_rate, audio_tensor.numpy())`` tuple, as consumed by
        the ``gr.Audio(type="numpy")`` output component.

    Raises:
        gr.Error: On missing text, missing/unknown preset, audio conversion
            failure, or any error raised during generation.
    """
    # Guard against None as well as blank text (None has no .strip()).
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")

    clean_wav_path = None
    try:
        # LOGIC BRANCH 1: Custom Voice Upload
        if custom_voice_file is not None:
            print("--- Mode: Voice Cloning ---")
            # 1. Normalize the upload to a WAV file the model can read.
            clean_wav_path = preprocess_audio(custom_voice_file)
            # 2. Build the voice state from the normalized recording.
            voice_state = tts.get_state_for_audio_prompt(clean_wav_path)
            # 3. Generate
            audio_tensor = tts.generate_audio(voice_state, text)
        # LOGIC BRANCH 2: Built-in Preset
        else:
            print("--- Mode: Preset Voice ---")
            voice_id = PRESET_VOICES.get(voice_choice)
            if voice_id is None:
                # A cleared dropdown would otherwise surface as a cryptic
                # KeyError-based "Generation Error: None".
                raise gr.Error(
                    "Please select a preset voice or upload a voice sample."
                )
            print(f"Using Internal ID: {voice_id}")
            # The preset ID string is passed directly, as in the original.
            # NOTE(review): confirm that generate_audio accepts a voice name
            # here rather than a state from get_state_for_audio_prompt.
            audio_tensor = tts.generate_audio(voice_id, text)
        return (tts.sample_rate, audio_tensor.numpy())
    except gr.Error:
        # Already a user-facing message (e.g. from preprocess_audio); do not
        # wrap it a second time.
        raise
    except Exception as e:
        # Print full error to logs for debugging
        import traceback
        traceback.print_exc()
        raise gr.Error(f"Generation Error: {str(e)}") from e
    finally:
        # Remove the converted prompt so repeated cloning requests do not
        # accumulate "*_fixed.wav" files on disk.
        if clean_wav_path is not None and clean_wav_path != custom_voice_file:
            try:
                os.remove(clean_wav_path)
            except OSError:
                # Best-effort cleanup: a failed delete must not mask the
                # result or the real error.
                pass
# 3. Build Interface
# NOTE(review): the nesting below is reconstructed from component semantics
# (the original indentation was lost) — confirm against the deployed layout.
with gr.Blocks(title="Pocket-TTS Demo") as demo:
    gr.Markdown("# 🗣️ Pocket-TTS (Fixed)")
    gr.Markdown("Supports Voice Cloning (MP3/WAV) & Built-in Voices.")

    with gr.Row():
        # Left column: text, voice selection and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                value="This is a test of the pocket text to speech system.",
                label="Text to Speak",
                lines=4,
            )

            with gr.Accordion("Voice Settings", open=True):
                # Either pick a preset...
                voice_dropdown = gr.Dropdown(
                    label="Use a Preset Voice",
                    choices=list(PRESET_VOICES),
                    value="Alba (American English)",
                )
                gr.Markdown("**OR**")
                # ...or upload a recording; delivered to the handler as a path.
                voice_upload = gr.Audio(
                    type="filepath",
                    label="Clone a Voice (Upload any audio)",
                )

            submit_btn = gr.Button("Generate Audio", variant="primary")

        # Right column: the synthesized audio.
        with gr.Column():
            audio_output = gr.Audio(type="numpy", label="Result")

    # Wire the button to the handler: (text, preset label, upload) -> audio.
    submit_btn.click(
        generate_speech,
        [text_input, voice_dropdown, voice_upload],
        audio_output,
    )

demo.launch()