"""Gradio demo for Pocket-TTS.

Offers two ways to pick a voice: one of the built-in presets, or an uploaded
audio sample that is converted to WAV and used as a voice-cloning prompt.
"""

import os
import tempfile
import traceback

import gradio as gr
import numpy as np
from pydub import AudioSegment
from pocket_tts import TTSModel

# 1. Load the model
print("Loading Pocket-TTS model...")
# Ensure you have HF_TOKEN in your Space Secrets for cloning to work
tts = TTSModel.load_model()
print("Model loaded successfully.")

# 2. Define Presets (Simple Strings Only)
# Maps the display name shown in the dropdown to the internal voice ID string.
# URLs are deliberately not used here (the original author reported 404 errors
# with them).
PRESET_VOICES = {
    "Alba (American English)": "alba",
    "Marius (French)": "marius",
    "Jean (Narrator)": "jean",
    "Fantine": "fantine",
    "Javert": "javert",
    "Cosette": "cosette",
    "Eponine": "eponine",
    "Azelma": "azelma",
}


def _remove_quietly(path):
    """Delete *path* as best-effort cleanup of a temporary file."""
    try:
        os.remove(path)
    except OSError:
        # Deliberate best effort: a missing or locked temp file must not
        # mask the real result or error of the request.
        pass


def preprocess_audio(filepath):
    """Convert an audio file to a mono, 16-bit PCM WAV file.

    The input is decoded with pydub (which relies on ffmpeg for non-WAV
    formats), forced to one channel and a 2-byte sample width, and written to
    a freshly created temporary file. The sample rate is left unchanged.

    Args:
        filepath: Path to the source audio file (e.g. MP3, M4A, WAV).

    Returns:
        Path to the converted WAV file. The caller owns this file and is
        responsible for deleting it.

    Raises:
        gr.Error: If decoding or exporting the audio fails.
    """
    output_path = None
    try:
        print(f"Converting file: {filepath}")
        audio = AudioSegment.from_file(filepath)

        # Original author's note: Pocket-TTS works best with mono, 24000Hz or
        # 16000Hz, 16-bit. Only channel count and sample width are enforced
        # here, to prevent "RIFF id" errors.
        audio = audio.set_channels(1).set_sample_width(2)

        # A unique temp path avoids two requests that share the same upload
        # path overwriting each other's converted file.
        fd, output_path = tempfile.mkstemp(suffix="_fixed.wav")
        os.close(fd)

        # export() hands back an open file object; close it so the handle is
        # not leaked and the file can be removed later on every platform.
        audio.export(output_path, format="wav").close()

        print(f"Converted to: {output_path}")
        return output_path
    except Exception as e:
        traceback.print_exc()
        if output_path is not None:
            _remove_quietly(output_path)
        raise gr.Error(
            f"Audio conversion failed. Make sure ffmpeg is installed in packages.txt. Error: {e}"
        ) from e


def generate_speech(text, voice_choice, custom_voice_file):
    """Synthesize *text* with either a cloned voice or a preset voice.

    Args:
        text: The text to speak; must contain non-whitespace characters.
        voice_choice: Display name of a preset voice (a key of
            ``PRESET_VOICES``). Only used when no custom file is given.
        custom_voice_file: Path to an uploaded audio sample, or ``None`` to
            use the preset voice.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``samples`` is the result of
        calling ``.numpy()`` on the generated audio.

    Raises:
        gr.Error: If the text is empty, the preset is unknown, the audio
            conversion fails, or generation fails.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")

    clean_wav_path = None
    try:
        # LOGIC BRANCH 1: Custom Voice Upload
        if custom_voice_file is not None:
            print("--- Mode: Voice Cloning ---")

            # 1. Fix the audio file (Fixes 'RIFF id' error)
            clean_wav_path = preprocess_audio(custom_voice_file)

            # 2. Extract the speaker style from the converted WAV
            voice_state = tts.get_state_for_audio_prompt(clean_wav_path)

            # 3. Generate
            audio_tensor = tts.generate_audio(voice_state, text)

        # LOGIC BRANCH 2: Built-in Preset
        else:
            print("--- Mode: Preset Voice ---")
            voice_id = PRESET_VOICES.get(voice_choice)
            if voice_id is None:
                raise gr.Error("Please choose a preset voice or upload a voice sample.")
            print(f"Using Internal ID: {voice_id}")

            # The preset ID string is passed directly; the original author
            # avoided get_state_for_audio_prompt here because of 404 errors.
            # NOTE(review): confirm against the pocket_tts documentation that
            # generate_audio accepts a preset name string rather than a voice
            # state obtained from get_state_for_audio_prompt.
            audio_tensor = tts.generate_audio(voice_id, text)

        return (tts.sample_rate, audio_tensor.numpy())

    except gr.Error:
        # Already a user-facing error (e.g. from preprocess_audio); do not
        # wrap it a second time as a "Generation Error".
        raise
    except Exception as e:
        # Print full error to logs for debugging
        traceback.print_exc()
        raise gr.Error(f"Generation Error: {str(e)}") from e
    finally:
        # The converted WAV is only needed for this request.
        if clean_wav_path is not None:
            _remove_quietly(clean_wav_path)


# 3. Build Interface
with gr.Blocks(title="Pocket-TTS Demo") as demo:
    gr.Markdown("# 🗣️ Pocket-TTS (Fixed)")
    gr.Markdown("Supports Voice Cloning (MP3/WAV) & Built-in Voices.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to Speak",
                lines=4,
                value="This is a test of the pocket text to speech system.",
            )

            with gr.Accordion("Voice Settings", open=True):
                voice_dropdown = gr.Dropdown(
                    choices=list(PRESET_VOICES.keys()),
                    value="Alba (American English)",
                    label="Use a Preset Voice",
                )
                gr.Markdown("**OR**")
                voice_upload = gr.Audio(
                    label="Clone a Voice (Upload any audio)",
                    type="filepath",
                )

            submit_btn = gr.Button("Generate Audio", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Result", type="numpy")

    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, voice_upload],
        outputs=audio_output,
    )

demo.launch()