# Source: Hugging Face Space "FresherDifference/Pocket-TTS"
# (Space status when captured: Sleeping; file size: 4,125 bytes)

import gradio as gr
import numpy as np
import os
from pydub import AudioSegment
from pocket_tts import TTSModel

# 1. Load the model
# The model is loaded once at import time; the module-level `tts` object is
# shared by every call to generate_speech().
print("Loading Pocket-TTS model...")
# Ensure you have HF_TOKEN in your Space Secrets for cloning to work
tts = TTSModel.load_model()
print("Model loaded successfully.")

# 2. Define Presets (Simple Strings Only)
# Maps the label shown in the "Use a Preset Voice" dropdown (key) to the
# internal voice ID string (value) that generate_speech() hands to the model.
# Values are plain IDs rather than URLs to avoid 404 errors.
PRESET_VOICES = {
    "Alba (American English)": "alba",
    "Marius (French)": "marius",
    "Jean (Narrator)": "jean",
    "Fantine": "fantine",
    "Javert": "javert",
    "Cosette": "cosette",
    "Eponine": "eponine",
    "Azelma": "azelma",
}

def preprocess_audio(filepath: str) -> str:
    """Convert an uploaded audio file to a mono, 16-bit PCM WAV file.

    The input is decoded with pydub (any format it can read, e.g. MP3, M4A,
    WAV), down-mixed to one channel, forced to a 2-byte sample width and
    written next to the original as ``<filepath>_fixed.wav``. The sample rate
    is left unchanged.

    Args:
        filepath: Path of the audio file to convert.

    Returns:
        Path of the converted WAV file.

    Raises:
        gr.Error: If decoding or exporting fails for any reason; the original
            exception is attached as the cause.
    """
    try:
        print(f"Converting file: {filepath}")
        audio = AudioSegment.from_file(filepath)

        # Enforce mono / 16-bit so the model receives a plain PCM WAV
        # (prevents "RIFF id" errors on compressed or exotic inputs).
        audio = audio.set_channels(1).set_sample_width(2)

        output_path = filepath + "_fixed.wav"
        # export() hands back the open output file handle; close it
        # explicitly instead of leaving it to the garbage collector.
        audio.export(output_path, format="wav").close()
        print(f"Converted to: {output_path}")
        return output_path
    except Exception as e:
        # Surface a user-visible error while keeping the root cause chained.
        raise gr.Error(f"Audio conversion failed. Make sure ffmpeg is installed in packages.txt. Error: {e}") from e

def generate_speech(text, voice_choice, custom_voice_file):
    """Synthesize speech for ``text`` with a cloned or preset voice.

    If ``custom_voice_file`` is given it takes precedence: the file is
    converted with preprocess_audio(), a voice state is extracted from it and
    used for generation. Otherwise ``voice_choice`` is looked up in
    PRESET_VOICES and the resulting ID is used.

    Args:
        text: Text to speak; must contain non-whitespace characters.
        voice_choice: A key of PRESET_VOICES (ignored when a file is given).
        custom_voice_file: Path to an uploaded voice sample, or None.

    Returns:
        A ``(sample_rate, samples)`` tuple built from ``tts.sample_rate`` and
        ``audio_tensor.numpy()``.

    Raises:
        gr.Error: On empty text, an unknown preset, a failed audio conversion
            or any failure during generation.
    """
    # Guard against None as well as blank strings.
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")

    # Validate the preset up front so a cleared dropdown yields a clear
    # message instead of a KeyError wrapped as "Generation Error: None".
    voice_id = None
    if custom_voice_file is None:
        voice_id = PRESET_VOICES.get(voice_choice)
        if voice_id is None:
            raise gr.Error("Please choose a preset voice or upload a voice sample.")

    clean_wav_path = None
    try:
        # LOGIC BRANCH 1: Custom Voice Upload
        if custom_voice_file is not None:
            print("--- Mode: Voice Cloning ---")

            # 1. Normalize the upload to a PCM WAV (fixes 'RIFF id' error)
            clean_wav_path = preprocess_audio(custom_voice_file)

            # 2. Extract the speaker style from the converted WAV
            voice_state = tts.get_state_for_audio_prompt(clean_wav_path)

            # 3. Generate
            audio_tensor = tts.generate_audio(voice_state, text)

        # LOGIC BRANCH 2: Built-in Preset
        else:
            print("--- Mode: Preset Voice ---")
            print(f"Using Internal ID: {voice_id}")

            # The preset ID string is passed directly rather than through
            # get_state_for_audio_prompt (kept from the original 404 fix).
            # NOTE(review): the cloning branch passes a voice state here while
            # this branch passes a bare string; confirm generate_audio
            # accepts both.
            audio_tensor = tts.generate_audio(voice_id, text)

        return (tts.sample_rate, audio_tensor.numpy())

    except Exception as e:
        # Print full error to logs for debugging
        import traceback
        traceback.print_exc()
        # Errors already meant for the user (e.g. from preprocess_audio) are
        # passed through untouched instead of being wrapped a second time.
        if isinstance(e, gr.Error):
            raise
        raise gr.Error(f"Generation Error: {str(e)}") from e
    finally:
        # Best-effort removal of the converted file so repeated cloning
        # requests do not accumulate "_fixed.wav" files on disk.
        if clean_wav_path is not None:
            try:
                os.remove(clean_wav_path)
            except OSError:
                pass

# 3. Build Interface
# Two-column layout: inputs (text, voice settings, submit button) on the left,
# the generated audio on the right. Components are placed in the order they
# are created inside each context manager.
with gr.Blocks(title="Pocket-TTS Demo") as demo:
    gr.Markdown("# 🗣️ Pocket-TTS (Fixed)")
    gr.Markdown("Supports Voice Cloning (MP3/WAV) & Built-in Voices.")
    
    with gr.Row():
        # Left column: everything the user provides.
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to Speak", 
                lines=4,
                value="This is a test of the pocket text to speech system."
            )
            
            with gr.Accordion("Voice Settings", open=True):
                # Dropdown labels are the keys of PRESET_VOICES.
                voice_dropdown = gr.Dropdown(
                    choices=list(PRESET_VOICES.keys()), 
                    value="Alba (American English)", 
                    label="Use a Preset Voice"
                )
                gr.Markdown("**OR**")
                # type="filepath" makes the handler receive a path string
                # (or None when nothing is uploaded); an upload takes
                # precedence over the preset in generate_speech().
                voice_upload = gr.Audio(
                    label="Clone a Voice (Upload any audio)", 
                    type="filepath"
                )
            
            submit_btn = gr.Button("Generate Audio", variant="primary")
            
        # Right column: the synthesized result.
        with gr.Column():
            # type="numpy" matches the (sample_rate, samples) tuple that
            # generate_speech() returns.
            audio_output = gr.Audio(label="Result", type="numpy")

    # Wire the button to the handler; input order matches the parameter
    # order of generate_speech(text, voice_choice, custom_voice_file).
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, voice_upload],
        outputs=audio_output
    )

# Start the Gradio server as soon as the module is executed.
demo.launch()