File size: 4,125 Bytes
0468b28
f48358e
909f184
 
0468b28
 
909f184
f48358e
909f184
f48358e
 
 
909f184
 
 
f48358e
909f184
 
 
 
 
 
 
 
f48358e
 
909f184
f48358e
909f184
 
f48358e
909f184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0468b28
909f184
f48358e
909f184
 
 
 
 
 
 
 
 
 
 
 
 
 
f48358e
909f184
 
 
 
 
 
 
 
 
f48358e
909f184
f48358e
 
909f184
 
 
 
f48358e
909f184
f48358e
909f184
 
f48358e
 
 
 
 
 
909f184
f48358e
 
 
 
 
 
909f184
f48358e
 
 
909f184
f48358e
 
 
 
 
 
909f184
f48358e
 
 
 
0468b28
 
 
f48358e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import gradio as gr
import numpy as np
import os
from pydub import AudioSegment
from pocket_tts import TTSModel

# 1. Load the model
# This runs once at import time and binds the module-level `tts` object that
# the request handler below uses for every synthesis call.
print("Loading Pocket-TTS model...")
# Ensure you have HF_TOKEN in your Space Secrets for cloning to work
tts = TTSModel.load_model()
print("Model loaded successfully.")

# 2. Define Presets (Simple Strings Only)
# Maps the label shown in the dropdown (key) to the internal voice ID string
# (value) that is handed to the model.
# We do NOT use URLs here to avoid 404 errors.
PRESET_VOICES = {
    "Alba (American English)": "alba",
    "Marius (French)": "marius",
    "Jean (Narrator)": "jean",
    "Fantine": "fantine",
    "Javert": "javert",
    "Cosette": "cosette",
    "Eponine": "eponine",
    "Azelma": "azelma",
}

def preprocess_audio(filepath):
    """
    Convert an arbitrary audio file (MP3, M4A, WAV, ...) to a mono,
    16-bit PCM WAV file that the model can read.

    Args:
        filepath: Path to the source audio file.

    Returns:
        Path of the converted file, which is written next to the source as
        ``filepath + "_fixed.wav"``. The caller owns this file.

    Raises:
        gr.Error: If the file cannot be decoded or exported. The underlying
            exception is chained as ``__cause__`` so the full traceback
            remains available in the logs.
    """
    try:
        print(f"Converting file: {filepath}")
        audio = AudioSegment.from_file(filepath)

        # Force mono and 16-bit samples to prevent "RIFF id" errors.
        # The sample rate is left as decoded; only channel count and
        # sample width are normalised here.
        audio = audio.set_channels(1).set_sample_width(2)

        output_path = filepath + "_fixed.wav"
        audio.export(output_path, format="wav")
        print(f"Converted to: {output_path}")
        return output_path
    except Exception as e:
        # Chain the original exception instead of discarding its traceback.
        raise gr.Error(
            f"Audio conversion failed. Make sure ffmpeg is installed in packages.txt. Error: {e}"
        ) from e

def generate_speech(text, voice_choice, custom_voice_file):
    """
    Synthesize speech for ``text`` using a cloned voice or a built-in preset.

    Args:
        text: Text to speak. Must contain at least one non-whitespace
            character.
        voice_choice: Display name of a preset (a key of ``PRESET_VOICES``).
            Only consulted when ``custom_voice_file`` is None.
        custom_voice_file: Path to an uploaded voice sample, or None to use
            the preset. When given, it takes precedence over ``voice_choice``.

    Returns:
        A ``(sample_rate, samples)`` tuple built from ``tts.sample_rate`` and
        the generated audio converted with ``.numpy()``.

    Raises:
        gr.Error: If the text is empty, no valid preset is selected, the
            uploaded audio cannot be converted, or generation fails.
    """
    # `text` may be None (e.g. a cleared field); treat it like empty input
    # instead of failing on None.strip().
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")

    use_custom_voice = custom_voice_file is not None

    # Reject a missing/unknown preset up front so the user gets a clear
    # message rather than a KeyError wrapped as a generation failure.
    if not use_custom_voice and voice_choice not in PRESET_VOICES:
        raise gr.Error("Please choose a preset voice or upload a voice sample.")

    try:
        # LOGIC BRANCH 1: Custom Voice Upload
        if use_custom_voice:
            print("--- Mode: Voice Cloning ---")

            # 1. Normalise the upload to a PCM WAV (fixes 'RIFF id' error)
            clean_wav_path = preprocess_audio(custom_voice_file)
            try:
                # 2. Extract the speaker style from the converted WAV
                voice_state = tts.get_state_for_audio_prompt(clean_wav_path)

                # 3. Generate
                audio_tensor = tts.generate_audio(voice_state, text)
            finally:
                # The converted file is only an intermediate artefact;
                # remove it (best effort) so repeated requests do not
                # accumulate files on disk.
                try:
                    os.remove(clean_wav_path)
                except OSError:
                    pass

        # LOGIC BRANCH 2: Built-in Preset
        else:
            print("--- Mode: Preset Voice ---")
            voice_id = PRESET_VOICES[voice_choice]
            print(f"Using Internal ID: {voice_id}")

            # We pass the STRING directly.
            # We do NOT use get_state_for_audio_prompt for presets (Fixes 404 error)
            # NOTE(review): the cloning branch passes a state object while
            # this branch passes a bare name — confirm generate_audio
            # accepts both forms.
            audio_tensor = tts.generate_audio(voice_id, text)

        return (tts.sample_rate, audio_tensor.numpy())

    except gr.Error:
        # Already a user-facing error (e.g. from preprocess_audio); do not
        # wrap it a second time.
        raise
    except Exception as e:
        # Print full error to logs for debugging
        import traceback
        traceback.print_exc()
        raise gr.Error(f"Generation Error: {str(e)}") from e

# 3. Build Interface
# Two-column layout: text, voice settings and the trigger button on the left,
# the synthesized audio on the right.
with gr.Blocks(title="Pocket-TTS Demo") as demo:
    gr.Markdown("# 🗣️ Pocket-TTS (Fixed)")
    gr.Markdown("Supports Voice Cloning (MP3/WAV) & Built-in Voices.")

    with gr.Row():
        # Left column: inputs
        with gr.Column():
            text_input = gr.Textbox(
                value="This is a test of the pocket text to speech system.",
                label="Text to Speak",
                lines=4,
            )

            with gr.Accordion("Voice Settings", open=True):
                # Iterating a dict yields its keys, i.e. the display names.
                voice_dropdown = gr.Dropdown(
                    label="Use a Preset Voice",
                    choices=list(PRESET_VOICES),
                    value="Alba (American English)",
                )
                gr.Markdown("**OR**")
                # type="filepath" hands the handler a path on disk.
                voice_upload = gr.Audio(
                    type="filepath",
                    label="Clone a Voice (Upload any audio)",
                )

            submit_btn = gr.Button("Generate Audio", variant="primary")

        # Right column: output
        with gr.Column():
            audio_output = gr.Audio(type="numpy", label="Result")

    # Wire the button to the synthesis handler.
    submit_btn.click(
        generate_speech,
        inputs=[text_input, voice_dropdown, voice_upload],
        outputs=audio_output,
    )

demo.launch()