# Gradio demo app for Pocket-TTS (Hugging Face Space): built-in preset voices
# plus voice cloning from an uploaded audio file.
import gradio as gr
import numpy as np
import os
from pydub import AudioSegment
from pocket_tts import TTSModel
# 1. Load the model
# The model is loaded once, at import time, and the resulting module-level
# `tts` object is shared by every call to generate_speech().
print("Loading Pocket-TTS model...")
# Ensure you have HF_TOKEN in your Space Secrets for cloning to work
# (author's note; the token is not read anywhere in this file directly).
tts = TTSModel.load_model()
print("Model loaded successfully.")
# 2. Built-in voice presets.
# Keys are the labels shown in the UI dropdown; values are the short voice
# identifiers handed to the model. Plain identifier strings are used on
# purpose instead of URLs (the original author reports URLs caused 404 errors).
# Insertion order is the order in which the dropdown lists the voices.
PRESET_VOICES = {
    "Alba (American English)": "alba",
    "Marius (French)": "marius",
    "Jean (Narrator)": "jean",
    "Fantine": "fantine",
    "Javert": "javert",
    "Cosette": "cosette",
    "Eponine": "eponine",
    "Azelma": "azelma",
}
def preprocess_audio(filepath) -> str:
    """Convert an audio file to a mono, 16-bit PCM WAV file.

    The input is decoded with pydub's ``AudioSegment.from_file``, downmixed to
    a single channel, forced to a 2-byte sample width and exported as WAV.
    The sample rate is left unchanged.

    Args:
        filepath: Path (``str`` or path-like) of the audio file to convert.

    Returns:
        Path of the converted file: the input path with ``"_fixed.wav"``
        appended.

    Raises:
        gr.Error: If decoding or exporting fails for any reason. The original
            exception is attached as ``__cause__``.
    """
    try:
        print(f"Converting file: {filepath}")
        audio = AudioSegment.from_file(filepath)
        # Enforce mono / 16-bit so the exported file is a plain PCM WAV
        # (added by the original author to prevent "RIFF id" errors).
        audio = audio.set_channels(1).set_sample_width(2)
        # f-string instead of `+` so path-like objects are accepted too.
        output_path = f"{filepath}_fixed.wav"
        # export() returns the open output file handle; close it explicitly so
        # the data is flushed to disk before the caller reads the file.
        audio.export(output_path, format="wav").close()
        print(f"Converted to: {output_path}")
        return output_path
    except Exception as e:
        # Boundary handler: surface every failure to the UI, keeping the cause.
        raise gr.Error(
            f"Audio conversion failed. Make sure ffmpeg is installed in packages.txt. Error: {e}"
        ) from e
def generate_speech(text, voice_choice, custom_voice_file):
    """Synthesize ``text`` with either a cloned voice or a built-in preset.

    Args:
        text: Text to speak. ``None``, empty or whitespace-only text is
            rejected.
        voice_choice: A key of ``PRESET_VOICES``; used only when no custom
            voice file is supplied.
        custom_voice_file: Path to an uploaded audio file to clone the voice
            from, or ``None`` to use the preset selected by ``voice_choice``.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``samples`` is the result of
        calling ``.numpy()`` on the generated audio.

    Raises:
        gr.Error: For missing text, an unknown preset, an audio conversion
            failure, or any error raised during generation.
    """
    # Guard against None as well as blank text (None has no .strip()).
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")

    clean_wav_path = None
    try:
        if custom_voice_file is not None:
            # LOGIC BRANCH 1: Custom Voice Upload
            print("--- Mode: Voice Cloning ---")
            # Normalise the upload to PCM WAV first (fixes 'RIFF id' errors).
            clean_wav_path = preprocess_audio(custom_voice_file)
            prompt = clean_wav_path
        else:
            # LOGIC BRANCH 2: Built-in Preset
            print("--- Mode: Preset Voice ---")
            voice_id = PRESET_VOICES.get(voice_choice)
            if voice_id is None:
                raise gr.Error(f"Unknown preset voice: {voice_choice!r}")
            print(f"Using Internal ID: {voice_id}")
            prompt = voice_id

        # generate_audio() is given a voice *state* in the cloning path, so the
        # preset path must build one as well rather than passing the bare
        # identifier string. The short preset name (not a URL) is passed here.
        # NOTE(review): relies on get_state_for_audio_prompt accepting built-in
        # voice names, per the pocket-tts README — confirm for the installed version.
        voice_state = tts.get_state_for_audio_prompt(prompt)
        audio_tensor = tts.generate_audio(voice_state, text)
        return (tts.sample_rate, audio_tensor.numpy())
    except gr.Error:
        # Already a user-facing error (e.g. from preprocess_audio); do not
        # re-wrap it as a "Generation Error".
        raise
    except Exception as e:
        # Print full error to logs for debugging
        import traceback
        traceback.print_exc()
        raise gr.Error(f"Generation Error: {str(e)}") from e
    finally:
        # Best-effort removal of the converted WAV so repeated cloning
        # requests do not accumulate files on disk.
        if clean_wav_path is not None:
            try:
                os.remove(clean_wav_path)
            except OSError:
                pass
# 3. Build Interface
# Two-column layout: inputs (text, voice settings, button) on the left and the
# generated audio on the right.
with gr.Blocks(title="Pocket-TTS Demo") as demo:
    gr.Markdown("# 🗣️ Pocket-TTS (Fixed)")
    gr.Markdown("Supports Voice Cloning (MP3/WAV) & Built-in Voices.")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to Speak",
                lines=4,
                value="This is a test of the pocket text to speech system.",
            )
            with gr.Accordion("Voice Settings", open=True):
                voice_dropdown = gr.Dropdown(
                    choices=list(PRESET_VOICES),
                    value="Alba (American English)",
                    label="Use a Preset Voice",
                )
                gr.Markdown("**OR**")
                # type="filepath" hands generate_speech a path on disk (or
                # None when nothing was uploaded), which selects its branch.
                voice_upload = gr.Audio(
                    label="Clone a Voice (Upload any audio)",
                    type="filepath",
                )
            submit_btn = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Result", type="numpy")
    # Event wiring must happen inside the Blocks context.
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, voice_upload],
        outputs=audio_output,
    )

demo.launch()