Spaces:

FresherDifference
/

Pocket-TTS

Sleeping

App Files Files Community

Pocket-TTS / app.py

FresherDifference

Update app.py

909f184 verified 18 days ago

raw

history blame contribute delete

4.13 kB

	import gradio as gr
	import numpy as np
	import os
	from pydub import AudioSegment
	from pocket_tts import TTSModel

	# 1. Load the model
	# Runs once at import time; the result is bound to the module-level name
	# `tts`, which the speech-generation handler reads on every request.
	print("Loading Pocket-TTS model...")
	# Ensure you have HF_TOKEN in your Space Secrets for cloning to work
	tts = TTSModel.load_model()
	print("Model loaded successfully.")

	# 2. Built-in voice presets.
	# Keys are the labels shown in the UI dropdown; values are the plain string
	# identifiers handed to the model. Only identifiers are stored here (no URLs),
	# which the original author notes avoids 404 errors.
	# Insertion order is the order in which the dropdown lists the voices.
	PRESET_VOICES = {
	    # Voices whose label carries a description
	    "Alba (American English)": "alba",
	    "Marius (French)": "marius",
	    "Jean (Narrator)": "jean",
	    # Voices labelled by name only
	    "Fantine": "fantine",
	    "Javert": "javert",
	    "Cosette": "cosette",
	    "Eponine": "eponine",
	    "Azelma": "azelma",
	}

	def preprocess_audio(filepath):
	    """Convert an audio file to a mono, 16-bit WAV and return the new path.

	    The input is decoded with ``AudioSegment.from_file``, reduced to one
	    channel with 2-byte samples, and exported as WAV next to the original
	    under the name ``<filepath>_fixed.wav``. The sample rate of the input is
	    NOT changed by this function.

	    Args:
	        filepath: Path of the source audio file (str or path-like).

	    Returns:
	        str: Path of the converted WAV file. The caller owns this file and is
	        responsible for deleting it when it is no longer needed.

	    Raises:
	        gr.Error: If decoding or exporting fails. The underlying exception is
	            chained as ``__cause__`` so the full traceback stays available.
	    """
	    try:
	        print(f"Converting file: {filepath}")
	        audio = AudioSegment.from_file(filepath)

	        # Author's note: Pocket-TTS works best with mono, 24000Hz or 16000Hz,
	        # 16-bit. Only channel count and sample width are enforced here, which
	        # is what prevents the "RIFF id" errors; no resampling is performed.
	        audio = audio.set_channels(1).set_sample_width(2)

	        # f-string instead of `+` so a path-like `filepath` also works;
	        # for a plain str the resulting path is unchanged.
	        output_path = f"{filepath}_fixed.wav"
	        audio.export(output_path, format="wav")
	        print(f"Converted to: {output_path}")
	        return output_path
	    except Exception as e:
	        # Boundary handler: surface a user-facing message in the UI while
	        # keeping the original exception attached as the cause.
	        raise gr.Error(f"Audio conversion failed. Make sure ffmpeg is installed in packages.txt. Error: {e}") from e

	def generate_speech(text, voice_choice, custom_voice_file):
	    """Synthesize speech for `text` with a cloned or a preset voice.

	    If `custom_voice_file` is given, it is converted with `preprocess_audio`,
	    a voice state is extracted from the converted WAV, and that state is used
	    for generation. Otherwise `voice_choice` is looked up in `PRESET_VOICES`
	    and the resulting identifier string is passed to the model directly.

	    Args:
	        text: Text to speak. Must contain at least one non-whitespace
	            character.
	        voice_choice: A key of `PRESET_VOICES`; only used when no custom
	            voice file is supplied.
	        custom_voice_file: Path of an uploaded audio file to clone, or None.

	    Returns:
	        tuple: ``(tts.sample_rate, audio_tensor.numpy())``.

	    Raises:
	        gr.Error: For empty text, an unknown preset, a failed audio
	            conversion, or any failure during generation.
	    """
	    # Guard against None as well as blank strings; calling .strip() on None
	    # would raise an AttributeError instead of a readable message.
	    if not text or not text.strip():
	        raise gr.Error("Please enter some text.")

	    # Path of the temporary converted WAV, if one gets created.
	    clean_wav_path = None
	    try:
	        # LOGIC BRANCH 1: Custom Voice Upload
	        if custom_voice_file is not None:
	            print("--- Mode: Voice Cloning ---")

	            # 1. Fix the audio file (Fixes 'RIFF id' error)
	            clean_wav_path = preprocess_audio(custom_voice_file)

	            # 2. Extract the speaker style from the converted WAV
	            voice_state = tts.get_state_for_audio_prompt(clean_wav_path)

	            # 3. Generate
	            audio_tensor = tts.generate_audio(voice_state, text)

	        # LOGIC BRANCH 2: Built-in Preset
	        else:
	            print("--- Mode: Preset Voice ---")
	            # A cleared dropdown or an unknown label would otherwise surface
	            # as a cryptic KeyError message.
	            if voice_choice not in PRESET_VOICES:
	                raise gr.Error("Please select a preset voice or upload a voice sample.")
	            voice_id = PRESET_VOICES[voice_choice]
	            print(f"Using Internal ID: {voice_id}")

	            # We pass the STRING directly.
	            # We do NOT use get_state_for_audio_prompt for presets (Fixes 404 error)
	            audio_tensor = tts.generate_audio(voice_id, text)

	        return (tts.sample_rate, audio_tensor.numpy())

	    except gr.Error:
	        # Already a user-facing error (e.g. from preprocess_audio or the
	        # preset check); do not re-wrap it as a "Generation Error".
	        raise
	    except Exception as e:
	        # Print full error to logs for debugging
	        import traceback
	        traceback.print_exc()
	        raise gr.Error(f"Generation Error: {str(e)}") from e
	    finally:
	        # Remove the converted WAV so repeated cloning requests do not pile
	        # up files on disk. Best-effort: a failed delete must not mask the
	        # real result or error.
	        if clean_wav_path is not None:
	            try:
	                os.remove(clean_wav_path)
	            except OSError:
	                pass

	# 3. Build Interface
	# Layout: a single row with two columns. The left column holds the text box,
	# the voice settings (preset dropdown OR an uploaded sample) and the submit
	# button; the right column shows the generated audio.
	with gr.Blocks(title="Pocket-TTS Demo") as demo:
	    gr.Markdown("# 🗣️ Pocket-TTS (Fixed)")
	    gr.Markdown("Supports Voice Cloning (MP3/WAV) & Built-in Voices.")

	    with gr.Row():
	        # Left column: inputs
	        with gr.Column():
	            text_input = gr.Textbox(
	                value="This is a test of the pocket text to speech system.",
	                lines=4,
	                label="Text to Speak",
	            )

	            with gr.Accordion("Voice Settings", open=True):
	                voice_dropdown = gr.Dropdown(
	                    label="Use a Preset Voice",
	                    choices=list(PRESET_VOICES),
	                    value="Alba (American English)",
	                )
	                gr.Markdown("OR")
	                # type="filepath" hands the handler a path on disk rather
	                # than decoded samples.
	                voice_upload = gr.Audio(
	                    type="filepath",
	                    label="Clone a Voice (Upload any audio)",
	                )

	            submit_btn = gr.Button("Generate Audio", variant="primary")

	        # Right column: output
	        with gr.Column():
	            audio_output = gr.Audio(label="Result", type="numpy")

	    # Wire the button to the handler; input order matches the handler's
	    # (text, voice_choice, custom_voice_file) parameters.
	    submit_btn.click(
	        generate_speech,
	        inputs=[text_input, voice_dropdown, voice_upload],
	        outputs=audio_output,
	    )

	demo.launch()