Spaces:

maya-research
/

maya1

Running on Zero

maya1 / app.py

Veena

Update Maya1 Gradio app with preset characters

06301dc about 1 month ago

8.73 kB

	import gradio as gr
	import asyncio
	import io
	import sys
	sys.path.insert(0, '.')

	# Fall back to a no-op stand-in for the `spaces` package when it is not
	# installed (e.g. local testing outside Hugging Face Spaces).
	try:
	    import spaces
	except ImportError:
	    class SpacesMock:
	        """Minimal replacement exposing a pass-through ``GPU`` decorator."""

	        @staticmethod
	        def GPU(func=None, **_options):
	            """Return the decorated function unchanged.

	            Works both as a bare decorator (``@spaces.GPU``) and as a
	            parametrised one (``@spaces.GPU(duration=60)``); any keyword
	            options are accepted and ignored.
	            """
	            if func is None:
	                # Called with options only: hand back the real decorator.
	                return lambda wrapped: wrapped
	            return func

	    spaces = SpacesMock()

	from maya1.model_loader import Maya1Model
	from maya1.pipeline import Maya1Pipeline
	from maya1.prompt_builder import Maya1PromptBuilder
	from maya1.snac_decoder import SNACDecoder
	from maya1.constants import AUDIO_SAMPLE_RATE

	# Built-in voices offered in the UI: two realistic and two creative.
	# Each row is (display name, voice description, example text); insertion
	# order is the order shown in the dropdown.
	PRESET_CHARACTERS = {
	    name: {"description": description, "example_text": example_text}
	    for name, description, example_text in (
	        (
	            "Male American",
	            "Male voice in their 30s with american accent",
	            "Hello world <laugh_harder> this is amazing <giggle> I love it",
	        ),
	        (
	            "Female British",
	            "Female voice in their 20s with british accent",
	            "Welcome everyone <excited> let me tell you something <sigh> incredible",
	        ),
	        (
	            "Robot",
	            "Creative, ai_machine_voice character. Male voice with robotic timbre",
	            "System initialized <whisper> processing data <gasp> computation complete",
	        ),
	        (
	            "Singer",
	            "Creative character. Female voice with smooth timbre",
	            "Listen to this <sing> la la la <laugh> beautiful melody <giggle>",
	        ),
	    )
	}

	# Module-level pipeline state. Everything starts unset and is filled in
	# by load_models() on first use; models_loaded records that this happened.
	model = prompt_builder = snac_decoder = pipeline = None
	models_loaded = False

	def load_models():
	    """Build the Maya1 model, prompt builder, SNAC decoder and pipeline.

	    The objects are stored in the module-level globals. The work happens
	    only on the first call: once ``models_loaded`` is true, later calls
	    return immediately.
	    """
	    global model, prompt_builder, snac_decoder, pipeline, models_loaded

	    if models_loaded:
	        return

	    import os
	    import torch

	    # Pick the device for the SNAC decoder; fall back to CPU without CUDA.
	    has_cuda = torch.cuda.is_available()
	    device = "cuda" if has_cuda else "cpu"
	    if has_cuda:
	        print(f"CUDA available: {torch.cuda.get_device_name(0)}")
	    else:
	        print("Warning: CUDA not available, using CPU")

	    # vLLM setting; setdefault leaves any value already in the environment.
	    os.environ.setdefault("VLLM_USE_V1", "0")

	    print("Loading Maya1 model with vLLM...")
	    model_options = {
	        "model_path": "maya-research/maya1",
	        "dtype": "bfloat16",
	        "max_model_len": 8192,
	        "gpu_memory_utilization": 0.85,
	    }
	    model = Maya1Model(**model_options)

	    print("Initializing prompt builder...")
	    prompt_builder = Maya1PromptBuilder(model.tokenizer, model)

	    print("Loading SNAC decoder...")
	    snac_decoder = SNACDecoder(device=device, enable_batching=False)

	    print("Initializing pipeline...")
	    pipeline = Maya1Pipeline(model, prompt_builder, snac_decoder)

	    # Flag is set last so a failed load is retried on the next call.
	    models_loaded = True
	    print("Models loaded successfully!")

	def preset_selected(preset_name):
	    """Return the (description, example text) pair for a chosen preset.

	    Used as the dropdown change handler. An unknown preset name yields a
	    pair of empty strings.
	    """
	    preset = PRESET_CHARACTERS.get(preset_name)
	    if preset is None:
	        return "", ""
	    return preset["description"], preset["example_text"]

	@spaces.GPU
	def generate_speech(preset_name, description, text, temperature, max_tokens):
	    """Generate emotional speech from a voice description and text using vLLM.

	    Args:
	        preset_name: Name of the selected preset character. Its description
	            is used only as a fallback when ``description`` is blank, so a
	            description edited or typed by the user is respected.
	        description: Natural-language voice description.
	        text: Text to speak; may contain ``<emotion>`` tags.
	        temperature: Sampling temperature passed to the pipeline.
	        max_tokens: Generation token budget passed to the pipeline.

	    Returns:
	        A ``(audio, status)`` tuple. ``audio`` is the path of a WAV file
	        containing the generated speech, or ``None`` on failure. ``status``
	        is a human-readable message (an error text plus traceback when an
	        exception occurred).
	    """
	    try:
	        # Load models if not already loaded
	        load_models()

	        # Fall back to the preset description only when none was supplied.
	        if not (description or "").strip():
	            if preset_name and preset_name in PRESET_CHARACTERS:
	                description = PRESET_CHARACTERS[preset_name]["description"]

	        # Validate inputs (whitespace-only counts as missing)
	        if not (description or "").strip() or not (text or "").strip():
	            return None, "Error: Please provide both description and text!"

	        print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")

	        # pipeline.generate_speech is a coroutine; drive it on a private
	        # event loop and always tear the loop down, even on failure.
	        loop = asyncio.new_event_loop()
	        try:
	            asyncio.set_event_loop(loop)
	            audio_bytes = loop.run_until_complete(
	                pipeline.generate_speech(
	                    description=description,
	                    text=text,
	                    temperature=temperature,
	                    top_p=0.9,
	                    max_tokens=max_tokens,
	                    repetition_penalty=1.1,
	                    seed=None,
	                )
	            )
	        finally:
	            # Do not leave a closed loop installed as the current loop.
	            asyncio.set_event_loop(None)
	            loop.close()

	        if not audio_bytes:
	            return None, "Error: Audio generation failed. Try different text or increase max_tokens."

	        # Write the raw samples to a WAV file (mono, 2 bytes per sample).
	        # A file path is returned because the Gradio Audio output accepts
	        # a path, bytes or a (rate, array) tuple, but not a BytesIO object.
	        import tempfile
	        import wave
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
	            wav_path = tmp_file.name
	        with wave.open(wav_path, 'wb') as wav_file:
	            wav_file.setnchannels(1)
	            wav_file.setsampwidth(2)
	            wav_file.setframerate(AUDIO_SAMPLE_RATE)
	            wav_file.writeframes(audio_bytes)

	        # Duration in seconds: two bytes per sample at AUDIO_SAMPLE_RATE.
	        duration = len(audio_bytes) // 2 / AUDIO_SAMPLE_RATE

	        status_msg = f"Generated {duration:.2f}s of emotional speech!"

	        return wav_path, status_msg

	    except Exception as e:
	        # UI boundary: report the failure in the status box instead of
	        # letting the exception propagate to Gradio.
	        import traceback
	        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
	        print(error_msg)
	        return None, error_msg

	# Create Gradio interface.
	# NOTE(review): the nesting below was reconstructed from a source whose
	# indentation had been flattened; confirm component placement (notably the
	# "Supported Emotions" panel) against the deployed layout.

	# The first preset seeds the dropdown and both text boxes.
	_default_preset_name = next(iter(PRESET_CHARACTERS))
	_default_preset = PRESET_CHARACTERS[_default_preset_name]

	with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
	    gr.Markdown("""
	# Maya1 - Open Source Emotional Text-to-Speech

	The best open source voice AI model with emotions!

	Generate realistic and expressive speech with natural language voice design.
	Choose a preset character or create your own custom voice.

	[Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
	""")

	    with gr.Row():
	        # Left column: inputs.
	        with gr.Column(scale=1):
	            gr.Markdown("### Character Selection")

	            preset_dropdown = gr.Dropdown(
	                choices=list(PRESET_CHARACTERS),
	                label="Preset Characters",
	                value=_default_preset_name,
	                # Count is derived so the hint stays correct if presets change.
	                info=f"Quick pick from {len(PRESET_CHARACTERS)} preset characters"
	            )

	            gr.Markdown("### Voice Design")

	            description_input = gr.Textbox(
	                label="Voice Description",
	                placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
	                lines=3,
	                value=_default_preset["description"]
	            )

	            text_input = gr.Textbox(
	                label="Text to Speak",
	                placeholder="Enter text with <emotion> tags like <laugh>, <sigh>, <excited>...",
	                lines=4,
	                value=_default_preset["example_text"]
	            )

	            with gr.Accordion("Advanced Settings", open=False):
	                temperature_slider = gr.Slider(
	                    minimum=0.1,
	                    maximum=1.0,
	                    value=0.4,
	                    step=0.1,
	                    label="Temperature",
	                    info="Lower = more stable, Higher = more creative"
	                )

	                max_tokens_slider = gr.Slider(
	                    minimum=100,
	                    maximum=2048,
	                    value=500,
	                    step=50,
	                    label="Max Tokens",
	                    info="More tokens = longer audio"
	                )

	            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

	        # Right column: outputs.
	        with gr.Column(scale=1):
	            gr.Markdown("### Generated Audio")

	            audio_output = gr.Audio(
	                label="Generated Speech",
	                type="filepath",
	                interactive=False
	            )

	            status_output = gr.Textbox(
	                label="Status",
	                lines=3,
	                interactive=False
	            )

	            gr.Markdown("""
	### Supported Emotions

	`<angry>` `<chuckle>` `<cry>` `<disappointed>` `<excited>` `<gasp>`
	`<giggle>` `<laugh>` `<laugh_harder>` `<sarcastic>` `<sigh>`
	`<sing>` `<whisper>`
	""")

	    # Event handlers (registered inside the Blocks context).
	    # Selecting a preset refreshes the description and example text.
	    preset_dropdown.change(
	        fn=preset_selected,
	        inputs=[preset_dropdown],
	        outputs=[description_input, text_input]
	    )

	    # The button runs generation and fills the audio player and status box.
	    generate_btn.click(
	        fn=generate_speech,
	        inputs=[preset_dropdown, description_input, text_input, temperature_slider, max_tokens_slider],
	        outputs=[audio_output, status_output]
	    )

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()