maya1 / app.py
Veena
Update Maya1 Gradio app with preset characters
06301dc
raw
history blame
8.73 kB
import gradio as gr
import asyncio
import io
import sys
sys.path.insert(0, '.')
# Mock spaces module for local testing.
# When the `spaces` package cannot be imported, fall back to a stand-in whose
# GPU decorator simply returns the decorated function unchanged.
try:
    import spaces
except ImportError:
    class SpacesMock:
        """Local stand-in for the ``spaces`` module used when it is not installed."""

        @staticmethod
        def GPU(func=None, **_options):
            """No-op replacement for the ``spaces.GPU`` decorator.

            Works in both decorator forms:

            * ``@spaces.GPU`` -- ``func`` is the decorated callable and is
              returned untouched.
            * ``@spaces.GPU(duration=...)`` -- called with options only; a
              pass-through decorator is returned and the options are ignored.
            """
            if callable(func):
                return func

            def passthrough(inner):
                return inner

            return passthrough

    spaces = SpacesMock()
from maya1.model_loader import Maya1Model
from maya1.pipeline import Maya1Pipeline
from maya1.prompt_builder import Maya1PromptBuilder
from maya1.snac_decoder import SNACDecoder
from maya1.constants import AUDIO_SAMPLE_RATE
# Preset characters (2 realistic + 2 creative).
# Each row is (preset name, voice description, example text); the mapping keeps
# the row order, and the first entry is what the UI shows by default.
PRESET_CHARACTERS = {
    name: {"description": description, "example_text": example_text}
    for name, description, example_text in (
        (
            "Male American",
            "Male voice in their 30s with american accent",
            "Hello world <laugh_harder> this is amazing <giggle> I love it",
        ),
        (
            "Female British",
            "Female voice in their 20s with british accent",
            "Welcome everyone <excited> let me tell you something <sigh> incredible",
        ),
        (
            "Robot",
            "Creative, ai_machine_voice character. Male voice with robotic timbre",
            "System initialized <whisper> processing data <gasp> computation complete",
        ),
        (
            "Singer",
            "Creative character. Female voice with smooth timbre",
            "Listen to this <sing> la la la <laugh> beautiful melody <giggle>",
        ),
    )
}
# Global pipeline state. Everything starts out unset; load_models() assigns
# the real objects and flips `models_loaded` once it has finished.
model = prompt_builder = snac_decoder = pipeline = None
models_loaded = False
def load_models():
    """Create the Maya1 model, prompt builder, SNAC decoder and pipeline.

    The objects are stored in the module-level globals. Once a load has
    completed (``models_loaded`` is true) later calls return immediately.
    """
    global model, prompt_builder, snac_decoder, pipeline, models_loaded

    if models_loaded:
        return

    import torch
    import os

    # Pick the device for the SNAC decoder; fall back to CPU without CUDA.
    if torch.cuda.is_available():
        device = "cuda"
        print(f"CUDA available: {torch.cuda.get_device_name(0)}")
    else:
        print("Warning: CUDA not available, using CPU")
        device = "cpu"

    # Only applied when the variable is not already set in the environment.
    # NOTE(review): presumably selects the legacy vLLM engine -- confirm.
    os.environ.setdefault("VLLM_USE_V1", "0")

    print("Loading Maya1 model with vLLM...")
    model = Maya1Model(
        model_path="maya-research/maya1",
        dtype="bfloat16",
        max_model_len=8192,
        gpu_memory_utilization=0.85,
    )

    print("Initializing prompt builder...")
    prompt_builder = Maya1PromptBuilder(model.tokenizer, model)

    print("Loading SNAC decoder...")
    snac_decoder = SNACDecoder(device=device, enable_batching=False)

    print("Initializing pipeline...")
    pipeline = Maya1Pipeline(model, prompt_builder, snac_decoder)

    # Set last, so a failure part-way through leaves the flag false and the
    # next call retries the load.
    models_loaded = True
    print("Models loaded successfully!")
def preset_selected(preset_name):
    """Return ``(description, example_text)`` for the selected preset.

    A name that is not a key of ``PRESET_CHARACTERS`` yields two empty strings.
    """
    character = PRESET_CHARACTERS.get(preset_name)
    if character is None:
        return "", ""
    return character["description"], character["example_text"]
@spaces.GPU
def generate_speech(preset_name, description, text, temperature, max_tokens):
    """Generate emotional speech from description and text using vLLM.

    Args:
        preset_name: Name of the selected preset character (may be empty).
            Its description is used only when ``description`` is blank, so a
            description typed by the user is never silently replaced.
        description: Natural-language voice description.
        text: Text to speak, optionally containing ``<emotion>`` tags.
        temperature: Sampling temperature forwarded to the pipeline.
        max_tokens: Generation token budget forwarded to the pipeline.

    Returns:
        A ``(audio, status)`` tuple. ``audio`` is the path of a WAV file on
        success, or ``None`` on failure; ``status`` is a human-readable
        message (including the traceback when an exception occurred).
    """
    import tempfile
    import traceback
    import wave

    try:
        # Fall back to the preset's description only when none was supplied.
        if not (description and description.strip()):
            if preset_name and preset_name in PRESET_CHARACTERS:
                description = PRESET_CHARACTERS[preset_name]["description"]

        # Validate before loading models so bad input fails fast and cheaply.
        has_description = bool(description and description.strip())
        has_text = bool(text and text.strip())
        if not (has_description and has_text):
            return None, "Error: Please provide both description and text!"

        # Load models if not already loaded
        load_models()

        print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")

        # The pipeline API is a coroutine; drive it on a private event loop
        # and always close that loop, even if generation raises.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            audio_bytes = loop.run_until_complete(
                pipeline.generate_speech(
                    description=description,
                    text=text,
                    temperature=temperature,
                    top_p=0.9,
                    max_tokens=max_tokens,
                    repetition_penalty=1.1,
                    seed=None,
                )
            )
        finally:
            loop.close()

        # Treat "no audio" and "empty audio" the same way.
        if not audio_bytes:
            return None, "Error: Audio generation failed. Try different text or increase max_tokens."

        # Write the samples as a mono, 2-bytes-per-sample WAV file. The audio
        # output component is configured with type="filepath", so hand it a
        # real path rather than an in-memory buffer. delete=False keeps the
        # file on disk after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            with wave.open(tmp_file, "wb") as wav_file:
                wav_file.setnchannels(1)
                wav_file.setsampwidth(2)
                wav_file.setframerate(AUDIO_SAMPLE_RATE)
                wav_file.writeframes(audio_bytes)
            wav_path = tmp_file.name

        # Two bytes per sample, one channel -> samples / rate = seconds.
        duration = len(audio_bytes) // 2 / AUDIO_SAMPLE_RATE
        status_msg = f"Generated {duration:.2f}s of emotional speech!"
        return wav_path, status_msg

    except Exception as e:
        # Top-level UI boundary: report the failure in the status box instead
        # of letting the exception escape the handler.
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg
# Create Gradio interface.
# The first preset doubles as the initial state of the dropdown and of the
# description / text boxes, so look it up once.
_preset_names = list(PRESET_CHARACTERS.keys())
_default_preset = PRESET_CHARACTERS[_preset_names[0]]

with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# Maya1 - Open Source Emotional Text-to-Speech
**The best open source voice AI model with emotions!**
Generate realistic and expressive speech with natural language voice design.
Choose a preset character or create your own custom voice.
[Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
""")

    with gr.Row():
        # Left column: inputs (preset, voice design, sampling settings).
        with gr.Column(scale=1):
            gr.Markdown("### Character Selection")
            preset_dropdown = gr.Dropdown(
                choices=_preset_names,
                label="Preset Characters",
                value=_preset_names[0],
                info="Quick pick from 4 preset characters",
            )

            gr.Markdown("### Voice Design")
            description_input = gr.Textbox(
                label="Voice Description",
                placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
                lines=3,
                value=_default_preset["description"],
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter text with <emotion> tags like <laugh>, <sigh>, <excited>...",
                lines=4,
                value=_default_preset["example_text"],
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.4,
                    step=0.1,
                    label="Temperature",
                    info="Lower = more stable, Higher = more creative",
                )
                max_tokens_slider = gr.Slider(
                    minimum=100,
                    maximum=2048,
                    value=500,
                    step=50,
                    label="Max Tokens",
                    info="More tokens = longer audio",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        # Right column: outputs (audio player, status text, tag reference).
        with gr.Column(scale=1):
            gr.Markdown("### Generated Audio")
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
            )
            status_output = gr.Textbox(
                label="Status",
                lines=3,
                interactive=False,
            )
            gr.Markdown("""
### Supported Emotions
`<angry>` `<chuckle>` `<cry>` `<disappointed>` `<excited>` `<gasp>`
`<giggle>` `<laugh>` `<laugh_harder>` `<sarcastic>` `<sigh>`
`<sing>` `<whisper>`
""")

    # Event handlers
    # Picking a preset refreshes both text boxes with that preset's values.
    preset_dropdown.change(
        fn=preset_selected,
        inputs=[preset_dropdown],
        outputs=[description_input, text_input],
    )
    # The button sends every input to the generator and shows its two results.
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            preset_dropdown,
            description_input,
            text_input,
            temperature_slider,
            max_tokens_slider,
        ],
        outputs=[audio_output, status_output],
    )

if __name__ == "__main__":
    demo.launch()