Spaces:
Running
on
Zero
Running
on
Zero
| import gradio as gr | |
| import asyncio | |
| import io | |
| import sys | |
| sys.path.insert(0, '.') | |
# Fall back to a no-op stand-in when the Hugging Face `spaces` package is not
# installed (e.g. local testing), so `spaces.GPU` can still be used as a decorator.
try:
    import spaces
except ImportError:
    class SpacesMock:
        """Minimal local replacement for the `spaces` module."""

        @staticmethod
        def GPU(func=None, **_options):
            """Return the decorated function unchanged.

            Works both as a bare decorator (``@spaces.GPU``) and in the
            parameterised form (``@spaces.GPU(duration=60)``); any keyword
            options are accepted and ignored.
            """
            if func is None:
                # Called with options only: hand back a pass-through decorator.
                return lambda wrapped: wrapped
            return func

    spaces = SpacesMock()
| from maya1.model_loader import Maya1Model | |
| from maya1.pipeline import Maya1Pipeline | |
| from maya1.prompt_builder import Maya1PromptBuilder | |
| from maya1.snac_decoder import SNACDecoder | |
| from maya1.constants import AUDIO_SAMPLE_RATE | |
# Built-in voices offered in the UI: two "Realistic" and two "Creative" presets.
# Each entry pairs a voice description with sample text containing emotion tags.
PRESET_CHARACTERS = {
    "Realistic: Sarcastic Male (American)": dict(
        description=(
            "Realistic male voice in the 30s age with a american accent. "
            "Low pitch, nasally timbre, conversational pacing, "
            "sarcastic tone delivery at low intensity, commercial domain, "
            "product_demo_voice role, formal delivery"
        ),
        example_text=(
            "<sarcastic> He really stood up there and said we need to "
            "<chuckle> save the world. <sigh> What a joke."
        ),
    ),
    "Realistic: Excited Female (Asian-American)": dict(
        description=(
            "Realistic female voice in the 20s age with a asian_american accent. "
            "Normal pitch, smooth timbre, conversational pacing, "
            "neutral tone delivery at high intensity, viral_content domain, "
            "meme_voice role, formal delivery"
        ),
        example_text=(
            "<excited> I am issuing a formal commendation for this particular item! "
            "It has exceeded all established metrics for excellence. "
            "<gasp> This is something I would actually spend my own money on. "
            "<laugh> Seriously!"
        ),
    ),
    "Creative: Alpha Leader (Indian)": dict(
        description=(
            "Creative, alpha character. "
            "Male voice in their 30s with a indian accent. "
            "Normal pitch, nasally timbre, very_fast pacing, "
            "energetic tone at medium intensity."
        ),
        example_text=(
            "<angry> I don't want to hear excuses, I only want to see solutions! "
            "<sigh> Get your teams together, brainstorm for thirty minutes, "
            "and come back to me with a plan. "
            "<excited> Now move!"
        ),
    ),
    "Creative: Vampire (Middle Eastern)": dict(
        description=(
            "Creative, vampire character. "
            "Male voice in their 40s with a middle_eastern accent. "
            "Low pitch, nasally timbre, very_slow pacing, "
            "excited tone at medium intensity."
        ),
        example_text=(
            "<whisper> Soon you will join me in this magnificent eternal darkness. "
            "<laugh> And we shall feast upon the world together, "
            "<excited> bound by this exquisite night forever. "
            "<mischievous>"
        ),
    ),
}
# Module-level slots for the shared model, prompt builder, SNAC decoder and
# pipeline. All start empty and are assigned by load_models().
model = prompt_builder = snac_decoder = pipeline = None
async def load_models():
    """Load the Maya1 vLLM model and build the speech pipeline (runs once).

    Populates the module-level ``model``, ``prompt_builder``, ``snac_decoder``
    and ``pipeline`` globals. Calling it again after a successful load is a
    no-op.

    Each component is created only if it is still missing, and readiness is
    judged by ``pipeline`` rather than ``model``. If an earlier call failed
    part-way (for example while constructing the decoder), the next call
    resumes from the missing component instead of leaving ``pipeline`` as
    ``None`` forever.
    """
    global model, prompt_builder, snac_decoder, pipeline

    if pipeline is not None:
        return

    if model is None:
        print("Loading Maya1 model with vLLM...")
        model = Maya1Model(
            model_path="maya-research/maya1",
            dtype="bfloat16",
            max_model_len=8192,
            gpu_memory_utilization=0.85,
        )

    if prompt_builder is None:
        print("Initializing prompt builder...")
        prompt_builder = Maya1PromptBuilder(model.tokenizer, model)

    if snac_decoder is None:
        print("Loading SNAC decoder...")
        snac_decoder = SNACDecoder(
            device="cuda",
            enable_batching=False,
        )

    print("Initializing pipeline...")
    pipeline = Maya1Pipeline(model, prompt_builder, snac_decoder)
    print("Models loaded successfully!")
def preset_selected(preset_name):
    """Return the ``(description, example_text)`` pair for the chosen preset.

    A name that is not a key of ``PRESET_CHARACTERS`` yields two empty strings.
    """
    if preset_name not in PRESET_CHARACTERS:
        return "", ""

    preset = PRESET_CHARACTERS[preset_name]
    return preset["description"], preset["example_text"]
def _write_wav_tempfile(pcm_bytes, sample_rate):
    """Write mono, 2-byte-per-sample PCM to a temporary .wav file; return its path."""
    import tempfile
    import wave

    # delete=False: the file must outlive this function so the caller (and the
    # Gradio audio component) can read it by path afterwards.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        with wave.open(tmp, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(pcm_bytes)
    return tmp.name


# NOTE(review): `spaces` is imported by this file but this handler is not
# wrapped in `@spaces.GPU` — confirm whether the deployment target needs it.
def generate_speech(preset_name, description, text, temperature, max_tokens):
    """Generate emotional speech from description and text using vLLM.

    Args:
        preset_name: Key of ``PRESET_CHARACTERS`` currently selected in the UI
            (may be empty/None). Used only as a fallback source for the voice
            description when ``description`` is blank, so that a description
            typed by the user is not silently replaced by the preset.
        description: Free-form voice description.
        text: Text to speak; may contain emotion tags such as ``<laugh>``.
        temperature: Sampling temperature forwarded to the pipeline.
        max_tokens: Generation budget forwarded to the pipeline (coerced to int).

    Returns:
        ``(wav_path, status_message)`` on success, where ``wav_path`` is the
        path of a temporary ``.wav`` file (a file path is what a
        ``gr.Audio(type="filepath")`` output accepts), or
        ``(None, error_message)`` on invalid input or failure. Exceptions are
        caught and reported through the status message rather than raised.
    """
    description = (description or "").strip()
    text = (text or "").strip()

    # Fall back to the preset's description only when none was supplied.
    if not description and preset_name:
        description = PRESET_CHARACTERS.get(preset_name, {}).get("description", "")

    # Validate before loading models so bad input never pays the load cost.
    if not description or not text:
        return None, "Error: Please provide both description and text!"

    try:
        max_tokens = int(max_tokens)
        print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")

        async def _load_and_generate():
            # Load (if needed) and generate inside ONE event loop.
            # NOTE(review): every request still gets a fresh loop via
            # asyncio.run(); if the engine binds to the loop it was created
            # in, a long-lived loop would be needed — confirm against the
            # Maya1Model implementation.
            await load_models()
            return await pipeline.generate_speech(
                description=description,
                text=text,
                temperature=temperature,
                top_p=0.9,
                max_tokens=max_tokens,
                repetition_penalty=1.1,
                seed=None,
            )

        audio_bytes = asyncio.run(_load_and_generate())

        # Treat both None and an empty byte string as a failed generation.
        if not audio_bytes:
            return None, "Error: Audio generation failed. Try different text or increase max_tokens."

        wav_path = _write_wav_tempfile(audio_bytes, AUDIO_SAMPLE_RATE)

        # Two bytes per sample, one channel (matches the WAV header above).
        duration = len(audio_bytes) // 2 / AUDIO_SAMPLE_RATE
        status_msg = f"Generated {duration:.2f}s of emotional speech!"
        return wav_path, status_msg
    except Exception as e:
        import traceback

        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg
# Build the Gradio interface: voice controls in the first column, generated
# audio, status and help text in the second.
_PRESET_NAMES = list(PRESET_CHARACTERS)
_DEFAULT_PRESET_NAME = _PRESET_NAMES[0]
_DEFAULT_PRESET = PRESET_CHARACTERS[_DEFAULT_PRESET_NAME]

_INTRO_MARKDOWN = """
# Maya1 - Open Source Emotional Text-to-Speech
**The best open source voice AI model with emotions!**
Generate realistic and expressive speech with natural language voice design.
Choose a preset character or create your own custom voice.
[Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
"""

_HELP_MARKDOWN = """
### Supported Emotions
`<angry>` `<appalled>` `<chuckle>` `<cry>` `<curious>` `<disappointed>`
`<excited>` `<exhale>` `<gasp>` `<giggle>` `<gulp>` `<laugh>`
`<laugh_harder>` `<mischievous>` `<sarcastic>` `<scream>` `<sigh>`
`<sing>` `<snort>` `<whisper>`
### Tips
- Use emotion tags naturally in your text
- Longer text needs more max_tokens
- Lower temperature for consistent results
- Presets are great starting points!
"""

with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Input column: preset picker, voice description, text, sampling knobs.
        with gr.Column(scale=1):
            gr.Markdown("### Character Selection")
            preset_dropdown = gr.Dropdown(
                choices=_PRESET_NAMES,
                label="Preset Characters",
                value=_DEFAULT_PRESET_NAME,
                info="Quick pick from 4 preset characters",
            )

            gr.Markdown("### Voice Design")
            description_input = gr.Textbox(
                label="Voice Description",
                placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
                lines=3,
                value=_DEFAULT_PRESET["description"],
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter text with <emotion> tags like <laugh>, <sigh>, <excited>...",
                lines=4,
                value=_DEFAULT_PRESET["example_text"],
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.4,
                    step=0.1,
                    label="Temperature",
                    info="Lower = more stable, Higher = more creative",
                )
                max_tokens_slider = gr.Slider(
                    minimum=100,
                    maximum=2048,
                    value=500,
                    step=50,
                    label="Max Tokens",
                    info="More tokens = longer audio",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        # Output column: audio player, status box and usage help.
        # NOTE(review): which column the help text belongs to was reconstructed
        # from a listing without indentation — confirm against the deployed layout.
        with gr.Column(scale=1):
            gr.Markdown("### Generated Audio")
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
            )
            status_output = gr.Textbox(
                label="Status",
                lines=3,
                interactive=False,
            )
            gr.Markdown(_HELP_MARKDOWN)

    # Picking a preset refreshes both the description and the sample text.
    preset_dropdown.change(
        fn=preset_selected,
        inputs=[preset_dropdown],
        outputs=[description_input, text_input],
    )

    # The button runs synthesis and fills the audio player and status box.
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            preset_dropdown,
            description_input,
            text_input,
            temperature_slider,
            max_tokens_slider,
        ],
        outputs=[audio_output, status_output],
    )

if __name__ == "__main__":
    demo.launch()