# Hugging Face Space (status: Running on Zero)
| import gradio as gr | |
| import asyncio | |
| import io | |
| import sys | |
| sys.path.insert(0, '.') | |
# Fall back to a no-op stand-in when the `spaces` package is not installed
# (local testing).
# NOTE(review): nothing in the code reviewed here applies `spaces.GPU`;
# confirm whether the GPU-bound handler is meant to be decorated with it.
try:
    import spaces
except ImportError:
    class SpacesMock:
        """Minimal replacement for the `spaces` module exposing a no-op `GPU`."""

        @staticmethod
        def GPU(func=None, **_options):
            """Return the decorated function unchanged.

            Declared as a staticmethod so that calling it through the
            ``spaces`` instance does not bind the instance as ``func``.
            Works both as ``@spaces.GPU`` and as ``@spaces.GPU(duration=...)``;
            any keyword options are accepted and ignored.
            """
            if callable(func):
                return func

            def decorator(wrapped):
                return wrapped

            return decorator

    spaces = SpacesMock()
| from maya1.model_loader import Maya1Model | |
| from maya1.pipeline import Maya1Pipeline | |
| from maya1.prompt_builder import Maya1PromptBuilder | |
| from maya1.snac_decoder import SNACDecoder | |
| from maya1.constants import AUDIO_SAMPLE_RATE | |
# Preset characters (2 realistic + 2 creative).
# Each row is (display name, voice description, example text); the mapping
# keeps the row order, which is also the order shown in the UI dropdown.
PRESET_CHARACTERS = {
    name: {"description": description, "example_text": example_text}
    for name, description, example_text in (
        (
            "Male American",
            "Male voice in their 30s with american accent",
            "Hello world <laugh_harder> this is amazing <giggle> I love it",
        ),
        (
            "Female British",
            "Female voice in their 20s with british accent",
            "Welcome everyone <excited> let me tell you something <sigh> incredible",
        ),
        (
            "Robot",
            "Creative, ai_machine_voice character. Male voice with robotic timbre",
            "System initialized <whisper> processing data <gasp> computation complete",
        ),
        (
            "Singer",
            "Creative character. Female voice with smooth timbre",
            "Listen to this <sing> la la la <laugh> beautiful melody <giggle>",
        ),
    )
}
# Module-level pipeline state. Everything starts unset; load_models() fills
# these in and flips models_loaded to True.
model = prompt_builder = snac_decoder = pipeline = None
models_loaded = False
def load_models():
    """Build the Maya1 model, prompt builder, SNAC decoder and pipeline once.

    The results are stored in the module-level globals ``model``,
    ``prompt_builder``, ``snac_decoder`` and ``pipeline``. When
    ``models_loaded`` is already True the function returns immediately.
    """
    global model, prompt_builder, snac_decoder, pipeline, models_loaded

    if models_loaded:
        return

    # Imported here rather than at module top.
    import torch
    import os

    # Pick the device handed to the SNAC decoder.
    if torch.cuda.is_available():
        target_device = "cuda"
        print(f"CUDA available: {torch.cuda.get_device_name(0)}")
    else:
        print("Warning: CUDA not available, using CPU")
        target_device = "cpu"

    # Environment variable for vLLM; setdefault leaves an existing value alone.
    os.environ.setdefault("VLLM_USE_V1", "0")

    print("Loading Maya1 model with vLLM...")
    model_options = {
        "model_path": "maya-research/maya1",
        "dtype": "bfloat16",
        "max_model_len": 8192,
        "gpu_memory_utilization": 0.85,
    }
    model = Maya1Model(**model_options)

    print("Initializing prompt builder...")
    prompt_builder = Maya1PromptBuilder(model.tokenizer, model)

    print("Loading SNAC decoder...")
    snac_decoder = SNACDecoder(device=target_device, enable_batching=False)

    print("Initializing pipeline...")
    pipeline = Maya1Pipeline(model, prompt_builder, snac_decoder)

    # Only mark as loaded once every component has been constructed.
    models_loaded = True
    print("Models loaded successfully!")
def preset_selected(preset_name):
    """Return the (description, example_text) pair for the chosen preset.

    An unknown preset name yields a pair of empty strings.
    """
    preset = PRESET_CHARACTERS.get(preset_name)
    if preset is None:
        return "", ""
    return preset["description"], preset["example_text"]
def _save_pcm_as_wav(pcm_bytes, sample_rate):
    """Write 16-bit mono PCM bytes to a temporary .wav file and return its path."""
    import tempfile
    import wave

    # delete=False: the file must outlive this function so the UI can read it.
    # Cleanup of these files is left to the OS / hosting environment.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        with wave.open(tmp_file, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(pcm_bytes)
    return tmp_file.name


def generate_speech(preset_name, description, text, temperature, max_tokens):
    """Generate emotional speech from a voice description and text using vLLM.

    Args:
        preset_name: Name of the selected preset (may be empty/None). Its
            description is used only when ``description`` is blank, so a
            description edited by the user is honoured.
        description: Natural-language voice description.
        text: Text to speak, optionally containing emotion tags.
        temperature: Sampling temperature passed to the pipeline.
        max_tokens: Generation budget; coerced to int before use.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path
        of a WAV file on success and ``None`` on any failure, in which case
        ``status_message`` starts with ``"Error:"``.
    """
    try:
        description = (description or "").strip()
        text = (text or "").strip()

        # The dropdown always carries a value, so the preset may only act as
        # a fallback; otherwise a custom description could never be used.
        if not description and preset_name:
            preset = PRESET_CHARACTERS.get(preset_name)
            if preset is not None:
                description = preset["description"]

        # Validate before the (expensive) model load.
        if not description or not text:
            return None, "Error: Please provide both description and text!"

        # Load models if not already loaded
        load_models()

        max_tokens = int(max_tokens)
        print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")

        # The pipeline is async; drive it on a private event loop and make
        # sure the loop is closed even when generation raises.
        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            audio_bytes = loop.run_until_complete(
                pipeline.generate_speech(
                    description=description,
                    text=text,
                    temperature=temperature,
                    top_p=0.9,
                    max_tokens=max_tokens,
                    repetition_penalty=1.1,
                    seed=None,
                )
            )
        finally:
            loop.close()

        if not audio_bytes:
            return None, "Error: Audio generation failed. Try different text or increase max_tokens."

        # The Audio output component accepts a file path (not a BytesIO), so
        # persist the PCM data as a WAV file and hand back its path.
        wav_path = _save_pcm_as_wav(audio_bytes, AUDIO_SAMPLE_RATE)

        # Two bytes per sample (16-bit mono).
        duration = len(audio_bytes) // 2 / AUDIO_SAMPLE_RATE
        status_msg = f"Generated {duration:.2f}s of emotional speech!"
        return wav_path, status_msg
    except Exception as e:
        # UI boundary: report the failure in the status box instead of crashing.
        import traceback
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg
# Create Gradio interface
# NOTE(review): confirm the widget nesting (which components sit in which
# column / accordion) against the intended layout.
with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown("""
    # Maya1 - Open Source Emotional Text-to-Speech
    **The best open source voice AI model with emotions!**
    Generate realistic and expressive speech with natural language voice design.
    Choose a preset character or create your own custom voice.
    [Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
    """)

    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=1):
            gr.Markdown("### Character Selection")
            preset_dropdown = gr.Dropdown(
                label="Preset Characters",
                choices=list(PRESET_CHARACTERS),
                # The first preset is the initial selection.
                value=list(PRESET_CHARACTERS)[0],
                info="Quick pick from 4 preset characters",
            )

            gr.Markdown("### Voice Design")
            description_input = gr.Textbox(
                label="Voice Description",
                lines=3,
                placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
                value=PRESET_CHARACTERS[list(PRESET_CHARACTERS)[0]]["description"],
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                lines=4,
                placeholder="Enter text with <emotion> tags like <laugh>, <sigh>, <excited>...",
                value=PRESET_CHARACTERS[list(PRESET_CHARACTERS)[0]]["example_text"],
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature_slider = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=1.0,
                    step=0.1,
                    value=0.4,
                    info="Lower = more stable, Higher = more creative",
                )
                max_tokens_slider = gr.Slider(
                    label="Max Tokens",
                    minimum=100,
                    maximum=2048,
                    step=50,
                    value=500,
                    info="More tokens = longer audio",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        # Right column: outputs.
        with gr.Column(scale=1):
            gr.Markdown("### Generated Audio")
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
            )
            status_output = gr.Textbox(
                label="Status",
                lines=3,
                interactive=False,
            )
            gr.Markdown("""
            ### Supported Emotions
            `<angry>` `<chuckle>` `<cry>` `<disappointed>` `<excited>` `<gasp>`
            `<giggle>` `<laugh>` `<laugh_harder>` `<sarcastic>` `<sigh>`
            `<sing>` `<whisper>`
            """)

    # Event handlers
    # Choosing a preset refreshes both the description and the example text.
    preset_dropdown.change(
        preset_selected,
        inputs=[preset_dropdown],
        outputs=[description_input, text_input],
    )
    # The button feeds all five inputs to generate_speech and shows the
    # resulting audio together with the status message.
    generate_btn.click(
        generate_speech,
        inputs=[
            preset_dropdown,
            description_input,
            text_input,
            temperature_slider,
            max_tokens_slider,
        ],
        outputs=[audio_output, status_output],
    )

if __name__ == "__main__":
    demo.launch()