# Hugging Face Spaces: maya-research / maya1
# Running on Zero
# File size: 8,728 Bytes

import gradio as gr
import asyncio
import io
import sys
sys.path.insert(0, '.')

# Mock spaces module for local testing
try:
    import spaces
except ImportError:
    class SpacesMock:
        """Minimal stand-in for the ``spaces`` package when it is not installed."""

        @staticmethod
        def GPU(func=None, **_options):
            """No-op replacement for the ``spaces.GPU`` decorator.

            Supports both decorator spellings so code written against the
            real package also runs locally:

            * ``@spaces.GPU`` -- ``func`` is the decorated callable and is
              returned unchanged.
            * ``@spaces.GPU(...)`` -- called with keyword options only; the
              options are ignored and an identity decorator is returned.
            """
            if callable(func):
                return func

            def passthrough(wrapped):
                return wrapped

            return passthrough

    spaces = SpacesMock()

from maya1.model_loader import Maya1Model
from maya1.pipeline import Maya1Pipeline
from maya1.prompt_builder import Maya1PromptBuilder
from maya1.snac_decoder import SNACDecoder
from maya1.constants import AUDIO_SAMPLE_RATE

# Built-in voices offered in the UI: two realistic speakers and two creative
# characters. Each entry pairs a voice description with a sample line that
# contains emotion tags.
PRESET_CHARACTERS = {
    "Male American": dict(
        description="Male voice in their 30s with american accent",
        example_text="Hello world <laugh_harder> this is amazing <giggle> I love it",
    ),
    "Female British": dict(
        description="Female voice in their 20s with british accent",
        example_text="Welcome everyone <excited> let me tell you something <sigh> incredible",
    ),
    "Robot": dict(
        description="Creative, ai_machine_voice character. Male voice with robotic timbre",
        example_text="System initialized <whisper> processing data <gasp> computation complete",
    ),
    "Singer": dict(
        description="Creative character. Female voice with smooth timbre",
        example_text="Listen to this <sing> la la la <laugh> beautiful melody <giggle>",
    ),
}

# Module-level pipeline state. Everything starts out unset; load_models()
# fills these in and then flips models_loaded so the work happens only once.
model = prompt_builder = snac_decoder = pipeline = None
models_loaded = False

def load_models():
    """Create the Maya1 model, prompt builder, SNAC decoder and pipeline.

    The objects are stored in the module-level globals. After a successful
    run ``models_loaded`` is set, so later calls return immediately.
    """
    global model, prompt_builder, snac_decoder, pipeline, models_loaded

    if models_loaded:
        return

    import os
    import torch

    # Choose the device handed to the SNAC decoder and report what was found.
    cuda_ok = torch.cuda.is_available()
    device = "cuda" if cuda_ok else "cpu"
    if cuda_ok:
        print(f"CUDA available: {torch.cuda.get_device_name(0)}")
    else:
        print("Warning: CUDA not available, using CPU")

    # Only applies the default when VLLM_USE_V1 is not already set.
    os.environ.setdefault("VLLM_USE_V1", "0")

    print("Loading Maya1 model with vLLM...")
    model_options = {
        "model_path": "maya-research/maya1",
        "dtype": "bfloat16",
        "max_model_len": 8192,
        "gpu_memory_utilization": 0.85,
    }
    model = Maya1Model(**model_options)

    print("Initializing prompt builder...")
    prompt_builder = Maya1PromptBuilder(model.tokenizer, model)

    print("Loading SNAC decoder...")
    snac_decoder = SNACDecoder(device=device, enable_batching=False)

    print("Initializing pipeline...")
    pipeline = Maya1Pipeline(model, prompt_builder, snac_decoder)

    # Mark as done only after every component was built without raising.
    models_loaded = True
    print("Models loaded successfully!")

def preset_selected(preset_name):
    """Return the ``(description, example_text)`` pair for a preset.

    Names that are not in ``PRESET_CHARACTERS`` yield two empty strings.
    """
    character = PRESET_CHARACTERS.get(preset_name)
    if character is None:
        return "", ""
    return character["description"], character["example_text"]

@spaces.GPU
def generate_speech(preset_name, description, text, temperature, max_tokens):
    """Generate emotional speech from a voice description and text using vLLM.

    Args:
        preset_name: Name of the selected preset character. Its description
            is used only as a fallback when ``description`` is empty, so a
            description edited by the user is honoured.
        description: Natural-language description of the voice.
        text: Text to speak; may contain emotion tags such as ``<laugh>``.
        temperature: Sampling temperature forwarded to the pipeline.
        max_tokens: Generation token budget forwarded to the pipeline.

    Returns:
        A ``(audio, status)`` tuple. On success ``audio`` is the path of a
        temporary mono 16-bit WAV file and ``status`` reports the duration.
        On failure ``audio`` is ``None`` and ``status`` holds the error text
        (including a traceback for unexpected exceptions).
    """
    try:
        # Load models if not already loaded
        load_models()

        # Fall back to the preset's description only when none was supplied;
        # overriding unconditionally would discard custom voice descriptions.
        if not description and preset_name and preset_name in PRESET_CHARACTERS:
            description = PRESET_CHARACTERS[preset_name]["description"]

        # Validate inputs
        if not description or not text:
            return None, "Error: Please provide both description and text!"

        print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")

        # The pipeline call is a coroutine; drive it on a private event loop
        # and always close that loop, even when generation raises.
        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            audio_bytes = loop.run_until_complete(
                pipeline.generate_speech(
                    description=description,
                    text=text,
                    temperature=temperature,
                    top_p=0.9,
                    max_tokens=max_tokens,
                    repetition_penalty=1.1,
                    seed=None,
                )
            )
        finally:
            loop.close()

        if audio_bytes is None:
            return None, "Error: Audio generation failed. Try different text or increase max_tokens."

        # Write the raw samples to a WAV file on disk. The audio output
        # component is declared with type="filepath", so a path is returned
        # instead of an in-memory buffer. The file is kept (delete=False)
        # because it must still exist after this function returns.
        import tempfile
        import wave
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_tmp:
            with wave.open(wav_tmp, 'wb') as wav_file:
                wav_file.setnchannels(1)
                wav_file.setsampwidth(2)
                wav_file.setframerate(AUDIO_SAMPLE_RATE)
                wav_file.writeframes(audio_bytes)
            wav_path = wav_tmp.name

        # Two bytes per sample (see setsampwidth above), one channel.
        duration = len(audio_bytes) // 2 / AUDIO_SAMPLE_RATE

        status_msg = f"Generated {duration:.2f}s of emotional speech!"

        return wav_path, status_msg

    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        import traceback
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg

# Create Gradio interface.
# Layout: a header, then one row with two equal-width columns -- inputs
# (preset, voice description, text, advanced settings, button) on the left
# and outputs (audio, status, emotion-tag reference) on the right.
# Components appear in the order they are created inside each context.
with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
    # Page header with links to the model card and the GitHub repository.
    gr.Markdown("""
    # Maya1 - Open Source Emotional Text-to-Speech
    
    **The best open source voice AI model with emotions!**
    
    Generate realistic and expressive speech with natural language voice design.
    Choose a preset character or create your own custom voice.
    
    [Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
    """)
    
    with gr.Row():
        # Left column: everything the user can edit.
        with gr.Column(scale=1):
            gr.Markdown("### Character Selection")
            
            # Choices are the PRESET_CHARACTERS keys; the first key is the
            # initial selection. The "4" in the info text is hard-coded and
            # has to be kept in sync with the number of presets by hand.
            preset_dropdown = gr.Dropdown(
                choices=list(PRESET_CHARACTERS.keys()),
                label="Preset Characters",
                value=list(PRESET_CHARACTERS.keys())[0],
                info="Quick pick from 4 preset characters"
            )
            
            gr.Markdown("### Voice Design")
            
            # Pre-filled with the first preset's description.
            description_input = gr.Textbox(
                label="Voice Description",
                placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
                lines=3,
                value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["description"]
            )
            
            # Pre-filled with the first preset's example text.
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter text with <emotion> tags like <laugh>, <sigh>, <excited>...",
                lines=4,
                value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["example_text"]
            )
            
            # Sampling controls, collapsed by default.
            with gr.Accordion("Advanced Settings", open=False):
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.4,
                    step=0.1,
                    label="Temperature",
                    info="Lower = more stable, Higher = more creative"
                )
                
                max_tokens_slider = gr.Slider(
                    minimum=100,
                    maximum=2048,
                    value=500,
                    step=50,
                    label="Max Tokens",
                    info="More tokens = longer audio"
                )
            
            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
        
        # Right column: read-only results plus a tag reference.
        with gr.Column(scale=1):
            gr.Markdown("### Generated Audio")
            
            # Receives the first value returned by generate_speech.
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False
            )
            
            # Receives the second value returned by generate_speech
            # (a success or error message).
            status_output = gr.Textbox(
                label="Status",
                lines=3,
                interactive=False
            )
            
            gr.Markdown("""
            ### Supported Emotions
            
            `<angry>` `<chuckle>` `<cry>` `<disappointed>` `<excited>` `<gasp>` 
            `<giggle>` `<laugh>` `<laugh_harder>` `<sarcastic>` `<sigh>` 
            `<sing>` `<whisper>`
            """)
    
    # Event handlers
    # Changing the preset replaces both the description and the text boxes
    # with the values returned by preset_selected.
    preset_dropdown.change(
        fn=preset_selected,
        inputs=[preset_dropdown],
        outputs=[description_input, text_input]
    )
    
    # The input order must match generate_speech's parameter order:
    # (preset_name, description, text, temperature, max_tokens).
    generate_btn.click(
        fn=generate_speech,
        inputs=[preset_dropdown, description_input, text_input, temperature_slider, max_tokens_slider],
        outputs=[audio_output, status_output]
    )

# Launch the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()