Spaces:

maya-research
/

maya1

Paused

App Files Community

abidlabs HF Staff committed on Jan 14

Commit

68ae9a5

verified ·

1 Parent(s): 8175488

Update app.py

Browse files

Files changed (1) hide show

app.py +123 -126

app.py CHANGED Viewed

@@ -55,15 +55,27 @@ tokenizer = None
 snac_model = None
 models_loaded = False
 def build_prompt(tokenizer, description: str, text: str) -> str:
-    """Build formatted prompt for Maya1."""
     soh_token = tokenizer.decode([SOH_ID])
     eoh_token = tokenizer.decode([EOH_ID])
     soa_token = tokenizer.decode([SOA_ID])
     sos_token = tokenizer.decode([CODE_START_TOKEN_ID])
     eot_token = tokenizer.decode([TEXT_EOT_ID])
     bos_token = tokenizer.bos_token
     formatted_text = f'<description="{description}"> {text}'
     prompt = (
         soh_token + bos_token + formatted_text + eot_token +
@@ -71,21 +83,33 @@ def build_prompt(tokenizer, description: str, text: str) -> str:
     )
     return prompt
 def unpack_snac_from_7(snac_tokens: list) -> list:
-    """Unpack 7-token SNAC frames to 3 hierarchical levels."""
     if snac_tokens and snac_tokens[-1] == CODE_END_TOKEN_ID:
         snac_tokens = snac_tokens[:-1]
     frames = len(snac_tokens) // 7
     snac_tokens = snac_tokens[:frames * 7]
     if frames == 0:
         return [[], [], []]
     l1, l2, l3 = [], [], []
     for i in range(frames):
-        slots = snac_tokens[i*7:(i+1)*7]
         l1.append((slots[0] - CODE_TOKEN_OFFSET) % 4096)
         l2.extend([
             (slots[1] - CODE_TOKEN_OFFSET) % 4096,
@@ -97,220 +121,193 @@ def unpack_snac_from_7(snac_tokens: list) -> list:
             (slots[5] - CODE_TOKEN_OFFSET) % 4096,
             (slots[6] - CODE_TOKEN_OFFSET) % 4096,
         ])
     return [l1, l2, l3]
 def load_models():
-    """Load Maya1 Transformers model (runs once)."""
     global model, tokenizer, snac_model, models_loaded
     if models_loaded:
         return
     print("Loading Maya1 model with Transformers...")
     model = AutoModelForCausalLM.from_pretrained(
-        "maya-research/maya1",
-        torch_dtype=torch.bfloat16,
         device_map="auto",
         trust_remote_code=True
     )
-    tokenizer = AutoTokenizer.from_pretrained("maya-research/maya1", trust_remote_code=True)
     print("Loading SNAC decoder...")
     snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
     if torch.cuda.is_available():
         snac_model = snac_model.to("cuda")
     models_loaded = True
     print("Models loaded successfully!")
 def preset_selected(preset_name):
-    """Update description and text when preset is selected."""
     if preset_name in PRESET_CHARACTERS:
         char = PRESET_CHARACTERS[preset_name]
         return char["description"], char["example_text"]
     return "", ""
 @spaces.GPU
 def generate_speech(preset_name, description, text, temperature, max_tokens):
-    """Generate emotional speech from description and text using Transformers."""
     try:
-        # Load models if not already loaded
         load_models()
-        # Validate inputs
         if not description or not text:
             return None, "Error: Please provide both description and text!"
-        print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")
-        # Build prompt
         prompt = build_prompt(tokenizer, description, text)
         inputs = tokenizer(prompt, return_tensors="pt")
         if torch.cuda.is_available():
             inputs = {k: v.to("cuda") for k, v in inputs.items()}
-        # Generate tokens
         with torch.inference_mode():
             outputs = model.generate(
-                **inputs,
                 max_new_tokens=max_tokens,
                 min_new_tokens=28,
-                temperature=temperature,
-                top_p=0.9,
                 repetition_penalty=1.1,
                 do_sample=True,
                 eos_token_id=CODE_END_TOKEN_ID,
                 pad_token_id=tokenizer.pad_token_id,
             )
-        # Extract SNAC tokens
-        generated_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist()
-        # Find EOS and extract SNAC codes
         eos_idx = generated_ids.index(CODE_END_TOKEN_ID) if CODE_END_TOKEN_ID in generated_ids else len(generated_ids)
         snac_tokens = [t for t in generated_ids[:eos_idx] if SNAC_MIN_ID <= t <= SNAC_MAX_ID]
         if len(snac_tokens) < 7:
             return None, "Error: Not enough tokens generated. Try different text or increase max_tokens."
-        # Unpack and decode
         levels = unpack_snac_from_7(snac_tokens)
-        frames = len(levels[0])
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        codes_tensor = [torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0) for level in levels]
         with torch.inference_mode():
             z_q = snac_model.quantizer.from_codes(codes_tensor)
             audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()
-        # Trim warmup
         if len(audio) > 2048:
             audio = audio[2048:]
-        # Convert to WAV and save to temporary file
         import tempfile
         import soundfile as sf
         audio_int16 = (audio * 32767).astype(np.int16)
-        # Create temporary file
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
             tmp_path = tmp_file.name
-        # Save audio
         sf.write(tmp_path, audio_int16, AUDIO_SAMPLE_RATE)
         duration = len(audio) / AUDIO_SAMPLE_RATE
-        status_msg = f"Generated {duration:.2f}s of emotional speech!"
-        return tmp_path, status_msg
     except Exception as e:
         import traceback
         error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
         print(error_msg)
         return None, error_msg
-# Create Gradio interface
 with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # Maya1 - Open Source Emotional Text-to-Speech
     **The best open source voice AI model with emotions!**
-    Generate realistic and expressive speech with natural language voice design.
-    Choose a preset character or create your own custom voice.
-    [Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
     """)
     with gr.Row():
         with gr.Column(scale=1):
-            gr.Markdown("### Character Selection")
             preset_dropdown = gr.Dropdown(
                 choices=list(PRESET_CHARACTERS.keys()),
-                label="Preset Characters",
                 value=list(PRESET_CHARACTERS.keys())[0],
-                info="Quick pick from 4 preset characters"
             )
-            gr.Markdown("### Voice Design")
             description_input = gr.Textbox(
                 label="Voice Description",
-                placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
                 lines=3,
                 value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["description"]
             )
             text_input = gr.Textbox(
                 label="Text to Speak",
-                placeholder="Enter text with <emotion> tags like <laugh>, <sigh>, <excited>...",
                 lines=4,
                 value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["example_text"]
             )
-            with gr.Accordion("Advanced Settings", open=False):
-                temperature_slider = gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    value=0.4,
-                    step=0.1,
-                    label="Temperature",
-                    info="Lower = more stable, Higher = more creative"
-                )
-                max_tokens_slider = gr.Slider(
-                    minimum=100,
-                    maximum=2048,
-                    value=1500,
-                    step=50,
-                    label="Max Tokens",
-                    info="More tokens = longer audio"
-                )
-            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
         with gr.Column(scale=1):
-            gr.Markdown("### Generated Audio")
-            audio_output = gr.Audio(
-                label="Generated Speech",
-                type="filepath",
-                interactive=False
-            )
-            status_output = gr.Textbox(
-                label="Status",
-                lines=3,
-                interactive=False
-            )
-            gr.Markdown("""
-            ### Supported Emotions
-            `<angry>` `<chuckle>` `<cry>` `<disappointed>` `<excited>` `<gasp>`
-            `<giggle>` `<laugh>` `<laugh_harder>` `<sarcastic>` `<sigh>`
-            `<sing>` `<whisper>`
-            """)
-    # Event handlers
     preset_dropdown.change(
         fn=preset_selected,
-        inputs=[preset_dropdown],
         outputs=[description_input, text_input]
     )
     generate_btn.click(
         fn=generate_speech,
         inputs=[preset_dropdown, description_input, text_input, temperature_slider, max_tokens_slider],
         outputs=[audio_output, status_output]
     )
-if __name__ == "__main__":
-    demo.launch()

 snac_model = None
 models_loaded = False
 def build_prompt(tokenizer, description: str, text: str) -> str:
+    """
+    Build a formatted prompt for the Maya1 text-to-speech model.
+    This function constructs the full input prompt expected by Maya1, including
+    special control tokens and a structured description tag that defines voice
+    characteristics and emotional delivery.
+    Args:
+        tokenizer: The tokenizer associated with the Maya1 model.
+        description (str): A structured natural-language description of the voice.
+        text (str): The text content to be synthesized into speech.
+    Returns:
+        str: A fully formatted prompt string ready for tokenization and generation.
+    """
     soh_token = tokenizer.decode([SOH_ID])
     eoh_token = tokenizer.decode([EOH_ID])
     soa_token = tokenizer.decode([SOA_ID])
     sos_token = tokenizer.decode([CODE_START_TOKEN_ID])
     eot_token = tokenizer.decode([TEXT_EOT_ID])
     bos_token = tokenizer.bos_token
     formatted_text = f'<description="{description}"> {text}'
     prompt = (
         soh_token + bos_token + formatted_text + eot_token +
     )
     return prompt
 def unpack_snac_from_7(snac_tokens: list) -> list:
+    """
+    Unpack SNAC tokens from 7-token frames into hierarchical code levels.
+    This function converts a flat list of SNAC token IDs produced by the model
+    into three hierarchical code streams required by the SNAC decoder.
+    Args:
+        snac_tokens (list): A list of integer SNAC token IDs generated by the model.
+    Returns:
+        list:
+            - level_1 (list[int]): Coarse acoustic codes.
+            - level_2 (list[int]): Mid-level acoustic codes.
+            - level_3 (list[int]): Fine-grained acoustic codes.
+    """
     if snac_tokens and snac_tokens[-1] == CODE_END_TOKEN_ID:
         snac_tokens = snac_tokens[:-1]
     frames = len(snac_tokens) // 7
     snac_tokens = snac_tokens[:frames * 7]
     if frames == 0:
         return [[], [], []]
     l1, l2, l3 = [], [], []
     for i in range(frames):
+        slots = snac_tokens[i * 7:(i + 1) * 7]
         l1.append((slots[0] - CODE_TOKEN_OFFSET) % 4096)
         l2.extend([
             (slots[1] - CODE_TOKEN_OFFSET) % 4096,
             (slots[5] - CODE_TOKEN_OFFSET) % 4096,
             (slots[6] - CODE_TOKEN_OFFSET) % 4096,
         ])
     return [l1, l2, l3]
 def load_models():
+    """
+    Load the Maya1 language model, tokenizer, and SNAC audio decoder.
+    This function performs one-time initialization of all required models.
+    Subsequent calls are no-ops to avoid reloading large model weights.
+    """
     global model, tokenizer, snac_model, models_loaded
     if models_loaded:
         return
     print("Loading Maya1 model with Transformers...")
     model = AutoModelForCausalLM.from_pretrained(
+        "maya-research/maya1",
+        torch_dtype=torch.bfloat16,
         device_map="auto",
         trust_remote_code=True
     )
+    tokenizer = AutoTokenizer.from_pretrained(
+        "maya-research/maya1",
+        trust_remote_code=True
+    )
     print("Loading SNAC decoder...")
     snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
     if torch.cuda.is_available():
         snac_model = snac_model.to("cuda")
     models_loaded = True
     print("Models loaded successfully!")
 def preset_selected(preset_name):
+    """
+    Update the voice description and example text based on a preset selection.
+    This function is used as a Gradio event handler to populate UI fields when
+    a preset character is chosen.
+    Args:
+        preset_name (str): The name of the selected preset character.
+    Returns:
+        tuple:
+            - description (str): The preset voice description.
+            - example_text (str): The preset example dialogue.
+    """
     if preset_name in PRESET_CHARACTERS:
         char = PRESET_CHARACTERS[preset_name]
         return char["description"], char["example_text"]
     return "", ""
 @spaces.GPU
 def generate_speech(preset_name, description, text, temperature, max_tokens):
+    """
+    Generate emotional speech audio from text and voice description.
+    This function runs the full Maya1 inference pipeline: prompt construction,
+    token generation, SNAC code extraction, audio decoding, and WAV export.
+    It is designed to be called directly from a Gradio interface.
+    Args:
+        preset_name (str): Name of the selected preset character.
+        description (str): Natural-language voice design description.
+        text (str): Input text containing optional emotion tags.
+        temperature (float): Sampling temperature controlling creativity.
+        max_tokens (int): Maximum number of tokens to generate.
+    Returns:
+        tuple:
+            - audio_path (str or None): Path to the generated WAV file.
+            - status_message (str): Success or error message.
+    """
     try:
         load_models()
         if not description or not text:
             return None, "Error: Please provide both description and text!"
         prompt = build_prompt(tokenizer, description, text)
         inputs = tokenizer(prompt, return_tensors="pt")
         if torch.cuda.is_available():
             inputs = {k: v.to("cuda") for k, v in inputs.items()}
         with torch.inference_mode():
             outputs = model.generate(
+                **inputs,
                 max_new_tokens=max_tokens,
                 min_new_tokens=28,
+                temperature=temperature,
+                top_p=0.9,
                 repetition_penalty=1.1,
                 do_sample=True,
                 eos_token_id=CODE_END_TOKEN_ID,
                 pad_token_id=tokenizer.pad_token_id,
             )
+        generated_ids = outputs[0, inputs["input_ids"].shape[1]:].tolist()
         eos_idx = generated_ids.index(CODE_END_TOKEN_ID) if CODE_END_TOKEN_ID in generated_ids else len(generated_ids)
         snac_tokens = [t for t in generated_ids[:eos_idx] if SNAC_MIN_ID <= t <= SNAC_MAX_ID]
         if len(snac_tokens) < 7:
             return None, "Error: Not enough tokens generated. Try different text or increase max_tokens."
         levels = unpack_snac_from_7(snac_tokens)
         device = "cuda" if torch.cuda.is_available() else "cpu"
+        codes_tensor = [
+            torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0)
+            for level in levels
+        ]
         with torch.inference_mode():
             z_q = snac_model.quantizer.from_codes(codes_tensor)
             audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()
         if len(audio) > 2048:
             audio = audio[2048:]
         import tempfile
         import soundfile as sf
         audio_int16 = (audio * 32767).astype(np.int16)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             tmp_path = tmp_file.name
         sf.write(tmp_path, audio_int16, AUDIO_SAMPLE_RATE)
         duration = len(audio) / AUDIO_SAMPLE_RATE
+        return tmp_path, f"Generated {duration:.2f}s of emotional speech!"
     except Exception as e:
         import traceback
         error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
         print(error_msg)
         return None, error_msg
+# -------------------- Gradio App --------------------
 with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # Maya1 - Open Source Emotional Text-to-Speech
     **The best open source voice AI model with emotions!**
     """)
     with gr.Row():
         with gr.Column(scale=1):
             preset_dropdown = gr.Dropdown(
                 choices=list(PRESET_CHARACTERS.keys()),
                 value=list(PRESET_CHARACTERS.keys())[0],
+                label="Preset Characters"
             )
             description_input = gr.Textbox(
                 label="Voice Description",
                 lines=3,
                 value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["description"]
             )
             text_input = gr.Textbox(
                 label="Text to Speak",
                 lines=4,
                 value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["example_text"]
             )
+            temperature_slider = gr.Slider(0.1, 1.0, 0.4, step=0.1, label="Temperature")
+            max_tokens_slider = gr.Slider(100, 2048, 1500, step=50, label="Max Tokens")
+            generate_btn = gr.Button("Generate Speech", variant="primary")
         with gr.Column(scale=1):
+            audio_output = gr.Audio(type="filepath", label="Generated Audio")
+            status_output = gr.Textbox(label="Status")
     preset_dropdown.change(
         fn=preset_selected,
+        inputs=preset_dropdown,
         outputs=[description_input, text_input]
     )
     generate_btn.click(
         fn=generate_speech,
         inputs=[preset_dropdown, description_input, text_input, temperature_slider, max_tokens_slider],
         outputs=[audio_output, status_output]
     )
+if __name__ == "__main__":
+    demo.launch(mcp_server=True)