Upload 2 files
- app.py +246 -314
- requirements.txt +2 -14

app.py
CHANGED
@@ -1,363 +1,295 @@
-#!/usr/bin/env python3
 """
-
-
 """
 
-import spaces
-import gradio as gr
 import os
-import sys
 import tempfile
-from typing import Optional, Tuple
 import torch
-import
-import
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-print("Loading Muse language model...")
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# Load language model
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
-language_model = AutoModelForCausalLM.from_pretrained(
-    MODEL_NAME,
-    trust_remote_code=True,
-    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-    device_map="auto" if device == "cuda" else None,
-)
-if device == "cpu":
-    language_model = language_model.to(device)
-language_model.eval()
-print("Language model loaded!")
-
-# Load MuCodec decoder
-print("Loading MuCodec decoder...")
-mucodec_dir = "./MuCodec"
-ckpt_path = os.path.join(mucodec_dir, "ckpt/mucodec.pt")
-#audioldm_path = os.path.join(mucodec_dir, "tools/audioldm_48k.pth")
-audioldm_path = os.path.join(snapshot_download("haoheliu/audioldm_48k", local_dir="./alm"), "audioldm_48k.pth")
-config_path = os.path.join(mucodec_dir, "configs/models/transformer2D.json")
-
-# Load VAE and STFT
-vae, stft = build_pretrained_models(audioldm_path)
-vae = vae.eval().to(device)
-stft = stft.eval().to(device)
-
-# Load diffusion model
-main_config = {
-    "num_channels": 32,
-    "unet_model_name": None,
-    "unet_model_config_path": config_path,
-    "snr_gamma": None,
-}
-mucodec_model = PromptCondAudioDiffusion(**main_config)
-main_weights = torch.load(ckpt_path, map_location='cpu')
-mucodec_model.load_state_dict(main_weights, strict=False)
-mucodec_model = mucodec_model.to(device).eval()
-mucodec_model.init_device_dtype(torch.device(device), torch.float32)
-print("MuCodec decoder loaded!")
-
-# ============================================================================
-# Helper Functions
-# ============================================================================
-
-def parse_tokens_from_text(text: str) -> Optional[torch.Tensor]:
-    """Extract audio tokens from generated text"""
-    try:
-        if "<|audio_0|>" in text and "<|audio_1|>" in text:
-            start = text.find("<|audio_0|>") + len("<|audio_0|>")
-            end = text.find("<|audio_1|>")
-            token_str = text[start:end].strip()
-        else:
-            token_str = text.strip()
-
-        tokens = [int(t) for t in token_str.split() if t.isdigit()]
-
-        if len(tokens) == 0:
-            return None
-
-        return torch.tensor(tokens, dtype=torch.long).unsqueeze(0).unsqueeze(0)
-
-    except Exception as e:
-        print(f"Error parsing tokens: {e}")
-        return None
-
-
-def codes_to_audio(
-    codes: torch.Tensor,
-    num_steps: int = 20
-) -> torch.Tensor:
-    """Convert audio codes to waveform using MuCodec"""
-
-    codes = codes.to(device)
-
-    # Initialize latent
-    first_latent = torch.randn(codes.shape[0], 32, 512, 32).to(device)
-    first_latent_length = 0
-    first_latent_codes_length = 0
-
-    # Sliding window parameters
-    min_samples = 1024
-    hop_samples = min_samples // 4 * 3
-    ovlp_samples = min_samples - hop_samples
-
-    codes_len = codes.shape[-1]
-    target_len = int(codes_len / 100 * 4 * SAMPLE_RATE)
-
-    # Pad codes if too short
-    if codes_len < min_samples:
-        while codes.shape[-1] < min_samples:
-            codes = torch.cat([codes, codes], -1)
-        codes = codes[:, :, :min_samples]
-        codes_len = codes.shape[-1]
-
-    # Adjust codes length for sliding window
-    if (codes_len - ovlp_samples) % hop_samples > 0:
-        len_codes = int(np.ceil((codes_len - ovlp_samples) / hop_samples) * hop_samples + ovlp_samples)
-        while codes.shape[-1] < len_codes:
-            codes = torch.cat([codes, codes], -1)
-        codes = codes[:, :, :len_codes]
-
-    # Generate latents with sliding window
-    latent_length = 512
-    latent_list = []
-    spk_embeds = torch.zeros([1, 32, 1, 32], device=codes.device)
-
-    with torch.autocast(device_type="cuda" if torch.cuda.is_available() else "cpu", dtype=torch.float16):
-        for sinx in range(0, codes.shape[-1] - hop_samples, hop_samples):
-            codes_input = [codes[:, :, sinx:sinx + min_samples]]
-
-            if sinx == 0:
-                latents = mucodec_model.inference_codes(
-                    codes_input, spk_embeds, first_latent,
-                    latent_length, first_latent_length,
-                    additional_feats=[], guidance_scale=1.5,
-                    num_steps=num_steps, disable_progress=True,
-                    scenario='other_seg'
-                )
-            else:
-                true_latent = latent_list[-1][:, :, -ovlp_samples // 2:, :]
-                len_add = 512 - true_latent.shape[-2]
-                incontext_length = true_latent.shape[-2]
-                true_latent = torch.cat([
-                    true_latent,
-                    torch.randn(true_latent.shape[0], true_latent.shape[1],
-                                len_add, true_latent.shape[-1]).to(device)
-                ], -2)
-
-                latents = mucodec_model.inference_codes(
-                    codes_input, spk_embeds, true_latent,
-                    latent_length, incontext_length,
-                    additional_feats=[], guidance_scale=1.5,
-                    num_steps=num_steps, disable_progress=True,
-                    scenario='other_seg'
-                )
-
-            latent_list.append(latents)
-
-    # Decode latents to audio
-    latent_list = [l.float() for l in latent_list]
-    duration = 40.96
-    min_samples_audio = int(duration * SAMPLE_RATE)
-    hop_samples_audio = min_samples_audio // 4 * 3
-    ovlp_samples_audio = min_samples_audio - hop_samples_audio
-
-    output = None
-    for i, latent in enumerate(latent_list):
-        bsz, ch, t, f = latent.shape
-        latent = latent.reshape(bsz * 2, ch // 2, t, f)
-        mel = vae.decode_first_stage(latent)
-        cur_output = vae.decode_to_waveform(mel)
-        cur_output = torch.from_numpy(cur_output)[:, :min_samples_audio]
-
-        if output is None:
-            output = cur_output
-        else:
-            # Overlap-add smoothing
-            ov_win = torch.from_numpy(np.linspace(0, 1, ovlp_samples_audio)[None, :])
-            ov_win = torch.cat([ov_win, 1 - ov_win], -1)
-            output[:, -ovlp_samples_audio:] = (
-                output[:, -ovlp_samples_audio:] * ov_win[:, -ovlp_samples_audio:] +
-                cur_output[:, :ovlp_samples_audio] * ov_win[:, :ovlp_samples_audio]
-            )
-            output = torch.cat([output, cur_output[:, ovlp_samples_audio:]], -1)
-
-    # Trim to target length
-    output = output[:, :target_len]
-    return output
-
-
-# ============================================================================
-# Main Generation Function with @spaces.GPU
-# ============================================================================
-
-@spaces.GPU
-def generate_music(
-    prompt: str,
-    max_tokens: int = 3000,
-    temperature: float = 0.0,
-    top_p: float = 0.9,
-    repetition_penalty: float = 1.1,
-    num_diffusion_steps: int = 20,
-) -> Tuple[Optional[str], str]:
-    """Generate music from text prompt"""
-
-    if not prompt.strip():
-        return None, "Please enter a prompt"
-
-    try:
-        # Generate tokens
-        messages = [{"role": "user", "content": prompt}]
-        prompt_text = tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
         )
 
-
-
 
-
-
-
-
-
-
-        "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
-        "eos_token_id": tokenizer.eos_token_id,
-    }
 
-
-
 
-        input_length = inputs["input_ids"].shape[1]
-        generated_tokens = outputs[0][input_length:]
-        response = tokenizer.decode(generated_tokens, skip_special_tokens=False)
 
-
-
-        if audio_codes is None:
-            return None, "❌ Could not parse audio tokens from model output"
 
-        print(f"Parsed {audio_codes.shape[-1]} audio tokens")
 
-
-
 
-
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-            output_path = f.name
 
-
 
-
-
 
-
-
-
-
-
 
 
-
-
-
 
-
 
     gr.Markdown(
         """
-        #
 
-
         """
     )
 
     with gr.Row():
-        with gr.Column(scale=
-
-                label="
-                placeholder="
-                lines=
             )
 
-
 
-
-
 
-
-
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
             )
 
-
-
-
-
-
-
-
-
-
 
     generate_btn.click(
         fn=generate_music,
         inputs=[
-
-
-
-
-
-
         ],
-        outputs=
     )
 
     gr.Markdown(
         """
         ---
-
-
-        **
-        **Decoder**: MuCodec (Ultra Low-Bitrate Music Codec)
 
-
         """
     )
 
-
 """
+HeartMuLa Gradio App - Music Generation with Lyrics and Tags
+A self-contained Gradio app for Hugging Face Spaces
 """
 
 import os
 import tempfile
 import torch
+import gradio as gr
+from huggingface_hub import hf_hub_download, snapshot_download
+
+# Download models from HuggingFace Hub on startup
+def download_models():
+    """Download all required model files from HuggingFace Hub."""
+    cache_dir = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
+    model_dir = os.path.join(cache_dir, "heartmula_models")
+
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir, exist_ok=True)
+
+    # Download HeartMuLaGen (tokenizer and gen_config)
+    print("Downloading HeartMuLaGen files...")
+    for filename in ["tokenizer.json", "gen_config.json"]:
+        hf_hub_download(
+            repo_id="HeartMuLa/HeartMuLaGen",
+            filename=filename,
+            local_dir=model_dir,
         )
 
+    # Download HeartMuLa-oss-3B
+    print("Downloading HeartMuLa-oss-3B...")
+    snapshot_download(
+        repo_id="HeartMuLa/HeartMuLa-oss-3B",
+        local_dir=os.path.join(model_dir, "HeartMuLa-oss-3B"),
+    )
 
+    # Download HeartCodec-oss
+    print("Downloading HeartCodec-oss...")
+    snapshot_download(
+        repo_id="HeartMuLa/HeartCodec-oss",
+        local_dir=os.path.join(model_dir, "HeartCodec-oss"),
+    )
 
+    print("All models downloaded successfully!")
+    return model_dir
 
 
+# Global pipeline instance
+pipeline = None
 
 
+def load_pipeline():
+    """Load the HeartMuLa pipeline."""
+    global pipeline
+    if pipeline is not None:
+        return pipeline
 
+    from heartlib import HeartMuLaGenPipeline
 
+    model_dir = download_models()
 
+    # Determine device and dtype
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+        dtype = torch.bfloat16
+    else:
+        device = torch.device("cpu")
+        dtype = torch.float32
 
+    print(f"Loading pipeline on {device} with {dtype}...")
+    pipeline = HeartMuLaGenPipeline.from_pretrained(
+        model_dir,
+        device=device,
+        dtype=dtype,
+        version="3B",
+    )
+    print("Pipeline loaded successfully!")
+    return pipeline
 
 
+def generate_music(
+    lyrics: str,
+    tags: str,
+    max_duration_seconds: int,
+    temperature: float,
+    topk: int,
+    cfg_scale: float,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """Generate music from lyrics and tags."""
+    if not lyrics.strip():
+        raise gr.Error("Please enter some lyrics!")
+
+    if not tags.strip():
+        raise gr.Error("Please enter at least one tag!")
+
+    pipe = load_pipeline()
+
+    # Create a temporary file for output
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+        output_path = f.name
+
+    max_audio_length_ms = max_duration_seconds * 1000
+
+    with torch.no_grad():
+        pipe(
+            {
+                "lyrics": lyrics,
+                "tags": tags,
+            },
+            max_audio_length_ms=max_audio_length_ms,
+            save_path=output_path,
+            topk=topk,
+            temperature=temperature,
+            cfg_scale=cfg_scale,
+        )
 
+    return output_path
+
+
+# Example lyrics
+EXAMPLE_LYRICS = """[Intro]
+
+[Verse]
+The sun creeps in across the floor
+I hear the traffic outside the door
+The coffee pot begins to hiss
+It is another morning just like this
+
+[Prechorus]
+The world keeps spinning round and round
+Feet are planted on the ground
+I find my rhythm in the sound
+
+[Chorus]
+Every day the light returns
+Every day the fire burns
+We keep on walking down this street
+Moving to the same steady beat
+It is the ordinary magic that we meet
+
+[Verse]
+The hours tick deeply into noon
+Chasing shadows, chasing the moon
+Work is done and the lights go low
+Watching the city start to glow
+
+[Bridge]
+It is not always easy, not always bright
+Sometimes we wrestle with the night
+But we make it to the morning light
+
+[Chorus]
+Every day the light returns
+Every day the fire burns
+We keep on walking down this street
+Moving to the same steady beat
+
+[Outro]
+Just another day
+Every single day"""
+
+EXAMPLE_TAGS = "piano,happy,uplifting,pop"
+
+# Build the Gradio interface
+with gr.Blocks(
+    title="HeartMuLa Music Generator",
+    theme=gr.themes.Soft(),
+) as demo:
     gr.Markdown(
         """
+        # HeartMuLa Music Generator
+
+        Generate music from lyrics and tags using [HeartMuLa](https://github.com/HeartMuLa/heartlib),
+        an open-source music foundation model.
+
+        **Instructions:**
+        1. Enter your lyrics with structure tags like `[Verse]`, `[Chorus]`, `[Bridge]`, etc.
+        2. Add comma-separated tags describing the music style (e.g., `piano,happy,romantic`)
+        3. Adjust generation parameters as needed
+        4. Click "Generate Music" and wait for your song!
 
+        *Note: Generation can take several minutes depending on the duration.*
         """
     )
 
     with gr.Row():
+        with gr.Column(scale=1):
+            lyrics_input = gr.Textbox(
+                label="Lyrics",
+                placeholder="Enter lyrics with structure tags like [Verse], [Chorus], etc.",
+                lines=20,
+                value=EXAMPLE_LYRICS,
            )
 
+            tags_input = gr.Textbox(
+                label="Tags",
+                placeholder="piano,happy,romantic,synthesizer",
+                value=EXAMPLE_TAGS,
+                info="Comma-separated tags describing the music style",
+            )
 
+            with gr.Accordion("Advanced Settings", open=False):
+                max_duration = gr.Slider(
+                    minimum=30,
+                    maximum=240,
+                    value=120,
+                    step=10,
+                    label="Max Duration (seconds)",
+                    info="Maximum length of generated audio",
+                )
 
+                temperature = gr.Slider(
+                    minimum=0.1,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1,
+                    label="Temperature",
+                    info="Higher = more creative, Lower = more consistent",
+                )
 
+                topk = gr.Slider(
+                    minimum=1,
+                    maximum=100,
+                    value=50,
+                    step=1,
+                    label="Top-K",
+                    info="Number of top tokens to sample from",
+                )
+
+                cfg_scale = gr.Slider(
+                    minimum=1.0,
+                    maximum=3.0,
+                    value=1.5,
+                    step=0.1,
+                    label="CFG Scale",
+                    info="Classifier-free guidance scale",
+                )
+
+            generate_btn = gr.Button("Generate Music", variant="primary", size="lg")
+
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(
+                label="Generated Music",
+                type="filepath",
            )
 
+            gr.Markdown(
+                """
+                ### Tips for Better Results
+                - Use structured lyrics with section tags
+                - Be specific with your style tags
+                - Try different temperature values for variety
+                - Shorter durations generate faster
+
+                ### Example Tags
+                - **Instruments:** piano, guitar, drums, synthesizer, violin, bass
+                - **Mood:** happy, sad, romantic, energetic, calm, melancholic
+                - **Genre:** pop, rock, jazz, classical, electronic, folk
+                - **Tempo:** fast, slow, upbeat, relaxed
+                """
+            )
 
     generate_btn.click(
         fn=generate_music,
         inputs=[
+            lyrics_input,
+            tags_input,
+            max_duration,
+            temperature,
+            topk,
+            cfg_scale,
        ],
+        outputs=audio_output,
    )
 
    gr.Markdown(
        """
        ---
+        **Model:** [HeartMuLa-oss-3B](https://huggingface.co/HeartMuLa/HeartMuLa-oss-3B) |
+        **Paper:** [arXiv](https://arxiv.org/abs/2601.10547) |
+        **Code:** [GitHub](https://github.com/HeartMuLa/heartlib)
 
+        *Licensed under Apache 2.0*
        """
    )
 
+
+if __name__ == "__main__":
+    # Preload models on startup
+    print("Initializing HeartMuLa...")
+    load_pipeline()
+
+    # Launch the app
+    demo.launch()
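For reference, the new `app.py` reduces to three steps: download the checkpoints, build a `HeartMuLaGenPipeline`, and call it with lyrics and tags. Below is a minimal sketch of driving the same pipeline outside Gradio, using only the calls that appear in the diff above; it assumes `heartlib` is installed and that the three repos have already been downloaded into `model_dir` the way `download_models()` lays them out. The `model_dir` path, lyrics string, and output filename are illustrative.

```python
import torch
from heartlib import HeartMuLaGenPipeline

model_dir = "./heartmula_models"  # illustrative; see download_models() above

# Same device/dtype selection as app.py
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if device.type == "cuda" else torch.float32

pipe = HeartMuLaGenPipeline.from_pretrained(
    model_dir, device=device, dtype=dtype, version="3B"
)

with torch.no_grad():
    pipe(
        {
            "lyrics": "[Verse]\nThe sun creeps in across the floor",
            "tags": "piano,happy,uplifting,pop",
        },
        max_audio_length_ms=120_000,  # 120 s, the app's default duration
        save_path="song.mp3",
        topk=50,
        temperature=1.0,
        cfg_scale=1.5,
    )
```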
requirements.txt
CHANGED
@@ -1,14 +1,2 @@
-
-
-torch
-torchaudio
-transformers
-accelerate
-diffusers
-einops
-librosa
-scipy
-numpy
-safetensors
-fairseq-fixed
-cached_path
+gradio>=4.0.0
+heartlib @ git+https://github.com/HeartMuLa/heartlib.git
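Since `heartlib` is now pulled straight from Git rather than PyPI, a quick import check after `pip install -r requirements.txt` confirms the environment resolves before the Space boots. A minimal sanity-check sketch, assuming both packages installed cleanly:

```python
# Verify both declared dependencies import before launching the app.
import gradio
import heartlib  # installed from the Git URL in requirements.txt

print("gradio", gradio.__version__)
print("heartlib imported OK")
```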