"""Gradio app that generates music from a text prompt with a SongGen model."""

import os
import tempfile
import traceback
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer

# Hugging Face repo tried first (transformers-style checkpoint).
PRIMARY_MODEL_NAME = "songgen/songgen-base"
# Repo tried second, loaded through a diffusers pipeline.
FALLBACK_MODEL_NAME = "songgen/songgen-v1"

# Sample rate used for writing the WAV file and for the waveform widget.
SAMPLE_RATE = 32000
# The original code budgets ~50 generated tokens per second of audio.
TOKENS_PER_SECOND = 50

# Global variables for model caching
_model = None
_tokenizer = None
_processor = None


def load_model():
    """Lazily load the SongGen model and cache it in module globals.

    The transformers checkpoint is tried first; if anything goes wrong the
    diffusers pipeline is tried instead. Results are cached so later calls
    return immediately.

    Returns:
        Tuple of (model, tokenizer, processor). ``tokenizer`` and
        ``processor`` are ``None`` when the diffusers fallback was used.

    Raises:
        gr.Error: If both loading strategies fail.
    """
    global _model, _tokenizer, _processor

    # Only the model decides whether we are loaded: the diffusers fallback
    # legitimately leaves the tokenizer as None, so checking the tokenizer
    # too would force a full reload on every request.
    if _model is not None:
        return _model, _tokenizer, _processor

    print("Loading SongGen model...")

    # Load into locals first so a failure half-way through never leaves the
    # globals in a mixed state (e.g. tokenizer from one repo, model from
    # another).
    try:
        # NOTE(review): trust_remote_code=True executes Python shipped with
        # the checkpoint; only keep it for repos you trust.
        tokenizer = AutoTokenizer.from_pretrained(
            PRIMARY_MODEL_NAME,
            trust_remote_code=True,
        )
        processor = AutoProcessor.from_pretrained(
            PRIMARY_MODEL_NAME,
            trust_remote_code=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            PRIMARY_MODEL_NAME,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        model = model.to("cpu")
        model.eval()
        print("SongGen model loaded successfully!")
    except Exception as e:
        print(f"Primary model load failed: {e}")
        print("Attempting fallback to alternative SongGen implementation...")
        try:
            # diffusers is optional, so it is imported lazily. Do NOT
            # re-import torch here: a function-scope ``import torch`` makes
            # ``torch`` a local name for the whole function and breaks the
            # ``torch.float32`` reference in the primary path above with
            # UnboundLocalError.
            from diffusers import DiffusionPipeline

            model = DiffusionPipeline.from_pretrained(
                FALLBACK_MODEL_NAME,
                torch_dtype=torch.float32,
                use_safetensors=True,
                variant="fp32",
            )
            model = model.to("cpu")
            # The pipeline is called with the raw prompt; no separate
            # tokenizer/processor is used by this app in that mode.
            tokenizer = None
            processor = None
            print("SongGen (diffusers) loaded successfully!")
        except Exception as e2:
            print(f"Alternative load also failed: {e2}")
            raise gr.Error(
                f"Failed to load SongGen model. Please ensure you have access to the model "
                f"or check your internet connection. Error: {str(e2)}"
            ) from e2

    _model, _tokenizer, _processor = model, tokenizer, processor
    return _model, _tokenizer, _processor


def _prepare_audio(audio) -> np.ndarray:
    """Return *audio* as a peak-normalized float array shaped (frames, channels).

    Accepts a torch tensor or anything ``np.asarray`` understands. Leading
    batch dimensions are dropped by keeping the first item.

    Raises:
        ValueError: If the model produced no samples at all.
    """
    if isinstance(audio, torch.Tensor):
        audio = audio.detach().cpu().numpy()
    audio = np.atleast_1d(np.asarray(audio, dtype=np.float32))

    # Anything above 2-D is treated as batched output; keep the first item.
    while audio.ndim > 2:
        audio = audio[0]

    if audio.size == 0:
        raise ValueError("The model returned empty audio.")

    if audio.ndim == 1:
        audio = audio[:, np.newaxis]  # mono -> (frames, 1)
    elif audio.shape[0] < audio.shape[1]:
        # soundfile expects (frames, channels). A clip has far more samples
        # than channels, so a short first axis is taken to be channels-first.
        audio = audio.T

    peak = np.max(np.abs(audio))
    return audio / (peak + 1e-8)  # epsilon avoids 0/0 on pure silence


def generate_music(
    prompt: str,
    duration: float,
    guidance_scale: float,
    num_inference_steps: int,
    temperature: float,
    top_k: int,
    top_p: float,
    progress: gr.Progress = gr.Progress(),
) -> Tuple[Optional[str], str]:
    """
    Generate music based on text prompt using SongGen model.

    Args:
        prompt: Text description of the music to generate
        duration: Duration of generated audio in seconds
        guidance_scale: Controls adherence to prompt vs diversity
        num_inference_steps: Number of denoising steps (diffusers path only)
        temperature: Controls randomness in generation (transformers path only)
        top_k: Top-k sampling parameter (transformers path only)
        top_p: Nucleus sampling parameter (transformers path only)
        progress: Gradio progress tracker; the ``gr.Progress()`` default is
            how Gradio injects it and must stay in the signature

    Returns:
        Tuple of (audio_file_path, status_message). The path is ``None``
        when the prompt is empty or generation failed; errors are reported
        through the status message rather than raised.
    """
    if not prompt or not prompt.strip():
        return None, "❌ Please enter a prompt describing the music you want to generate."

    try:
        # Sliders may deliver floats; these parameters must be integers.
        num_inference_steps = int(num_inference_steps)
        top_k = int(top_k)

        progress(0.1, desc="Loading SongGen model...")
        model, tokenizer, _ = load_model()

        progress(0.2, desc="Preparing inputs...")
        sample_rate = SAMPLE_RATE
        formatted_prompt = f"Generate music: {prompt}. Duration: {duration}s."

        progress(0.3, desc="Encoding prompt...")
        # A tokenizer is only present for the transformers-style model.
        use_transformers = tokenizer is not None and hasattr(model, "generate")
        if tokenizer is not None:
            inputs = tokenizer(
                formatted_prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )

        progress(0.4, desc="Generating music with SongGen...")
        with torch.no_grad():
            if use_transformers:
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=int(duration * TOKENS_PER_SECOND),
                    do_sample=True,
                    temperature=temperature,
                    top_k=top_k,
                    top_p=top_p,
                    guidance_scale=guidance_scale,
                )
                if hasattr(model, "decode_audio"):
                    audio = model.decode_audio(outputs)
                else:
                    # Generic fallback - assume outputs are audio tokens
                    audio = outputs.float().cpu().numpy()
            else:
                # Diffusers-style pipeline call
                audio = model(
                    prompt=formatted_prompt,
                    num_inference_steps=num_inference_steps,
                    guidance_scale=guidance_scale,
                    audio_length_in_s=duration,
                ).audios[0]

        progress(0.8, desc="Processing audio...")
        audio = _prepare_audio(audio)

        progress(0.9, desc="Saving audio...")
        # Reserve a path, close the handle, then write: writing by name
        # while the temp file is still open fails on some platforms.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        sf.write(output_path, audio, sample_rate)

        progress(1.0, desc="Complete!")

        # audio is (frames, channels), so axis 0 is the sample count.
        actual_duration = audio.shape[0] / sample_rate
        info_msg = (
            "✅ Music generated successfully with SongGen!\n\n"
            f"🎵 **Prompt:** {prompt}\n"
            f"⏱️ **Duration:** {actual_duration:.2f}s\n"
            f"🎚️ **Sample Rate:** {sample_rate}Hz\n"
            f"🔧 **Settings:** guidance={guidance_scale}, "
            f"steps={num_inference_steps}, temp={temperature}"
        )
        return output_path, info_msg

    except Exception as e:
        # Top-level UI boundary: log the traceback and surface the error in
        # the status box instead of crashing the request.
        print(f"Error: {e}")
        print(traceback.format_exc())
        return None, (
            f"❌ Error generating music: {str(e)}\n\n"
            "Please check that the SongGen model is properly installed and accessible."
        )


# Rows are: prompt, duration, guidance scale, steps, temperature, top-k, top-p
# (same order as the inputs wired to gr.Examples below).
_EXAMPLE_ROWS = (
    ("Upbeat electronic dance music with a strong bass line and energetic synths",
     8.0, 3.0, 50, 1.0, 250, 0.99),
    ("Calm ambient piano music with soft strings, peaceful and relaxing",
     10.0, 3.5, 50, 0.8, 250, 0.95),
    ("Epic orchestral soundtrack with brass and percussion, cinematic and dramatic",
     10.0, 4.0, 50, 1.0, 250, 0.99),
    ("Lo-fi hip hop beats with jazzy chords, chill and study music",
     8.0, 2.5, 50, 0.9, 250, 0.95),
    ("Acoustic guitar folk melody, warm and nostalgic",
     6.0, 3.0, 50, 0.85, 250, 0.95),
    ("Cyberpunk synthwave with retro 80s vibes, driving and energetic",
     8.0, 3.0, 50, 1.0, 250, 0.99),
)


def create_examples():
    """Create example prompts for the UI (a fresh list of lists per call)."""
    return [list(row) for row in _EXAMPLE_ROWS]


# Custom theme for modern UI
custom_theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="violet",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="lg",
    radius_size="lg",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
    block_title_text_size="*text_lg",
    block_background_fill="*neutral_50",
    block_border_width="1px",
    block_border_color="*neutral_200",
)

# Gradio 6: title goes in Blocks, NOT in launch()
with gr.Blocks(title="SongGen AI Music Generator") as demo:
    # Header with branding
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            # 🎵 SongGen AI Music Generator

            Generate custom music from text descriptions using **SongGen** - a state-of-the-art neural audio generation model.

            [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
            """)

    with gr.Row():
        # Left panel - Controls
        with gr.Column(scale=1):
            gr.Markdown("### 🎛️ Generation Settings")

            prompt_input = gr.Textbox(
                label="Music Description",
                placeholder="Describe the music you want to generate...",
                lines=3,
                info="Be specific about genre, instruments, mood, and tempo",
                value="Upbeat electronic dance music with energetic synths and strong bass",
            )

            with gr.Accordion("Advanced Settings", open=False):
                duration_slider = gr.Slider(
                    minimum=3,
                    maximum=20,
                    value=8,
                    step=0.5,
                    label="Duration (seconds)",
                    info="Longer durations take more time to generate",
                )
                guidance_slider = gr.Slider(
                    minimum=1.0,
                    maximum=10.0,
                    value=3.0,
                    step=0.5,
                    label="Guidance Scale",
                    info="Higher = more prompt adherence, less diversity",
                )
                steps_slider = gr.Slider(
                    minimum=10,
                    maximum=100,
                    value=50,
                    step=5,
                    label="Inference Steps",
                    info="More steps = higher quality but slower",
                )
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=1.0,
                    step=0.05,
                    label="Temperature",
                    info="Higher = more random/creative",
                )
                topk_slider = gr.Slider(
                    minimum=1,
                    maximum=500,
                    value=250,
                    step=10,
                    label="Top-K",
                    info="Limits vocabulary for sampling",
                )
                topp_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.99,
                    step=0.01,
                    label="Top-P (Nucleus)",
                    info="Cumulative probability threshold",
                )

            generate_btn = gr.Button(
                "🎵 Generate Music with SongGen",
                variant="primary",
                size="lg",
            )

            # Status and info
            status_output = gr.Textbox(
                label="Status",
                lines=6,
                interactive=False,
            )

        # Right panel - Output
        with gr.Column(scale=1):
            gr.Markdown("### 🎧 Generated Music")

            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath",
                autoplay=False,
                buttons=["download"],
                waveform_options=gr.WaveformOptions(
                    waveform_color="#4f46e5",
                    waveform_progress_color="#7c3aed",
                    show_recording_waveform=False,
                    sample_rate=SAMPLE_RATE,
                ),
            )

            # Tips section
            with gr.Accordion("💡 Tips for Better Results", open=True):
                gr.Markdown("""
                **Prompt Engineering Tips for SongGen:**

                1. **Be specific about genre:** "electronic", "classical", "jazz", "rock"
                2. **Mention instruments:** "piano", "synthesizers", "drums", "strings"
                3. **Describe the mood:** "upbeat", "melancholic", "energetic", "calm"
                4. **Add tempo hints:** "fast tempo", "slow ballad", "medium groove"
                5. **Use reference styles:** "like 80s synthwave", "cinematic soundtrack"

                **SongGen-specific tips:**
                - SongGen works best with clear, descriptive prompts
                - The model understands musical terminology well
                - Try mentioning specific artists or styles for guidance

                **Example prompts:**
                - "Upbeat pop with catchy synth melody and electronic drums"
                - "Sad piano ballad with emotional strings, slow tempo"
                - "Heavy metal with distorted guitars and aggressive drums"
                """)

    # Examples section
    gr.Markdown("### 🎯 Quick Examples")
    examples = gr.Examples(
        examples=create_examples(),
        inputs=[
            prompt_input,
            duration_slider,
            guidance_slider,
            steps_slider,
            temperature_slider,
            topk_slider,
            topp_slider,
        ],
        label="Click to load example",
        examples_per_page=3,
    )

    # Footer
    gr.Markdown("""
    ---
    Made with ❤️ using Gradio and SongGen | [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
    """)

    # Event handlers
    generate_btn.click(
        fn=generate_music,
        inputs=[
            prompt_input,
            duration_slider,
            guidance_slider,
            steps_slider,
            temperature_slider,
            topk_slider,
            topp_slider,
        ],
        outputs=[audio_output, status_output],
        api_visibility="public",
    )

# Launch with Gradio 6 syntax
demo.launch(
    theme=custom_theme,
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "Gradio", "url": "https://gradio.app"},
        {"label": "Settings", "url": "#"},
    ],
    show_error=True,
    quiet=False,
)