Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| import numpy as np | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor | |
| import os | |
| import tempfile | |
| from typing import Tuple, Optional | |
| import soundfile as sf | |
# Module-level cache filled in by load_model(). All three start as None;
# load_model() declares them `global` and assigns them once loading succeeds.
_model = None
_tokenizer = None
_processor = None
def load_model():
    """Load the SongGen model on first use and return the cached objects.

    Two strategies are tried in order:

    1. A transformers checkpoint (``songgen/songgen-base``) loaded through
       ``AutoTokenizer`` / ``AutoProcessor`` / ``AutoModelForCausalLM``.
    2. A diffusers pipeline (``songgen/songgen-v1``) if anything in step 1
       raises.

    Whatever loads is moved to the CPU and stored in the module-level
    ``_model`` / ``_tokenizer`` / ``_processor`` cache.

    Returns:
        Tuple ``(model, tokenizer, processor)``. ``tokenizer`` and
        ``processor`` are ``None`` when the diffusers fallback was used.

    Raises:
        gr.Error: If both loading strategies fail.
    """
    global _model, _tokenizer, _processor

    # The model alone decides whether the cache is warm. The diffusers
    # fallback deliberately leaves the tokenizer as None, so also testing the
    # tokenizer here would reload the whole pipeline on every request.
    if _model is not None:
        return _model, _tokenizer, _processor

    print("Loading SongGen model...")
    # NOTE(review): repo ids are used as given; confirm they exist on the Hub.
    model_name = "songgen/songgen-base"  # or alternative SongGen model

    # Load into locals first and publish to the globals only at the very end,
    # so a failure halfway through never leaves a half-populated cache behind.
    try:
        # NOTE(security): trust_remote_code=True runs Python code shipped in
        # the model repository; only use it with repositories you trust.
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        processor = AutoProcessor.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        model = model.to("cpu")
        model.eval()
        print("SongGen model loaded successfully!")
    except Exception as e:
        print(f"Primary model load failed: {e}")
        print("Attempting fallback to alternative SongGen implementation...")
        try:
            # Imported lazily because diffusers is only needed for the
            # fallback. torch must NOT be re-imported in this function: a
            # local `import torch` would make `torch` a function-local name
            # and break the `torch.float32` reference in the primary path.
            from diffusers import DiffusionPipeline

            model = DiffusionPipeline.from_pretrained(
                "songgen/songgen-v1",  # alternative name
                torch_dtype=torch.float32,
                use_safetensors=True,
                variant="fp32",
            )
            model = model.to("cpu")
            # The pipeline is called with the raw prompt string, so there is
            # no separate tokenizer/processor to hand back.
            tokenizer = None
            processor = None
            print("SongGen (diffusers) loaded successfully!")
        except Exception as e2:
            print(f"Alternative load also failed: {e2}")
            # Surface the failure in the Gradio UI; keep the cause chained
            # for the server-side traceback.
            raise gr.Error(
                f"Failed to load SongGen model. Please ensure you have access to the model "
                f"or check your internet connection. Error: {str(e2)}"
            ) from e2

    _model, _tokenizer, _processor = model, tokenizer, processor
    return _model, _tokenizer, _processor
def _to_channels_first(audio) -> np.ndarray:
    """Return *audio* as a float32 array shaped ``(channels, frames)``.

    Accepts a torch tensor or anything ``np.asarray`` understands. Leading
    batch dimensions are dropped by taking the first item, 1-D input gets a
    channel axis, and a 2-D array whose first axis is the longer one is
    treated as ``(frames, channels)`` and transposed.

    Raises:
        ValueError: If the audio contains no samples.
    """
    if isinstance(audio, torch.Tensor):
        audio = audio.detach().cpu().numpy()
    audio = np.asarray(audio, dtype=np.float32)
    if audio.size == 0:
        raise ValueError("Model returned empty audio.")
    while audio.ndim > 2:
        audio = audio[0]  # keep only the first item of the batch
    if audio.ndim < 2:
        audio = audio.reshape(1, -1)
    # Audio has far more frames than channels, so the short axis is channels.
    if audio.shape[0] > audio.shape[1]:
        audio = audio.T
    return audio


def generate_music(
    prompt: str,
    duration: float,
    guidance_scale: float,
    num_inference_steps: int,
    temperature: float,
    top_k: int,
    top_p: float,
    progress: gr.Progress = gr.Progress()
) -> Tuple[Optional[str], str]:
    """Generate music from a text prompt and save it as a WAV file.

    Args:
        prompt: Text description of the music to generate.
        duration: Requested length of the audio in seconds.
        guidance_scale: Forwarded to the model as ``guidance_scale``.
        num_inference_steps: Denoising steps; only used by the diffusers
            pipeline path.
        temperature: Sampling temperature; only used by the ``generate`` path.
        top_k: Top-k sampling parameter; only used by the ``generate`` path.
        top_p: Nucleus sampling parameter; only used by the ``generate`` path.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        Tuple ``(audio_file_path, status_message)``. The path is ``None`` when
        the prompt is empty or generation fails; errors are reported through
        the status message instead of being raised.
    """
    if not prompt or not prompt.strip():
        return None, "❌ Please enter a prompt describing the music you want to generate."
    try:
        progress(0.1, desc="Loading SongGen model...")
        model, tokenizer, _ = load_model()

        progress(0.2, desc="Preparing inputs...")
        # NOTE(review): assumes ~50 audio tokens per second and 32 kHz output;
        # confirm both against the model that is actually loaded.
        sample_rate = 32000
        num_tokens = int(duration * 50)
        # Sliders may deliver floats; these two parameters must be integers.
        num_inference_steps = int(num_inference_steps)
        top_k = int(top_k)
        formatted_prompt = f"Generate music: {prompt}. Duration: {duration}s."

        progress(0.3, desc="Encoding prompt...")
        # The token-based path needs both a tokenizer and a generate() method;
        # everything else is treated as a diffusers-style callable pipeline.
        use_generate = tokenizer is not None and hasattr(model, 'generate')
        if tokenizer is not None:
            inputs = tokenizer(
                formatted_prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )

        progress(0.4, desc="Generating music with SongGen...")
        with torch.no_grad():
            if use_generate:
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=num_tokens,
                    do_sample=True,
                    temperature=temperature,
                    top_k=top_k,
                    top_p=top_p,
                    guidance_scale=guidance_scale,
                )
                if hasattr(model, 'decode_audio'):
                    audio = model.decode_audio(outputs)
                else:
                    # Generic fallback: the raw generated ids are used as the
                    # waveform. NOTE(review): this is unlikely to be real
                    # audio; confirm how this model decodes its tokens.
                    audio = outputs.float()
            else:
                result = model(
                    prompt=formatted_prompt,
                    num_inference_steps=num_inference_steps,
                    guidance_scale=guidance_scale,
                    audio_length_in_s=duration,
                )
                audio = result.audios[0]

        progress(0.8, desc="Processing audio...")
        audio = _to_channels_first(audio)
        # Peak-normalise; the epsilon keeps silent output from dividing by 0.
        audio = audio / (np.max(np.abs(audio)) + 1e-8)

        progress(0.9, desc="Saving audio...")
        # Reserve a path that outlives this function so Gradio can serve it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        try:
            # soundfile expects (frames, channels), hence the transpose for
            # every channel count, not just mono.
            sf.write(output_path, audio.T, sample_rate)
        except Exception:
            # Do not leave an empty temp file behind when the write fails.
            os.unlink(output_path)
            raise

        progress(1.0, desc="Complete!")
        actual_duration = audio.shape[-1] / sample_rate
        info_msg = f"""✅ Music generated successfully with SongGen!
🎵 **Prompt:** {prompt}
⏱️ **Duration:** {actual_duration:.2f}s
🎚️ **Sample Rate:** {sample_rate}Hz
🔧 **Settings:** guidance={guidance_scale}, steps={num_inference_steps}, temp={temperature}"""
        return output_path, info_msg
    except Exception as e:
        # UI boundary: log the full traceback server-side and return a
        # readable message instead of letting the request crash.
        import traceback
        print(f"Error: {e}")
        print(traceback.format_exc())
        return None, f"❌ Error generating music: {str(e)}\n\nPlease check that the SongGen model is properly installed and accessible."
def create_examples():
    """Return the preset rows offered in the examples table.

    Each row is a list in the order
    ``[prompt, duration, guidance_scale, steps, temperature, top_k, top_p]``.
    All presets use the same step count and top-k, so only the values that
    vary are spelled out per preset.
    """
    steps = 50
    top_k = 250
    # (prompt, duration, guidance_scale, temperature, top_p)
    presets = (
        ("Upbeat electronic dance music with a strong bass line and energetic synths", 8.0, 3.0, 1.0, 0.99),
        ("Calm ambient piano music with soft strings, peaceful and relaxing", 10.0, 3.5, 0.8, 0.95),
        ("Epic orchestral soundtrack with brass and percussion, cinematic and dramatic", 10.0, 4.0, 1.0, 0.99),
        ("Lo-fi hip hop beats with jazzy chords, chill and study music", 8.0, 2.5, 0.9, 0.95),
        ("Acoustic guitar folk melody, warm and nostalgic", 6.0, 3.0, 0.85, 0.95),
        ("Cyberpunk synthwave with retro 80s vibes, driving and energetic", 8.0, 3.0, 1.0, 0.99),
    )
    return [
        [text, seconds, guidance, steps, temp, top_k, nucleus]
        for text, seconds, guidance, temp, nucleus in presets
    ]
# Custom theme for the app: start from Gradio's Soft theme with an
# indigo/violet/slate palette, then override individual style variables.
_soft_base = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="violet",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="lg",
    radius_size="lg",
)
# Values starting with "*" refer to other theme variables by name.
custom_theme = _soft_base.set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
    block_title_text_size="*text_lg",
    block_background_fill="*neutral_50",
    block_border_width="1px",
    block_border_color="*neutral_200",
)
# Gradio 6: title goes in Blocks, NOT in launch()
# NOTE(review): the nesting below was reconstructed from a listing that had
# lost its indentation; confirm which controls belong inside the
# "Advanced Settings" accordion.
with gr.Blocks(title="SongGen AI Music Generator") as demo:
    # Header with branding
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
# 🎵 SongGen AI Music Generator
Generate custom music from text descriptions using **SongGen** -
a state-of-the-art neural audio generation model.
[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
""")
    with gr.Row():
        # Left panel - prompt, sampling controls, trigger button and status
        with gr.Column(scale=1):
            gr.Markdown("### 🎛️ Generation Settings")
            prompt_input = gr.Textbox(
                label="Music Description",
                placeholder="Describe the music you want to generate...",
                lines=3,
                info="Be specific about genre, instruments, mood, and tempo",
                value="Upbeat electronic dance music with energetic synths and strong bass"
            )
            # Each slider below feeds one parameter of generate_music();
            # the mapping is fixed by the `inputs` list of the click handler.
            with gr.Accordion("Advanced Settings", open=False):
                duration_slider = gr.Slider(
                    minimum=3,
                    maximum=20,
                    value=8,
                    step=0.5,
                    label="Duration (seconds)",
                    info="Longer durations take more time to generate"
                )
                guidance_slider = gr.Slider(
                    minimum=1.0,
                    maximum=10.0,
                    value=3.0,
                    step=0.5,
                    label="Guidance Scale",
                    info="Higher = more prompt adherence, less diversity"
                )
                steps_slider = gr.Slider(
                    minimum=10,
                    maximum=100,
                    value=50,
                    step=5,
                    label="Inference Steps",
                    info="More steps = higher quality but slower"
                )
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=1.0,
                    step=0.05,
                    label="Temperature",
                    info="Higher = more random/creative"
                )
                # NOTE(review): with minimum=1 and step=10 the default of 250
                # does not sit on the slider's step grid - confirm intended.
                topk_slider = gr.Slider(
                    minimum=1,
                    maximum=500,
                    value=250,
                    step=10,
                    label="Top-K",
                    info="Limits vocabulary for sampling"
                )
                topp_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.99,
                    step=0.01,
                    label="Top-P (Nucleus)",
                    info="Cumulative probability threshold"
                )
            generate_btn = gr.Button(
                "🎵 Generate Music with SongGen",
                variant="primary",
                size="lg"
            )
            # Read-only box that shows the status string returned by
            # generate_music() (success summary or error message).
            status_output = gr.Textbox(
                label="Status",
                lines=6,
                interactive=False
            )
        # Right panel - audio player and prompt-writing tips
        with gr.Column(scale=1):
            gr.Markdown("### 🎧 Generated Music")
            # type="filepath": the component is fed the WAV path returned by
            # generate_music(). sample_rate matches the 32000 Hz it writes.
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath",
                autoplay=False,
                buttons=["download"],
                waveform_options=gr.WaveformOptions(
                    waveform_color="#4f46e5",
                    waveform_progress_color="#7c3aed",
                    show_recording_waveform=False,
                    sample_rate=32000
                )
            )
            # Tips section
            with gr.Accordion("💡 Tips for Better Results", open=True):
                gr.Markdown("""
**Prompt Engineering Tips for SongGen:**
1. **Be specific about genre:** "electronic", "classical", "jazz", "rock"
2. **Mention instruments:** "piano", "synthesizers", "drums", "strings"
3. **Describe the mood:** "upbeat", "melancholic", "energetic", "calm"
4. **Add tempo hints:** "fast tempo", "slow ballad", "medium groove"
5. **Use reference styles:** "like 80s synthwave", "cinematic soundtrack"
**SongGen-specific tips:**
- SongGen works best with clear, descriptive prompts
- The model understands musical terminology well
- Try mentioning specific artists or styles for guidance
**Example prompts:**
- "Upbeat pop with catchy synth melody and electronic drums"
- "Sad piano ballad with emotional strings, slow tempo"
- "Heavy metal with distorted guitars and aggressive drums"
""")
    # Examples section - each row from create_examples() fills the seven
    # input components listed here, in the same order.
    gr.Markdown("### 🎯 Quick Examples")
    examples = gr.Examples(
        examples=create_examples(),
        inputs=[prompt_input, duration_slider, guidance_slider, steps_slider,
                temperature_slider, topk_slider, topp_slider],
        label="Click to load example",
        examples_per_page=3
    )
    # Footer
    gr.Markdown("""
---
Made with ❤️ using Gradio and SongGen |
[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
""")
    # Event handlers: the button runs generate_music(); the input order must
    # match its positional parameters (prompt, duration, guidance_scale,
    # num_inference_steps, temperature, top_k, top_p) and the two outputs
    # receive its (audio_file_path, status_message) return value.
    generate_btn.click(
        fn=generate_music,
        inputs=[
            prompt_input,
            duration_slider,
            guidance_slider,
            steps_slider,
            temperature_slider,
            topk_slider,
            topp_slider
        ],
        outputs=[audio_output, status_output],
        api_visibility="public"
    )
# Launch with Gradio 6 syntax: the theme and footer links are handed to
# launch(). The options are collected first so the call itself stays short.
_launch_options = dict(
    theme=custom_theme,
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "Gradio", "url": "https://gradio.app"},
        {"label": "Settings", "url": "#"},
    ],
    show_error=True,
    quiet=False,
)
demo.launch(**_launch_options)