# HuggingFace Spaces page header (scrape residue — the Space was listed as "Sleeping").
import uuid

import gradio as gr
import numpy as np
import torch
import torchaudio
from audiocraft.data.audio import audio_write
from audiocraft.models import MusicGen

# Load the melody-conditioned MusicGen checkpoint once at import time so every
# request reuses the same in-memory model (downloads weights on first run).
model = MusicGen.get_pretrained('facebook/musicgen-melody')
# --- STRUCTURE DEFINITIONS ---
# Named "roadmaps" for a song: each preset maps a human-readable label to the
# ordered list of sections the generation loop will render one by one.
STRUCTURE_PRESETS = {
    "Radio Hit (Intro-Verse-Chorus-Drop)": [
        "Intro", "Verse", "Build-Up", "Drop/Chorus", "Outro",
    ],
    "Club Extended (Intro-Build-Drop-Break-Drop)": [
        "Intro", "Build-Up", "Drop", "Breakdown", "Drop 2", "Outro",
    ],
    "Hip Hop Classic (Intro-Verse-Hook-Verse-Hook)": [
        "Intro", "Verse 1", "Hook", "Verse 2", "Hook", "Outro",
    ],
    # Single-section fallback: the whole track is generated from one prompt.
    "Custom (Single Prompt)": ["Full Track"],
}
# --- STYLE PROMPTS FOR SECTIONS ---
# Per-genre text descriptions for each section name. The generation loop looks
# up SECTION_PROMPTS[genre][section]; missing entries fall back to a generic
# prompt built from the genre and section names.
SECTION_PROMPTS = {
    "Bass House (Jayms Style)": {
        "Intro": "Atmospheric intro, filtered bass, simple hi-hats, tension building.",
        "Verse": "Minimal groove, shuffling hi-hats, deep sub-bass, metallic plucks, clean vocal chops.",
        "Build-Up": "Rising snare roll, pitch riser, white noise sweep, accelerating energy, hype vocals.",
        "Drop": "Explosive Bass House drop, metallic FM wobble bass, heavy kick, sidechain, festival energy.",
        "Breakdown": "Atmospheric pads, filtered chords, no drums, emotional melody.",
        "Outro": "Drum loop fading out, DJ friendly intro/outro, simple bass.",
    },
    "Future House (Mainstage)": {
        "Intro": "Bright piano chords, filtered kick, uplifting atmosphere.",
        "Verse": "Plucky melody, snapping fingers, deep house bass, clean production.",
        "Build-Up": "uplifting risers, snare build, hands in the air energy.",
        "Drop": "Bouncy Future House drop, metallic donk bass, shuffling beat, catchy lead.",
        "Outro": "Stripped back drums, fading melody.",
    },
    "Dubstep (Heavy)": {
        "Intro": "Dark cinematic drone, ominous atmosphere, slow percussion.",
        "Build-Up": "Intense drum roll, siren rising, pre-drop vocal scream.",
        "Drop": "Heavy dubstep drop, mechanical growl bass, half-time drums, sub-bass impact.",
        "Breakdown": "Orchestral strings, emotional piano, dark ambience.",
        "Outro": "Dark fading textures, slow drum hit.",
    },
}
def generate_structured_song(audio_input, genre_style, structure_preset, total_duration, bpm, temp, cfg_scale, top_k, history):
    """Build a multi-section track by chaining MusicGen continuations.

    The user's uploaded clip is used as the musical "DNA": each section is
    generated as a continuation of the last ~10 s of audio so far, while the
    text prompt changes per section to steer the energy.

    Args:
        audio_input: Gradio numpy audio tuple ``(sample_rate, data)`` or None.
        genre_style: Key into SECTION_PROMPTS selecting the sound palette.
        structure_preset: Key into STRUCTURE_PRESETS selecting the section list.
        total_duration: Target song length in seconds, split evenly per section.
        bpm: Tempo injected into every text prompt.
        temp: Sampling temperature for MusicGen.
        cfg_scale: Classifier-free guidance coefficient.
        top_k: Top-k sampling cutoff (cast to int for the model).
        history: List of previously generated file paths (Gradio state).

    Returns:
        Tuple of (output wav path, updated history list); (None, history)
        when no input audio was provided.
    """
    if audio_input is None:
        return None, history

    # 1. SETUP AUDIO INPUT (the DNA). Peak-normalize to [-1, 1].
    sr, data = audio_input
    audio_data = data.astype(np.float32)
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data /= peak

    audio_tensor = torch.from_numpy(audio_data)
    if audio_tensor.dim() == 1:
        # Mono input arrives as (T,). BUGFIX: the old code transposed first
        # (a no-op on 1-D) and then hit `shape[0] > 1` — the sample count —
        # so mono clips were mean-reduced to a single sample. Just add the
        # channel axis instead: (T,) -> (1, T).
        audio_tensor = audio_tensor.unsqueeze(0)
    else:
        # Gradio delivers multi-channel audio as (T, C); move channels first
        # and downmix to mono: (C, T) -> (1, T).
        audio_tensor = audio_tensor.t()
        if audio_tensor.shape[0] > 1:
            audio_tensor = audio_tensor.mean(dim=0, keepdim=True)

    # MusicGen expects 32 kHz input.
    if sr != 32000:
        audio_tensor = torchaudio.transforms.Resample(sr, 32000)(audio_tensor)

    # 2. DETERMINE SECTIONS and split the requested length evenly among them.
    # ("Custom (Single Prompt)" is just a one-section preset, so it needs no
    # special case here.)
    sections = STRUCTURE_PRESETS[structure_preset]
    sec_duration = total_duration / len(sections)

    # The user's clip seeds the first continuation; afterwards the context is
    # always the model's latest full output.
    current_context = audio_tensor.unsqueeze(0)  # [1, 1, T]
    generated_parts = []
    print(f"Generating {len(sections)} sections...")

    # 3. GENERATION LOOP (the Architect): one continuation call per section.
    for section_name in sections:
        # Per-section text prompt, with a generic fallback when the palette
        # has no entry for this section name.
        genre_sections = SECTION_PROMPTS.get(genre_style, {})
        if section_name in genre_sections:
            specific_desc = genre_sections[section_name]
        else:
            specific_desc = f"{genre_style} style, {section_name} section, dynamic change."
        final_prompt = f"{specific_desc} strictly {bpm} BPM, high fidelity."
        print(f"Rendering {section_name}: {final_prompt}")

        # Generate slightly more than the section's share to allow for
        # overlap/continuity at the seams.
        model.set_generation_params(
            duration=sec_duration + 2,
            temperature=temp,
            cfg_coef=cfg_scale,
            top_k=int(top_k),
        )

        # Seed with at most the last 10 s of context so each section flows
        # from the previous one while the text prompt shifts the energy.
        trim_len = 10 * 32000
        if current_context.shape[-1] > trim_len:
            input_seed = current_context[..., -trim_len:]
        else:
            input_seed = current_context

        next_chunk = model.generate_continuation(
            prompt=input_seed,
            prompt_sample_rate=32000,
            descriptions=[final_prompt],
            progress=True,
        )

        # generate_continuation returns seed + new audio; keep only the newly
        # generated tail to avoid duplicating the seed in the stitched song.
        new_audio = next_chunk[..., input_seed.shape[-1]:]
        generated_parts.append(new_audio)
        current_context = next_chunk

    # 4. STITCHING: concatenate all freshly generated tails along time.
    final_mix = torch.cat(generated_parts, dim=-1)

    # 5. EXPORT. audio_write takes a stem and appends ".wav" itself.
    uid = str(uuid.uuid4())[:8]
    filename = f"structured_song_{uid}"
    audio_write(filename, final_mix[0].cpu(), model.sample_rate, strategy="loudness")
    # BUGFIX: the scraped source returned a literal "(unknown).wav"
    # placeholder; return the actual path built from `filename`.
    out_path = f"{filename}.wav"
    return out_path, [out_path] + history
# --- UI ---
# Gradio front-end: left column holds the inputs and the build button, right
# column shows the history of generated files. (Indentation reconstructed from
# the obvious Blocks/Row/Column nesting of the scraped source.)
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="zinc")) as demo:
    history = gr.State([])
    gr.Markdown("# 🏗️ Infinite Song Architect: Structure Mode")
    gr.Markdown("Define a Verse-Chorus structure, and the AI will build each section dynamically.")
    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(label="🎧 Input DNA (Chords/Bass)", type="numpy")
            with gr.Row():
                genre_drop = gr.Dropdown(
                    list(SECTION_PROMPTS.keys()),
                    value="Bass House (Jayms Style)",
                    label="Sound Palette",
                )
                struct_drop = gr.Dropdown(
                    list(STRUCTURE_PRESETS.keys()),
                    value="Radio Hit (Intro-Verse-Chorus-Drop)",
                    label="Song Structure",
                )
            with gr.Row():
                bpm_slider = gr.Slider(60, 180, 126, label="BPM")
                len_slider = gr.Slider(30, 120, 60, step=10, label="Total Length")
            with gr.Accordion("Fine Tune", open=False):
                cfg_slider = gr.Slider(1, 15, 8, label="Strictness")
                temp_slider = gr.Slider(0.1, 1.5, 0.6, label="Creativity")
                top_k_slider = gr.Slider(10, 250, 50, label="Stability")
            btn = gr.Button("🏗️ BUILD FULL SONG", variant="primary")
            out_audio = gr.Audio(label="Full Track Output")
        with gr.Column():
            history_list = gr.Files(label="History")

    # Input order must match generate_structured_song's parameter order:
    # (audio, genre, structure, total_duration, bpm, temp, cfg_scale, top_k, history).
    btn.click(
        generate_structured_song,
        [audio_in, genre_drop, struct_drop, len_slider, bpm_slider, temp_slider, cfg_slider, top_k_slider, history],
        [out_audio, history_list],
    )

demo.launch()