import uuid

import gradio as gr
import numpy as np
import torch
import torchaudio
from audiocraft.data.audio import audio_write
from audiocraft.models import MusicGen

# Load the melody-conditioned MusicGen model once at startup.
# Heavy: downloads weights on first run and keeps them resident for all requests.
model = MusicGen.get_pretrained('facebook/musicgen-melody')

# --- STRUCTURE DEFINITIONS ---
# These act as "presets" for the song's roadmap: each value is the ordered
# list of sections the generator renders one after another.
STRUCTURE_PRESETS = {
    "Radio Hit (Intro-Verse-Chorus-Drop)": ["Intro", "Verse", "Build-Up", "Drop/Chorus", "Outro"],
    "Club Extended (Intro-Build-Drop-Break-Drop)": ["Intro", "Build-Up", "Drop", "Breakdown", "Drop 2", "Outro"],
    "Hip Hop Classic (Intro-Verse-Hook-Verse-Hook)": ["Intro", "Verse 1", "Hook", "Verse 2", "Hook", "Outro"],
    "Custom (Single Prompt)": ["Full Track"]
}

# --- STYLE PROMPTS FOR SECTIONS ---
# How each section should sound based on the genre. Section names that have
# no entry here (e.g. "Drop/Chorus", "Verse 1") fall back to a generic prompt
# built inside the generation loop.
SECTION_PROMPTS = {
    "Bass House (Jayms Style)": {
        "Intro": "Atmospheric intro, filtered bass, simple hi-hats, tension building.",
        "Verse": "Minimal groove, shuffling hi-hats, deep sub-bass, metallic plucks, clean vocal chops.",
        "Build-Up": "Rising snare roll, pitch riser, white noise sweep, accelerating energy, hype vocals.",
        "Drop": "Explosive Bass House drop, metallic FM wobble bass, heavy kick, sidechain, festival energy.",
        "Breakdown": "Atmospheric pads, filtered chords, no drums, emotional melody.",
        "Outro": "Drum loop fading out, DJ friendly intro/outro, simple bass."
    },
    "Future House (Mainstage)": {
        "Intro": "Bright piano chords, filtered kick, uplifting atmosphere.",
        "Verse": "Plucky melody, snapping fingers, deep house bass, clean production.",
        "Build-Up": "uplifting risers, snare build, hands in the air energy.",
        "Drop": "Bouncy Future House drop, metallic donk bass, shuffling beat, catchy lead.",
        "Outro": "Stripped back drums, fading melody."
    },
    "Dubstep (Heavy)": {
        "Intro": "Dark cinematic drone, ominous atmosphere, slow percussion.",
        "Build-Up": "Intense drum roll, siren rising, pre-drop vocal scream.",
        "Drop": "Heavy dubstep drop, mechanical growl bass, half-time drums, sub-bass impact.",
        "Breakdown": "Orchestral strings, emotional piano, dark ambience.",
        "Outro": "Dark fading textures, slow drum hit."
    }
}


def generate_structured_song(audio_input, genre_style, structure_preset,
                             total_duration, bpm, temp, cfg_scale, top_k, history):
    """Build a full track section-by-section from a short audio seed.

    The user's clip seeds the first section; each subsequent section is a
    MusicGen continuation of the tail of everything generated so far, with a
    per-section text prompt steering the energy.

    Args:
        audio_input: Gradio numpy audio tuple ``(sample_rate, samples)``, or
            None when nothing was uploaded.
        genre_style: key into SECTION_PROMPTS selecting the sound palette.
        structure_preset: key into STRUCTURE_PRESETS selecting the section list.
        total_duration: target song length in seconds, split evenly per section.
        bpm: tempo hint appended to every section prompt.
        temp: sampling temperature for MusicGen.
        cfg_scale: classifier-free-guidance coefficient.
        top_k: top-k sampling cutoff (cast to int before use).
        history: list of previously generated file paths (from gr.State).

    Returns:
        ``(output_path, updated_history, updated_history)`` — the list is
        returned twice so one copy feeds the Files widget and the other is
        written back into the gr.State component.
    """
    if audio_input is None:
        return None, history, history

    # 1. SETUP AUDIO INPUT (the "DNA" every section grows from)
    sr, data = audio_input
    audio_data = data.astype(np.float32)
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data /= peak  # peak-normalize so conditioning level is consistent

    audio_tensor = torch.from_numpy(audio_data)
    if audio_tensor.dim() == 1:
        # Mono arrives as (T,): just add the channel axis.
        # (The previous shape[0] > 1 check averaged a mono clip's samples
        # down to a single value — branch on rank instead.)
        audio_tensor = audio_tensor.unsqueeze(0)
    else:
        # Gradio delivers (T, channels); MusicGen wants (channels, T).
        # Downmix any multi-channel input to mono.
        audio_tensor = audio_tensor.t().mean(dim=0, keepdim=True)

    # MusicGen operates at 32 kHz; resample the seed if needed.
    if sr != 32000:
        audio_tensor = torchaudio.transforms.Resample(sr, 32000)(audio_tensor)

    # 2. DETERMINE SECTIONS and split the requested length evenly among them.
    # "Custom (Single Prompt)" is just a one-section preset ("Full Track")
    # and is covered by the generic fallback prompt below.
    sections = STRUCTURE_PRESETS[structure_preset]
    sec_duration = total_duration / len(sections)

    # Treat the user input as the seed context for the first section.
    current_context = audio_tensor.unsqueeze(0)  # [1, 1, T]

    print(f"Generating {len(sections)} sections...")
    generated_parts = []

    # 3. GENERATION LOOP (The Architect): one continuation per section.
    for section_name in sections:
        # Get the specific prompt for this section, falling back to a
        # generic "<genre> <section>" description when it's missing.
        if genre_style in SECTION_PROMPTS and section_name in SECTION_PROMPTS[genre_style]:
            specific_desc = SECTION_PROMPTS[genre_style][section_name]
        else:
            specific_desc = f"{genre_style} style, {section_name} section, dynamic change."

        final_prompt = f"{specific_desc} strictly {bpm} BPM, high fidelity."
        print(f"Rendering {section_name}: {final_prompt}")

        # Generate a bit more than needed so sections flow into each other.
        model.set_generation_params(
            duration=sec_duration + 2,  # +2s for overlap/continuity
            temperature=temp,
            cfg_coef=cfg_scale,
            top_k=int(top_k),
        )

        # Seed with the LAST 10 seconds of audio so the groove carries over
        # while the text prompt changes the energy of the new section.
        trim_len = int(10 * 32000)
        if current_context.shape[-1] > trim_len:
            input_seed = current_context[..., -trim_len:]
        else:
            input_seed = current_context

        next_chunk = model.generate_continuation(
            prompt=input_seed,
            prompt_sample_rate=32000,
            descriptions=[final_prompt],
            progress=True,
        )

        # generate_continuation returns the FULL audio (seed + new);
        # keep only the newly generated part to avoid duplication.
        generated_parts.append(next_chunk[..., input_seed.shape[-1]:])
        current_context = next_chunk

    # 4. STITCHING: butt-join the new parts in order.
    final_mix = torch.cat(generated_parts, dim=-1)

    # 5. EXPORT with a collision-resistant name; audio_write appends ".wav".
    uid = str(uuid.uuid4())[:8]
    filename = f"structured_song_{uid}"
    audio_write(filename, final_mix[0].cpu(), model.sample_rate, strategy="loudness")

    out_path = f"{filename}.wav"  # fixed: previously returned a mangled literal path
    new_history = [out_path] + history
    return out_path, new_history, new_history


# --- UI ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="zinc")) as demo:
    history = gr.State([])
    gr.Markdown("# 🏗️ Infinite Song Architect: Structure Mode")
    gr.Markdown("Define a Verse-Chorus structure, and the AI will build each section dynamically.")

    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(label="🎧 Input DNA (Chords/Bass)", type="numpy")
            with gr.Row():
                genre_drop = gr.Dropdown(list(SECTION_PROMPTS.keys()), value="Bass House (Jayms Style)", label="Sound Palette")
                struct_drop = gr.Dropdown(list(STRUCTURE_PRESETS.keys()), value="Radio Hit (Intro-Verse-Chorus-Drop)", label="Song Structure")
            with gr.Row():
                bpm_slider = gr.Slider(60, 180, 126, label="BPM")
                len_slider = gr.Slider(30, 120, 60, step=10, label="Total Length")
            with gr.Accordion("Fine Tune", open=False):
                cfg_slider = gr.Slider(1, 15, 8, label="Strictness")
                temp_slider = gr.Slider(0.1, 1.5, 0.6, label="Creativity")
                top_k_slider = gr.Slider(10, 250, 50, label="Stability")
            btn = gr.Button("🏗️ BUILD FULL SONG", variant="primary")
            out_audio = gr.Audio(label="Full Track Output")
        with gr.Column():
            history_list = gr.Files(label="History")

    # The third output writes the updated list back into the State so the
    # history actually accumulates across runs (it was never updated before).
    btn.click(
        generate_structured_song,
        [audio_in, genre_drop, struct_drop, len_slider, bpm_slider, temp_slider, cfg_slider, top_k_slider, history],
        [out_audio, history_list, history],
    )

demo.launch()