File size: 7,740 Bytes
3f3e9b0
 
 
 
b4e351f
b86c392
34bb3fa
3f3e9b0
5d54434
3f3e9b0
 
34bb3fa
 
 
 
 
 
 
0ac2f0a
 
34bb3fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dbd358
34bb3fa
1dbd358
 
34bb3fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f052c96
34bb3fa
f052c96
34bb3fa
 
 
 
 
 
 
 
 
 
f052c96
34bb3fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f052c96
34bb3fa
 
 
 
 
 
 
 
afe1bd7
34bb3fa
 
 
 
afe1bd7
f052c96
34bb3fa
 
 
 
 
 
 
 
 
 
 
 
 
 
f052c96
5d54434
f052c96
34bb3fa
 
f052c96
34bb3fa
 
 
 
 
 
 
f052c96
 
34bb3fa
 
f052c96
 
34bb3fa
 
f052c96
34bb3fa
 
 
afe1bd7
34bb3fa
 
 
 
 
 
 
 
 
 
 
 
 
 
f052c96
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import gradio as gr
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import torch
import numpy as np
import uuid
import torchaudio

# Load Model
# Loaded once at module import and reused by every generation request.
# NOTE(review): weights are downloaded on first run; this blocks startup — confirm acceptable.
model = MusicGen.get_pretrained('facebook/musicgen-melody')

# --- STRUCTURE DEFINITIONS ---
# Maps a preset label (shown in the UI dropdown) to the ordered list of
# section names the generator renders one after another.
STRUCTURE_PRESETS = {
    "Radio Hit (Intro-Verse-Chorus-Drop)": [
        "Intro", "Verse", "Build-Up", "Drop/Chorus", "Outro",
    ],
    "Club Extended (Intro-Build-Drop-Break-Drop)": [
        "Intro", "Build-Up", "Drop", "Breakdown", "Drop 2", "Outro",
    ],
    "Hip Hop Classic (Intro-Verse-Hook-Verse-Hook)": [
        "Intro", "Verse 1", "Hook", "Verse 2", "Hook", "Outro",
    ],
    "Custom (Single Prompt)": ["Full Track"],
}

# --- STYLE PROMPTS FOR SECTIONS ---
# Per-genre text prompts describing how each named section should sound.
# Sections missing from a palette fall back to a generic prompt at
# generation time, so the three palettes need not cover identical keys.
_BASS_HOUSE_PROMPTS = {
    "Intro": "Atmospheric intro, filtered bass, simple hi-hats, tension building.",
    "Verse": "Minimal groove, shuffling hi-hats, deep sub-bass, metallic plucks, clean vocal chops.",
    "Build-Up": "Rising snare roll, pitch riser, white noise sweep, accelerating energy, hype vocals.",
    "Drop": "Explosive Bass House drop, metallic FM wobble bass, heavy kick, sidechain, festival energy.",
    "Breakdown": "Atmospheric pads, filtered chords, no drums, emotional melody.",
    "Outro": "Drum loop fading out, DJ friendly intro/outro, simple bass.",
}

_FUTURE_HOUSE_PROMPTS = {
    "Intro": "Bright piano chords, filtered kick, uplifting atmosphere.",
    "Verse": "Plucky melody, snapping fingers, deep house bass, clean production.",
    "Build-Up": "uplifting risers, snare build, hands in the air energy.",
    "Drop": "Bouncy Future House drop, metallic donk bass, shuffling beat, catchy lead.",
    "Outro": "Stripped back drums, fading melody.",
}

_DUBSTEP_PROMPTS = {
    "Intro": "Dark cinematic drone, ominous atmosphere, slow percussion.",
    "Build-Up": "Intense drum roll, siren rising, pre-drop vocal scream.",
    "Drop": "Heavy dubstep drop, mechanical growl bass, half-time drums, sub-bass impact.",
    "Breakdown": "Orchestral strings, emotional piano, dark ambience.",
    "Outro": "Dark fading textures, slow drum hit.",
}

SECTION_PROMPTS = {
    "Bass House (Jayms Style)": _BASS_HOUSE_PROMPTS,
    "Future House (Mainstage)": _FUTURE_HOUSE_PROMPTS,
    "Dubstep (Heavy)": _DUBSTEP_PROMPTS,
}

def generate_structured_song(audio_input, genre_style, structure_preset, total_duration, bpm, temp, cfg_scale, top_k, history):
    """Build a multi-section track from an input audio "DNA" clip.

    Renders each section of the chosen structure preset as a continuation of
    the previous audio (text prompt changes per section), stitches the new
    parts together, and writes the result to disk.

    Args:
        audio_input: Gradio numpy audio tuple ``(sample_rate, data)`` or None.
        genre_style: Key into SECTION_PROMPTS selecting the sound palette.
        structure_preset: Key into STRUCTURE_PRESETS selecting the section list.
        total_duration: Target track length in seconds (split evenly per section).
        bpm: Tempo injected into every text prompt.
        temp, cfg_scale, top_k: MusicGen sampling parameters.
        history: List of previously generated file paths (Gradio State).

    Returns:
        (output_path, updated_history_list) for the Audio and Files components,
        or (None, history) when no input audio was provided.
    """
    if audio_input is None:
        return None, history

    # 1. SETUP AUDIO INPUT (The DNA): peak-normalize to [-1, 1].
    sr, data = audio_input
    audio_data = data.astype(np.float32)
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data /= peak

    # Convert to a mono tensor of shape [1, T].
    # Gradio delivers mono as (T,) and multi-channel as (T, channels).
    # BUG FIX: the previous `.t()` + `shape[0] > 1` check averaged a 1-D mono
    # signal across *time* (collapsing it to a single sample); branch on rank.
    audio_tensor = torch.from_numpy(audio_data)
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)                    # (T,) -> [1, T]
    else:
        audio_tensor = audio_tensor.t().mean(dim=0, keepdim=True)   # (T, C) -> [1, T]

    # MusicGen operates at 32 kHz; resample if the input differs.
    if sr != 32000:
        resampler = torchaudio.transforms.Resample(sr, 32000)
        audio_tensor = resampler(audio_tensor)

    # 2. DETERMINE SECTIONS and split the requested length evenly among them.
    sections = STRUCTURE_PRESETS[structure_preset]
    sec_duration = total_duration / len(sections)

    # The user's input is the seed context for the first section.
    current_context = audio_tensor.unsqueeze(0)  # [1, 1, T]

    print(f"Generating {len(sections)} sections...")

    generated_parts = []

    # 3. GENERATION LOOP (The Architect)
    for section_name in sections:
        # Section-specific prompt if the palette defines one (e.g. "Drop/Chorus"
        # is not in the palettes and takes the generic fallback).
        if genre_style in SECTION_PROMPTS and section_name in SECTION_PROMPTS[genre_style]:
            specific_desc = SECTION_PROMPTS[genre_style][section_name]
        else:
            specific_desc = f"{genre_style} style, {section_name} section, dynamic change."

        final_prompt = f"{specific_desc} strictly {bpm} BPM, high fidelity."
        print(f"Rendering {section_name}: {final_prompt}")

        # Generate slightly more than needed to allow overlap/continuity.
        model.set_generation_params(
            duration=sec_duration + 2,  # +2s for overlap/continuity
            temperature=temp,
            cfg_coef=cfg_scale,
            top_k=int(top_k)
        )

        # Seed the continuation with the last 10 s of prior audio so the new
        # section flows, while the text prompt changes the energy.
        trim_len = 10 * 32000
        if current_context.shape[-1] > trim_len:
            input_seed = current_context[..., -trim_len:]
        else:
            input_seed = current_context

        next_chunk = model.generate_continuation(
            prompt=input_seed,
            prompt_sample_rate=32000,
            descriptions=[final_prompt],
            progress=True
        )

        # generate_continuation returns seed + new audio; keep only the new part
        # to avoid duplicating the seed in the stitched track.
        new_audio = next_chunk[..., input_seed.shape[-1]:]
        generated_parts.append(new_audio)

        # The full chunk (seed + new) becomes the context for the next section.
        current_context = next_chunk

    # 4. STITCHING: concatenate all newly generated parts along time.
    final_mix = torch.cat(generated_parts, dim=-1)

    # 5. EXPORT with loudness normalization; audio_write appends ".wav".
    uid = str(uuid.uuid4())[:8]
    filename = f"structured_song_{uid}"
    audio_write(filename, final_mix[0].cpu(), model.sample_rate, strategy="loudness")

    # BUG FIX: previously returned a hard-coded placeholder path instead of the
    # file that was actually written, so the player/history pointed nowhere.
    out_path = f"{filename}.wav"
    return out_path, [out_path] + history

# --- UI ---
# Gradio Blocks layout. Component creation order inside the context managers
# defines the on-screen layout, so statements must stay in this order.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="zinc")) as demo:
    # Per-session list of generated file paths.
    # NOTE(review): `history` is passed as an INPUT to the click handler but is
    # never listed as an OUTPUT, so the State is never written back — each click
    # prepends to the original empty list. Confirm whether cross-click history
    # accumulation is intended.
    history = gr.State([])
    gr.Markdown("# 🏗️ Infinite Song Architect: Structure Mode")
    gr.Markdown("Define a Verse-Chorus structure, and the AI will build each section dynamically.")

    with gr.Row():
        with gr.Column():
            # Seed audio; type="numpy" yields the (sample_rate, ndarray) tuple
            # expected by generate_structured_song.
            audio_in = gr.Audio(label="🎧 Input DNA (Chords/Bass)", type="numpy")

            with gr.Row():
                # Dropdown choices mirror the module-level preset dict keys.
                genre_drop = gr.Dropdown(list(SECTION_PROMPTS.keys()), value="Bass House (Jayms Style)", label="Sound Palette")
                struct_drop = gr.Dropdown(list(STRUCTURE_PRESETS.keys()), value="Radio Hit (Intro-Verse-Chorus-Drop)", label="Song Structure")

            with gr.Row():
                bpm_slider = gr.Slider(60, 180, 126, label="BPM")  # injected into every prompt
                len_slider = gr.Slider(30, 120, 60, step=10, label="Total Length")  # seconds

            with gr.Accordion("Fine Tune", open=False):
                # UI labels map to MusicGen params: cfg_coef / temperature / top_k.
                cfg_slider = gr.Slider(1, 15, 8, label="Strictness")
                temp_slider = gr.Slider(0.1, 1.5, 0.6, label="Creativity")
                top_k_slider = gr.Slider(10, 250, 50, label="Stability")

            btn = gr.Button("🏗️ BUILD FULL SONG", variant="primary")
            out_audio = gr.Audio(label="Full Track Output")

        with gr.Column():
            history_list = gr.Files(label="History")

    # Input order must match generate_structured_song's parameter order:
    # (audio, genre, structure, total_duration, bpm, temp, cfg, top_k, history).
    btn.click(generate_structured_song, 
              [audio_in, genre_drop, struct_drop, len_slider, bpm_slider, temp_slider, cfg_slider, top_k_slider, history], 
              [out_audio, history_list])

demo.launch()