File size: 7,740 Bytes
3f3e9b0
 
 
 
b4e351f
b86c392
34bb3fa
3f3e9b0
5d54434
3f3e9b0
 
34bb3fa
 
 
 
 
 
 
0ac2f0a
 
34bb3fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dbd358
34bb3fa
1dbd358
 
34bb3fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f052c96
34bb3fa
f052c96
34bb3fa
 
 
 
 
 
 
 
 
 
f052c96
34bb3fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f052c96
34bb3fa
 
 
 
 
 
 
 
afe1bd7
34bb3fa
 
 
 
afe1bd7
f052c96
34bb3fa
 
 
 
 
 
 
 
 
 
 
 
 
 
f052c96
5d54434
f052c96
34bb3fa
 
f052c96
34bb3fa
 
 
 
 
 
 
f052c96
 
34bb3fa
 
f052c96
 
34bb3fa
 
f052c96
34bb3fa
 
 
afe1bd7
34bb3fa
 
 
 
 
 
 
 
 
 
 
 
 
 
f052c96
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import gradio as gr
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import torch
import numpy as np
import uuid
import torchaudio

# Load Model
# Loaded once at module import and reused by every generation request.
# NOTE(review): weights are downloaded on first run; this blocks startup — confirm acceptable.
model = MusicGen.get_pretrained('facebook/musicgen-melody')

# --- STRUCTURE DEFINITIONS ---
# Maps a preset label (shown in the UI dropdown) to the ordered list of
# section names the generator renders one after another.
STRUCTURE_PRESETS = {
    "Radio Hit (Intro-Verse-Chorus-Drop)": [
        "Intro", "Verse", "Build-Up", "Drop/Chorus", "Outro",
    ],
    "Club Extended (Intro-Build-Drop-Break-Drop)": [
        "Intro", "Build-Up", "Drop", "Breakdown", "Drop 2", "Outro",
    ],
    "Hip Hop Classic (Intro-Verse-Hook-Verse-Hook)": [
        "Intro", "Verse 1", "Hook", "Verse 2", "Hook", "Outro",
    ],
    "Custom (Single Prompt)": ["Full Track"],
}

# --- STYLE PROMPTS FOR SECTIONS ---
# Per-genre text prompts describing how each named section should sound.
# Sections missing from a palette fall back to a generic prompt at
# generation time, so the three palettes need not cover identical keys.
_BASS_HOUSE_PROMPTS = {
    "Intro": "Atmospheric intro, filtered bass, simple hi-hats, tension building.",
    "Verse": "Minimal groove, shuffling hi-hats, deep sub-bass, metallic plucks, clean vocal chops.",
    "Build-Up": "Rising snare roll, pitch riser, white noise sweep, accelerating energy, hype vocals.",
    "Drop": "Explosive Bass House drop, metallic FM wobble bass, heavy kick, sidechain, festival energy.",
    "Breakdown": "Atmospheric pads, filtered chords, no drums, emotional melody.",
    "Outro": "Drum loop fading out, DJ friendly intro/outro, simple bass.",
}

_FUTURE_HOUSE_PROMPTS = {
    "Intro": "Bright piano chords, filtered kick, uplifting atmosphere.",
    "Verse": "Plucky melody, snapping fingers, deep house bass, clean production.",
    "Build-Up": "uplifting risers, snare build, hands in the air energy.",
    "Drop": "Bouncy Future House drop, metallic donk bass, shuffling beat, catchy lead.",
    "Outro": "Stripped back drums, fading melody.",
}

_DUBSTEP_PROMPTS = {
    "Intro": "Dark cinematic drone, ominous atmosphere, slow percussion.",
    "Build-Up": "Intense drum roll, siren rising, pre-drop vocal scream.",
    "Drop": "Heavy dubstep drop, mechanical growl bass, half-time drums, sub-bass impact.",
    "Breakdown": "Orchestral strings, emotional piano, dark ambience.",
    "Outro": "Dark fading textures, slow drum hit.",
}

SECTION_PROMPTS = {
    "Bass House (Jayms Style)": _BASS_HOUSE_PROMPTS,
    "Future House (Mainstage)": _FUTURE_HOUSE_PROMPTS,
    "Dubstep (Heavy)": _DUBSTEP_PROMPTS,
}

def generate_structured_song(audio_input, genre_style, structure_preset, total_duration, bpm, temp, cfg_scale, top_k, history):
    """Build a multi-section track from an input audio "DNA" clip.

    Renders each section of the chosen structure preset as a continuation of
    the previous audio (text prompt changes per section), stitches the new
    parts together, and writes the result to disk.

    Args:
        audio_input: Gradio numpy audio tuple ``(sample_rate, data)`` or None.
        genre_style: Key into SECTION_PROMPTS selecting the sound palette.
        structure_preset: Key into STRUCTURE_PRESETS selecting the section list.
        total_duration: Target track length in seconds (split evenly per section).
        bpm: Tempo injected into every text prompt.
        temp, cfg_scale, top_k: MusicGen sampling parameters.
        history: List of previously generated file paths (Gradio State).

    Returns:
        (output_path, updated_history_list) for the Audio and Files components,
        or (None, history) when no input audio was provided.
    """
    if audio_input is None:
        return None, history

    # 1. SETUP AUDIO INPUT (The DNA): peak-normalize to [-1, 1].
    sr, data = audio_input
    audio_data = data.astype(np.float32)
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data /= peak

    # Convert to a mono tensor of shape [1, T].
    # Gradio delivers mono as (T,) and multi-channel as (T, channels).
    # BUG FIX: the previous `.t()` + `shape[0] > 1` check averaged a 1-D mono
    # signal across *time* (collapsing it to a single sample); branch on rank.
    audio_tensor = torch.from_numpy(audio_data)
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)                    # (T,) -> [1, T]
    else:
        audio_tensor = audio_tensor.t().mean(dim=0, keepdim=True)   # (T, C) -> [1, T]

    # MusicGen operates at 32 kHz; resample if the input differs.
    if sr != 32000:
        resampler = torchaudio.transforms.Resample(sr, 32000)
        audio_tensor = resampler(audio_tensor)

    # 2. DETERMINE SECTIONS and split the requested length evenly among them.
    sections = STRUCTURE_PRESETS[structure_preset]
    sec_duration = total_duration / len(sections)

    # The user's input is the seed context for the first section.
    current_context = audio_tensor.unsqueeze(0)  # [1, 1, T]

    print(f"Generating {len(sections)} sections...")

    generated_parts = []

    # 3. GENERATION LOOP (The Architect)
    for section_name in sections:
        # Section-specific prompt if the palette defines one (e.g. "Drop/Chorus"
        # is not in the palettes and takes the generic fallback).
        if genre_style in SECTION_PROMPTS and section_name in SECTION_PROMPTS[genre_style]:
            specific_desc = SECTION_PROMPTS[genre_style][section_name]
        else:
            specific_desc = f"{genre_style} style, {section_name} section, dynamic change."

        final_prompt = f"{specific_desc} strictly {bpm} BPM, high fidelity."
        print(f"Rendering {section_name}: {final_prompt}")

        # Generate slightly more than needed to allow overlap/continuity.
        model.set_generation_params(
            duration=sec_duration + 2,  # +2s for overlap/continuity
            temperature=temp,
            cfg_coef=cfg_scale,
            top_k=int(top_k)
        )

        # Seed the continuation with the last 10 s of prior audio so the new
        # section flows, while the text prompt changes the energy.
        trim_len = 10 * 32000
        if current_context.shape[-1] > trim_len:
            input_seed = current_context[..., -trim_len:]
        else:
            input_seed = current_context

        next_chunk = model.generate_continuation(
            prompt=input_seed,
            prompt_sample_rate=32000,
            descriptions=[final_prompt],
            progress=True
        )

        # generate_continuation returns seed + new audio; keep only the new part
        # to avoid duplicating the seed in the stitched track.
        new_audio = next_chunk[..., input_seed.shape[-1]:]
        generated_parts.append(new_audio)

        # The full chunk (seed + new) becomes the context for the next section.
        current_context = next_chunk

    # 4. STITCHING: concatenate all newly generated parts along time.
    final_mix = torch.cat(generated_parts, dim=-1)

    # 5. EXPORT with loudness normalization; audio_write appends ".wav".
    uid = str(uuid.uuid4())[:8]
    filename = f"structured_song_{uid}"
    audio_write(filename, final_mix[0].cpu(), model.sample_rate, strategy="loudness")

    # BUG FIX: previously returned a hard-coded placeholder path instead of the
    # file that was actually written, so the player/history pointed nowhere.
    out_path = f"{filename}.wav"
    return out_path, [out_path] + history

# --- UI ---
# Gradio Blocks layout. Component creation order inside the context managers
# defines the on-screen layout, so statements must stay in this order.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="zinc")) as demo:
    # Per-session list of generated file paths.
    # NOTE(review): `history` is passed as an INPUT to the click handler but is
    # never listed as an OUTPUT, so the State is never written back — each click
    # prepends to the original empty list. Confirm whether cross-click history
    # accumulation is intended.
    history = gr.State([])
    gr.Markdown("# 🏗️ Infinite Song Architect: Structure Mode")
    gr.Markdown("Define a Verse-Chorus structure, and the AI will build each section dynamically.")

    with gr.Row():
        with gr.Column():
            # Seed audio; type="numpy" yields the (sample_rate, ndarray) tuple
            # expected by generate_structured_song.
            audio_in = gr.Audio(label="🎧 Input DNA (Chords/Bass)", type="numpy")

            with gr.Row():
                # Dropdown choices mirror the module-level preset dict keys.
                genre_drop = gr.Dropdown(list(SECTION_PROMPTS.keys()), value="Bass House (Jayms Style)", label="Sound Palette")
                struct_drop = gr.Dropdown(list(STRUCTURE_PRESETS.keys()), value="Radio Hit (Intro-Verse-Chorus-Drop)", label="Song Structure")

            with gr.Row():
                bpm_slider = gr.Slider(60, 180, 126, label="BPM")  # injected into every prompt
                len_slider = gr.Slider(30, 120, 60, step=10, label="Total Length")  # seconds

            with gr.Accordion("Fine Tune", open=False):
                # UI labels map to MusicGen params: cfg_coef / temperature / top_k.
                cfg_slider = gr.Slider(1, 15, 8, label="Strictness")
                temp_slider = gr.Slider(0.1, 1.5, 0.6, label="Creativity")
                top_k_slider = gr.Slider(10, 250, 50, label="Stability")

            btn = gr.Button("🏗️ BUILD FULL SONG", variant="primary")
            out_audio = gr.Audio(label="Full Track Output")

        with gr.Column():
            history_list = gr.Files(label="History")

    # Input order must match generate_structured_song's parameter order:
    # (audio, genre, structure, total_duration, bpm, temp, cfg, top_k, history).
    btn.click(generate_structured_song, 
              [audio_in, genre_drop, struct_drop, len_slider, bpm_slider, temp_slider, cfg_slider, top_k_slider, history], 
              [out_audio, history_list])

demo.launch()