Spaces:
Sleeping
Sleeping
File size: 7,740 Bytes
3f3e9b0 b4e351f b86c392 34bb3fa 3f3e9b0 5d54434 3f3e9b0 34bb3fa 0ac2f0a 34bb3fa 1dbd358 34bb3fa 1dbd358 34bb3fa f052c96 34bb3fa f052c96 34bb3fa f052c96 34bb3fa f052c96 34bb3fa afe1bd7 34bb3fa afe1bd7 f052c96 34bb3fa f052c96 5d54434 f052c96 34bb3fa f052c96 34bb3fa f052c96 34bb3fa f052c96 34bb3fa f052c96 34bb3fa afe1bd7 34bb3fa f052c96 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 | import gradio as gr
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import torch
import numpy as np
import uuid
import torchaudio
# Load Model
# Module-level singleton: the checkpoint is fetched/loaded once at import time
# so every request handler reuses the same MusicGen instance.
model = MusicGen.get_pretrained('facebook/musicgen-melody')
# --- STRUCTURE DEFINITIONS ---
# These act as "presets" for the song's roadmap.
# Maps a user-facing preset label -> ordered list of section names; the total
# requested duration is split evenly across this list in
# generate_structured_song. Key order is user-visible (it drives the Dropdown).
STRUCTURE_PRESETS = {
    "Radio Hit (Intro-Verse-Chorus-Drop)": ["Intro", "Verse", "Build-Up", "Drop/Chorus", "Outro"],
    "Club Extended (Intro-Build-Drop-Break-Drop)": ["Intro", "Build-Up", "Drop", "Breakdown", "Drop 2", "Outro"],
    "Hip Hop Classic (Intro-Verse-Hook-Verse-Hook)": ["Intro", "Verse 1", "Hook", "Verse 2", "Hook", "Outro"],
    # Single-section preset: the whole track is generated in one pass.
    "Custom (Single Prompt)": ["Full Track"]
}
# --- STYLE PROMPTS FOR SECTIONS ---
# How each section should sound based on the genre.
# Maps genre label -> {section name -> text prompt fed to MusicGen}.
# NOTE: not every section name used in STRUCTURE_PRESETS has an entry here
# (e.g. "Drop/Chorus", "Hook", "Verse 1", "Drop 2"); generate_structured_song
# falls back to a generic "{genre} style, {section} section" prompt for those.
SECTION_PROMPTS = {
    "Bass House (Jayms Style)": {
        "Intro": "Atmospheric intro, filtered bass, simple hi-hats, tension building.",
        "Verse": "Minimal groove, shuffling hi-hats, deep sub-bass, metallic plucks, clean vocal chops.",
        "Build-Up": "Rising snare roll, pitch riser, white noise sweep, accelerating energy, hype vocals.",
        "Drop": "Explosive Bass House drop, metallic FM wobble bass, heavy kick, sidechain, festival energy.",
        "Breakdown": "Atmospheric pads, filtered chords, no drums, emotional melody.",
        "Outro": "Drum loop fading out, DJ friendly intro/outro, simple bass."
    },
    "Future House (Mainstage)": {
        "Intro": "Bright piano chords, filtered kick, uplifting atmosphere.",
        "Verse": "Plucky melody, snapping fingers, deep house bass, clean production.",
        "Build-Up": "uplifting risers, snare build, hands in the air energy.",
        "Drop": "Bouncy Future House drop, metallic donk bass, shuffling beat, catchy lead.",
        "Outro": "Stripped back drums, fading melody."
    },
    "Dubstep (Heavy)": {
        "Intro": "Dark cinematic drone, ominous atmosphere, slow percussion.",
        "Build-Up": "Intense drum roll, siren rising, pre-drop vocal scream.",
        "Drop": "Heavy dubstep drop, mechanical growl bass, half-time drums, sub-bass impact.",
        "Breakdown": "Orchestral strings, emotional piano, dark ambience.",
        "Outro": "Dark fading textures, slow drum hit."
    }
}
def generate_structured_song(audio_input, genre_style, structure_preset, total_duration, bpm, temp, cfg_scale, top_k, history):
    """Build a multi-section track by chaining MusicGen continuations.

    Each section of the chosen structure preset is generated as a continuation
    of the previous audio (so the music flows), while the text prompt changes
    per section to steer the energy.

    Args:
        audio_input: Gradio numpy audio tuple ``(sample_rate, data)`` used as
            the melodic "DNA" seed, or None.
        genre_style: Key into SECTION_PROMPTS selecting the prompt palette.
        structure_preset: Key into STRUCTURE_PRESETS selecting the section list.
        total_duration: Desired track length in seconds, split evenly per section.
        bpm: Tempo hint appended to every prompt.
        temp, cfg_scale, top_k: MusicGen sampling parameters.
        history: List of previously generated file paths (gr.State).

    Returns:
        (output_wav_path, updated_history_list), or (None, history) when no
        input audio was provided.
    """
    if audio_input is None:
        return None, history

    # 1. SETUP AUDIO INPUT (the seed "DNA")
    sr, data = audio_input
    audio_data = data.astype(np.float32)
    # Peak-normalize so integer PCM from Gradio lands in [-1, 1].
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data /= peak

    # Convert to a mono tensor shaped (1, T).
    tensor = torch.from_numpy(audio_data)
    if tensor.dim() == 1:
        # Mono arrives as (T,). BUG FIX: the previous code applied
        # mean(dim=0) here too, which collapsed a mono waveform to a
        # single sample because .t() is a no-op on 1-D tensors.
        audio_tensor = tensor.unsqueeze(0)
    else:
        # Multi-channel arrives as (T, C): transpose to (C, T), downmix.
        audio_tensor = tensor.t().mean(dim=0, keepdim=True)

    # MusicGen consumes 32 kHz audio.
    if sr != 32000:
        audio_tensor = torchaudio.transforms.Resample(sr, 32000)(audio_tensor)

    # 2. DETERMINE SECTIONS and split the requested time evenly among them.
    # (The "Custom (Single Prompt)" preset is just a one-entry section list,
    # so it needs no special casing.)
    sections = STRUCTURE_PRESETS[structure_preset]
    sec_duration = total_duration / len(sections)

    current_context = audio_tensor.unsqueeze(0)  # (1, 1, T)
    print(f"Generating {len(sections)} sections...")
    generated_parts = []

    # 3. GENERATION LOOP (The Architect)
    for section_name in sections:
        # Per-section prompt, with a generic fallback when the genre palette
        # has no entry for this section name.
        specific_desc = SECTION_PROMPTS.get(genre_style, {}).get(
            section_name,
            f"{genre_style} style, {section_name} section, dynamic change.",
        )
        final_prompt = f"{specific_desc} strictly {bpm} BPM, high fidelity."
        print(f"Rendering {section_name}: {final_prompt}")

        model.set_generation_params(
            duration=sec_duration + 2,  # +2s headroom for overlap/continuity
            temperature=temp,
            cfg_coef=cfg_scale,
            top_k=int(top_k),
        )

        # Seed the next section with at most the last 10 s of context so the
        # music flows while the text prompt changes the energy. A negative
        # slice larger than the tensor simply yields the whole tensor.
        trim_len = 10 * 32000
        input_seed = current_context[..., -trim_len:]

        next_chunk = model.generate_continuation(
            prompt=input_seed,
            prompt_sample_rate=32000,
            descriptions=[final_prompt],
            progress=True,
        )

        # generate_continuation returns seed + new audio; keep only the new
        # part to avoid duplicating the seed in the stitched track.
        generated_parts.append(next_chunk[..., input_seed.shape[-1]:])
        current_context = next_chunk

    # 4. STITCHING: concatenate all newly generated parts along time.
    final_mix = torch.cat(generated_parts, dim=-1)

    # 5. EXPORT. audio_write appends the ".wav" suffix itself.
    uid = str(uuid.uuid4())[:8]
    filename = f"structured_song_{uid}"
    audio_write(filename, final_mix[0].cpu(), model.sample_rate, strategy="loudness")
    out_path = f"{filename}.wav"  # BUG FIX: previously returned a literal placeholder path
    # NOTE(review): the UI never routes this second return value back into the
    # gr.State, so `history` stays empty across calls — confirm the intended wiring.
    return out_path, [out_path] + history
# --- UI ---
# --- UI ---
# Gradio app layout: left column holds all inputs plus the result player,
# right column lists previously generated files.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="zinc")) as demo:
    # Per-session list of generated file paths.
    # NOTE(review): `history` is passed as an INPUT to btn.click but is not an
    # OUTPUT, so the state list is never updated — confirm whether the History
    # panel is meant to accumulate across runs.
    history = gr.State([])
    gr.Markdown("# 🏗️ Infinite Song Architect: Structure Mode")
    gr.Markdown("Define a Verse-Chorus structure, and the AI will build each section dynamically.")
    with gr.Row():
        with gr.Column():
            # Seed audio; type="numpy" yields the (sample_rate, data) tuple
            # that generate_structured_song expects.
            audio_in = gr.Audio(label="🎧 Input DNA (Chords/Bass)", type="numpy")
            with gr.Row():
                # Choices come straight from the two module-level dicts.
                genre_drop = gr.Dropdown(list(SECTION_PROMPTS.keys()), value="Bass House (Jayms Style)", label="Sound Palette")
                struct_drop = gr.Dropdown(list(STRUCTURE_PRESETS.keys()), value="Radio Hit (Intro-Verse-Chorus-Drop)", label="Song Structure")
            with gr.Row():
                bpm_slider = gr.Slider(60, 180, 126, label="BPM")
                len_slider = gr.Slider(30, 120, 60, step=10, label="Total Length")
            with gr.Accordion("Fine Tune", open=False):
                cfg_slider = gr.Slider(1, 15, 8, label="Strictness")       # feeds cfg_scale (cfg_coef)
                temp_slider = gr.Slider(0.1, 1.5, 0.6, label="Creativity")  # feeds temp (temperature)
                top_k_slider = gr.Slider(10, 250, 50, label="Stability")    # feeds top_k
            btn = gr.Button("🏗️ BUILD FULL SONG", variant="primary")
            out_audio = gr.Audio(label="Full Track Output")
        with gr.Column():
            history_list = gr.Files(label="History")
    # Input order must match generate_structured_song's signature
    # (structure before duration, temperature before cfg).
    btn.click(generate_structured_song,
              [audio_in, genre_drop, struct_drop, len_slider, bpm_slider, temp_slider, cfg_slider, top_k_slider, history],
              [out_audio, history_list])
demo.launch()