# PicUrTrack / app.py
# Author: Curlyblaze — "Update app.py" (commit 34bb3fa, verified)
import gradio as gr
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import torch
import numpy as np
import uuid
import torchaudio
# Load Model
# Melody-conditioned MusicGen checkpoint, used below via generate_continuation
# to extend raw audio. NOTE(review): loaded at import time, so app start-up
# blocks until the weights are available — confirm this is acceptable for the
# deployment target.
model = MusicGen.get_pretrained('facebook/musicgen-melody')
# --- STRUCTURE DEFINITIONS ---
# These act as "presets" for the song's roadmap.
# Maps the UI dropdown label to the ordered list of section names that
# generate_structured_song renders one after another. Section names are looked
# up in SECTION_PROMPTS; names missing there fall back to a generic prompt.
STRUCTURE_PRESETS = {
"Radio Hit (Intro-Verse-Chorus-Drop)": ["Intro", "Verse", "Build-Up", "Drop/Chorus", "Outro"],
"Club Extended (Intro-Build-Drop-Break-Drop)": ["Intro", "Build-Up", "Drop", "Breakdown", "Drop 2", "Outro"],
"Hip Hop Classic (Intro-Verse-Hook-Verse-Hook)": ["Intro", "Verse 1", "Hook", "Verse 2", "Hook", "Outro"],
# Single-section roadmap: the whole duration is rendered as one chunk.
"Custom (Single Prompt)": ["Full Track"]
}
# --- STYLE PROMPTS FOR SECTIONS ---
# How each section should sound based on the genre.
# Outer key: genre/palette label (UI dropdown). Inner key: section name as it
# appears in STRUCTURE_PRESETS. Values are the text descriptions fed to
# MusicGen for that section. Not every genre defines every section (e.g.
# "Dubstep (Heavy)" has no "Verse"); the generator falls back to a generic
# "<genre> style, <section> section" prompt for missing entries.
SECTION_PROMPTS = {
"Bass House (Jayms Style)": {
"Intro": "Atmospheric intro, filtered bass, simple hi-hats, tension building.",
"Verse": "Minimal groove, shuffling hi-hats, deep sub-bass, metallic plucks, clean vocal chops.",
"Build-Up": "Rising snare roll, pitch riser, white noise sweep, accelerating energy, hype vocals.",
"Drop": "Explosive Bass House drop, metallic FM wobble bass, heavy kick, sidechain, festival energy.",
"Breakdown": "Atmospheric pads, filtered chords, no drums, emotional melody.",
"Outro": "Drum loop fading out, DJ friendly intro/outro, simple bass."
},
"Future House (Mainstage)": {
"Intro": "Bright piano chords, filtered kick, uplifting atmosphere.",
"Verse": "Plucky melody, snapping fingers, deep house bass, clean production.",
"Build-Up": "uplifting risers, snare build, hands in the air energy.",
"Drop": "Bouncy Future House drop, metallic donk bass, shuffling beat, catchy lead.",
"Outro": "Stripped back drums, fading melody."
},
"Dubstep (Heavy)": {
"Intro": "Dark cinematic drone, ominous atmosphere, slow percussion.",
"Build-Up": "Intense drum roll, siren rising, pre-drop vocal scream.",
"Drop": "Heavy dubstep drop, mechanical growl bass, half-time drums, sub-bass impact.",
"Breakdown": "Orchestral strings, emotional piano, dark ambience.",
"Outro": "Dark fading textures, slow drum hit."
}
}
def generate_structured_song(audio_input, genre_style, structure_preset, total_duration, bpm, temp, cfg_scale, top_k, history):
    """Render a multi-section song conditioned on an input audio clip.

    Each section of the chosen structure preset is generated as a MusicGen
    continuation of the previous audio, changing only the text prompt so the
    energy/arrangement evolves while the musical material stays continuous.

    Parameters
    ----------
    audio_input : tuple[int, numpy.ndarray] | None
        Gradio "numpy" audio: (sample_rate, samples). None aborts generation.
    genre_style : str
        Key into SECTION_PROMPTS selecting the per-section text prompts.
    structure_preset : str
        Key into STRUCTURE_PRESETS selecting the ordered section roadmap.
    total_duration : float
        Target total length in seconds, split evenly across the sections.
    bpm : int
        Tempo hint appended to every section prompt.
    temp, cfg_scale, top_k :
        MusicGen sampling parameters (temperature, cfg_coef, top_k).
    history : list[str]
        Previously generated file paths; the new path is prepended.

    Returns
    -------
    tuple[str | None, list[str]]
        (path of the rendered wav, updated history list), or (None, history)
        when no input audio was provided.
    """
    if audio_input is None:
        return None, history

    # 1. Prepare the conditioning "DNA": mono float32 tensor at 32 kHz, [1, 1, T].
    current_context = _prepare_input_tensor(audio_input)

    # 2. Determine the section roadmap and the time budget per section.
    #    "Custom (Single Prompt)" is just a one-entry roadmap, so no special case.
    sections = STRUCTURE_PRESETS[structure_preset]
    sec_duration = total_duration / len(sections)

    print(f"Generating {len(sections)} sections...")
    generated_parts = []

    # 3. GENERATION LOOP — each section continues the previous audio.
    for section_name in sections:
        final_prompt = _section_prompt(genre_style, section_name, bpm)
        print(f"Rendering {section_name}: {final_prompt}")

        # Generate slightly longer than the budget so sections flow together.
        model.set_generation_params(
            duration=sec_duration + 2,  # +2s for overlap/continuity
            temperature=temp,
            cfg_coef=cfg_scale,
            top_k=int(top_k)
        )

        # Seed with at most the last 10 s of context: enough for musical flow
        # without handing the model an ever-growing conditioning prompt.
        trim_len = int(10 * 32000)
        if current_context.shape[-1] > trim_len:
            input_seed = current_context[..., -trim_len:]
        else:
            input_seed = current_context

        next_chunk = model.generate_continuation(
            prompt=input_seed,
            prompt_sample_rate=32000,
            descriptions=[final_prompt],
            progress=True
        )

        # generate_continuation returns seed + new audio; keep only the new part
        # to avoid duplicating the seed in the stitched track.
        generated_parts.append(next_chunk[..., input_seed.shape[-1]:])
        current_context = next_chunk

    # 4. STITCHING — concatenate the newly generated parts along time.
    final_mix = torch.cat(generated_parts, dim=-1)

    # 5. EXPORT with loudness normalization. audio_write appends ".wav" itself.
    uid = str(uuid.uuid4())[:8]
    filename = f"structured_song_{uid}"
    audio_write(filename, final_mix[0].cpu(), model.sample_rate, strategy="loudness")
    # FIX: previously returned a hard-coded "(unknown).wav" placeholder instead
    # of the freshly written file, so the UI never received the generated track.
    out_path = f"{filename}.wav"
    return out_path, [out_path] + history


def _prepare_input_tensor(audio_input):
    """Convert Gradio (sr, samples) audio into a [1, 1, T] mono tensor at 32 kHz."""
    sr, data = audio_input
    audio_data = data.astype(np.float32)
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data /= peak  # peak-normalize to [-1, 1]
    audio_tensor = torch.from_numpy(audio_data).t()  # (T, C) -> (C, T); no-op on 1-D
    # FIX: the original applied `mean(dim=0)` whenever shape[0] > 1, which
    # collapsed a 1-D mono clip (shape (T,)) to a single sample, and unsqueezed
    # an already-(1, T) tensor a second time. Normalize explicitly by rank:
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)               # mono 1-D -> [1, T]
    elif audio_tensor.shape[0] > 1:
        audio_tensor = audio_tensor.mean(dim=0, keepdim=True)  # multi-channel -> mono [1, T]
    # (a 2-D single-channel input is already [1, T])
    if sr != 32000:
        # MusicGen operates at 32 kHz; resample anything else.
        audio_tensor = torchaudio.transforms.Resample(sr, 32000)(audio_tensor)
    return audio_tensor.unsqueeze(0)  # [1, 1, T] batch for generate_continuation


def _section_prompt(genre_style, section_name, bpm):
    """Build the text prompt for one section, with a generic fallback when the
    genre palette has no entry for this section name."""
    palette = SECTION_PROMPTS.get(genre_style, {})
    specific_desc = palette.get(
        section_name,
        f"{genre_style} style, {section_name} section, dynamic change."
    )
    return f"{specific_desc} strictly {bpm} BPM, high fidelity."
# --- UI ---
# FIX: the pasted source had lost all block indentation (every line inside the
# `with gr.Blocks` context sat at column 0), which is a SyntaxError in Python.
# The nesting below is reconstructed from the component order: one Row holding
# two Columns (controls on the left, history on the right).
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="zinc")) as demo:
    # Per-session list of generated file paths.
    # NOTE(review): `history` is passed as an input but never written back via
    # an output, so the stored state stays [] between runs and the Files panel
    # only ever shows the latest track — confirm whether that is intentional.
    history = gr.State([])
    gr.Markdown("# 🏗️ Infinite Song Architect: Structure Mode")
    gr.Markdown("Define a Verse-Chorus structure, and the AI will build each section dynamically.")
    with gr.Row():
        with gr.Column():
            # Conditioning clip; delivered to the handler as (sample_rate, ndarray).
            audio_in = gr.Audio(label="🎧 Input DNA (Chords/Bass)", type="numpy")
            with gr.Row():
                genre_drop = gr.Dropdown(list(SECTION_PROMPTS.keys()), value="Bass House (Jayms Style)", label="Sound Palette")
                struct_drop = gr.Dropdown(list(STRUCTURE_PRESETS.keys()), value="Radio Hit (Intro-Verse-Chorus-Drop)", label="Song Structure")
            with gr.Row():
                bpm_slider = gr.Slider(60, 180, 126, label="BPM")
                len_slider = gr.Slider(30, 120, 60, step=10, label="Total Length")
            with gr.Accordion("Fine Tune", open=False):
                cfg_slider = gr.Slider(1, 15, 8, label="Strictness")        # cfg_coef
                temp_slider = gr.Slider(0.1, 1.5, 0.6, label="Creativity")  # temperature
                top_k_slider = gr.Slider(10, 250, 50, label="Stability")    # top_k
            btn = gr.Button("🏗️ BUILD FULL SONG", variant="primary")
            out_audio = gr.Audio(label="Full Track Output")
        with gr.Column():
            history_list = gr.Files(label="History")
    # Input order must match generate_structured_song's parameter order.
    btn.click(generate_structured_song,
              [audio_in, genre_drop, struct_drop, len_slider, bpm_slider, temp_slider, cfg_slider, top_k_slider, history],
              [out_audio, history_list])

demo.launch()