# HuggingFace Spaces page header (scrape residue — the Space was listed as "Sleeping").
import uuid

import gradio as gr
import numpy as np
import torch
import torchaudio
from audiocraft.data.audio import audio_write
from audiocraft.models import MusicGen

# Load the melody-conditioned MusicGen checkpoint once at import time so every
# request reuses the same in-memory model (downloads weights on first run).
model = MusicGen.get_pretrained('facebook/musicgen-melody')
# --- STRUCTURE DEFINITIONS ---
# Named "roadmaps" for a song: each preset maps a human-readable label to the
# ordered list of sections the generation loop will render one by one.
STRUCTURE_PRESETS = {
    "Radio Hit (Intro-Verse-Chorus-Drop)": [
        "Intro", "Verse", "Build-Up", "Drop/Chorus", "Outro",
    ],
    "Club Extended (Intro-Build-Drop-Break-Drop)": [
        "Intro", "Build-Up", "Drop", "Breakdown", "Drop 2", "Outro",
    ],
    "Hip Hop Classic (Intro-Verse-Hook-Verse-Hook)": [
        "Intro", "Verse 1", "Hook", "Verse 2", "Hook", "Outro",
    ],
    # Single-section fallback: the whole track is generated from one prompt.
    "Custom (Single Prompt)": ["Full Track"],
}
# --- STYLE PROMPTS FOR SECTIONS ---
# Per-genre text descriptions for each section name. The generation loop looks
# up SECTION_PROMPTS[genre][section]; missing entries fall back to a generic
# prompt built from the genre and section names.
SECTION_PROMPTS = {
    "Bass House (Jayms Style)": {
        "Intro": "Atmospheric intro, filtered bass, simple hi-hats, tension building.",
        "Verse": "Minimal groove, shuffling hi-hats, deep sub-bass, metallic plucks, clean vocal chops.",
        "Build-Up": "Rising snare roll, pitch riser, white noise sweep, accelerating energy, hype vocals.",
        "Drop": "Explosive Bass House drop, metallic FM wobble bass, heavy kick, sidechain, festival energy.",
        "Breakdown": "Atmospheric pads, filtered chords, no drums, emotional melody.",
        "Outro": "Drum loop fading out, DJ friendly intro/outro, simple bass.",
    },
    "Future House (Mainstage)": {
        "Intro": "Bright piano chords, filtered kick, uplifting atmosphere.",
        "Verse": "Plucky melody, snapping fingers, deep house bass, clean production.",
        "Build-Up": "uplifting risers, snare build, hands in the air energy.",
        "Drop": "Bouncy Future House drop, metallic donk bass, shuffling beat, catchy lead.",
        "Outro": "Stripped back drums, fading melody.",
    },
    "Dubstep (Heavy)": {
        "Intro": "Dark cinematic drone, ominous atmosphere, slow percussion.",
        "Build-Up": "Intense drum roll, siren rising, pre-drop vocal scream.",
        "Drop": "Heavy dubstep drop, mechanical growl bass, half-time drums, sub-bass impact.",
        "Breakdown": "Orchestral strings, emotional piano, dark ambience.",
        "Outro": "Dark fading textures, slow drum hit.",
    },
}
def generate_structured_song(audio_input, genre_style, structure_preset, total_duration, bpm, temp, cfg_scale, top_k, history):
    """Build a multi-section track by chaining MusicGen continuations.

    The user's uploaded clip is used as the musical "DNA": each section is
    generated as a continuation of the last ~10 s of audio so far, while the
    text prompt changes per section to steer the energy.

    Args:
        audio_input: Gradio numpy audio tuple ``(sample_rate, data)`` or None.
        genre_style: Key into SECTION_PROMPTS selecting the sound palette.
        structure_preset: Key into STRUCTURE_PRESETS selecting the section list.
        total_duration: Target song length in seconds, split evenly per section.
        bpm: Tempo injected into every text prompt.
        temp: Sampling temperature for MusicGen.
        cfg_scale: Classifier-free guidance coefficient.
        top_k: Top-k sampling cutoff (cast to int for the model).
        history: List of previously generated file paths (Gradio state).

    Returns:
        Tuple of (output wav path, updated history list); (None, history)
        when no input audio was provided.
    """
    if audio_input is None:
        return None, history

    # 1. SETUP AUDIO INPUT (the DNA). Peak-normalize to [-1, 1].
    sr, data = audio_input
    audio_data = data.astype(np.float32)
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data /= peak

    audio_tensor = torch.from_numpy(audio_data)
    if audio_tensor.dim() == 1:
        # Mono input arrives as (T,). BUGFIX: the old code transposed first
        # (a no-op on 1-D) and then hit `shape[0] > 1` — the sample count —
        # so mono clips were mean-reduced to a single sample. Just add the
        # channel axis instead: (T,) -> (1, T).
        audio_tensor = audio_tensor.unsqueeze(0)
    else:
        # Gradio delivers multi-channel audio as (T, C); move channels first
        # and downmix to mono: (C, T) -> (1, T).
        audio_tensor = audio_tensor.t()
        if audio_tensor.shape[0] > 1:
            audio_tensor = audio_tensor.mean(dim=0, keepdim=True)

    # MusicGen expects 32 kHz input.
    if sr != 32000:
        audio_tensor = torchaudio.transforms.Resample(sr, 32000)(audio_tensor)

    # 2. DETERMINE SECTIONS and split the requested length evenly among them.
    # ("Custom (Single Prompt)" is just a one-section preset, so it needs no
    # special case here.)
    sections = STRUCTURE_PRESETS[structure_preset]
    sec_duration = total_duration / len(sections)

    # The user's clip seeds the first continuation; afterwards the context is
    # always the model's latest full output.
    current_context = audio_tensor.unsqueeze(0)  # [1, 1, T]
    generated_parts = []
    print(f"Generating {len(sections)} sections...")

    # 3. GENERATION LOOP (the Architect): one continuation call per section.
    for section_name in sections:
        # Per-section text prompt, with a generic fallback when the palette
        # has no entry for this section name.
        genre_sections = SECTION_PROMPTS.get(genre_style, {})
        if section_name in genre_sections:
            specific_desc = genre_sections[section_name]
        else:
            specific_desc = f"{genre_style} style, {section_name} section, dynamic change."
        final_prompt = f"{specific_desc} strictly {bpm} BPM, high fidelity."
        print(f"Rendering {section_name}: {final_prompt}")

        # Generate slightly more than the section's share to allow for
        # overlap/continuity at the seams.
        model.set_generation_params(
            duration=sec_duration + 2,
            temperature=temp,
            cfg_coef=cfg_scale,
            top_k=int(top_k),
        )

        # Seed with at most the last 10 s of context so each section flows
        # from the previous one while the text prompt shifts the energy.
        trim_len = 10 * 32000
        if current_context.shape[-1] > trim_len:
            input_seed = current_context[..., -trim_len:]
        else:
            input_seed = current_context

        next_chunk = model.generate_continuation(
            prompt=input_seed,
            prompt_sample_rate=32000,
            descriptions=[final_prompt],
            progress=True,
        )

        # generate_continuation returns seed + new audio; keep only the newly
        # generated tail to avoid duplicating the seed in the stitched song.
        new_audio = next_chunk[..., input_seed.shape[-1]:]
        generated_parts.append(new_audio)
        current_context = next_chunk

    # 4. STITCHING: concatenate all freshly generated tails along time.
    final_mix = torch.cat(generated_parts, dim=-1)

    # 5. EXPORT. audio_write takes a stem and appends ".wav" itself.
    uid = str(uuid.uuid4())[:8]
    filename = f"structured_song_{uid}"
    audio_write(filename, final_mix[0].cpu(), model.sample_rate, strategy="loudness")
    # BUGFIX: the scraped source returned a literal "(unknown).wav"
    # placeholder; return the actual path built from `filename`.
    out_path = f"{filename}.wav"
    return out_path, [out_path] + history
# --- UI ---
# Gradio front-end: left column holds the inputs and the build button, right
# column shows the history of generated files. (Indentation reconstructed from
# the obvious Blocks/Row/Column nesting of the scraped source.)
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="zinc")) as demo:
    history = gr.State([])
    gr.Markdown("# 🏗️ Infinite Song Architect: Structure Mode")
    gr.Markdown("Define a Verse-Chorus structure, and the AI will build each section dynamically.")
    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(label="🎧 Input DNA (Chords/Bass)", type="numpy")
            with gr.Row():
                genre_drop = gr.Dropdown(
                    list(SECTION_PROMPTS.keys()),
                    value="Bass House (Jayms Style)",
                    label="Sound Palette",
                )
                struct_drop = gr.Dropdown(
                    list(STRUCTURE_PRESETS.keys()),
                    value="Radio Hit (Intro-Verse-Chorus-Drop)",
                    label="Song Structure",
                )
            with gr.Row():
                bpm_slider = gr.Slider(60, 180, 126, label="BPM")
                len_slider = gr.Slider(30, 120, 60, step=10, label="Total Length")
            with gr.Accordion("Fine Tune", open=False):
                cfg_slider = gr.Slider(1, 15, 8, label="Strictness")
                temp_slider = gr.Slider(0.1, 1.5, 0.6, label="Creativity")
                top_k_slider = gr.Slider(10, 250, 50, label="Stability")
            btn = gr.Button("🏗️ BUILD FULL SONG", variant="primary")
            out_audio = gr.Audio(label="Full Track Output")
        with gr.Column():
            history_list = gr.Files(label="History")

    # Input order must match generate_structured_song's parameter order:
    # (audio, genre, structure, total_duration, bpm, temp, cfg_scale, top_k, history).
    btn.click(
        generate_structured_song,
        [audio_in, genre_drop, struct_drop, len_slider, bpm_slider, temp_slider, cfg_slider, top_k_slider, history],
        [out_audio, history_list],
    )

demo.launch()