""" BeatForge - AI music generation studio for Hugging Face CPU Basic. The default engine is a deterministic/procedural composer that turns structured lyrics and style tags into original instrumental audio. It is intentionally CPU-native so the Space runs on the free Hugging Face tier. The UI and function boundaries are ready for a future HeartMuLa/MusicGen GPU backend. """ from __future__ import annotations import math import os import re import tempfile import time import uuid from dataclasses import dataclass from pathlib import Path from typing import Dict, Iterable, List, Tuple import gradio as gr import numpy as np import requests import soundfile as sf from scipy import signal try: from pydub import AudioSegment except Exception: AudioSegment = None SR = 44100 MAX_SECONDS = 150 SECTION_RE = re.compile(r"^\s*\[([^\]]+)\]\s*$", re.MULTILINE) STYLE_PRESETS: Dict[str, Dict[str, object]] = { "pop": {"tempo": 112, "scale": "major", "drum": "four", "swing": 0.00, "brightness": 0.55}, "acoustic": {"tempo": 92, "scale": "major", "drum": "soft", "swing": 0.02, "brightness": 0.35}, "electronic": {"tempo": 124, "scale": "minor", "drum": "four", "swing": 0.00, "brightness": 0.75}, "synthwave": {"tempo": 104, "scale": "minor", "drum": "four", "swing": 0.01, "brightness": 0.70}, "rock": {"tempo": 128, "scale": "minor", "drum": "rock", "swing": 0.00, "brightness": 0.62}, "trap": {"tempo": 140, "scale": "minor", "drum": "trap", "swing": 0.04, "brightness": 0.66}, "lofi": {"tempo": 78, "scale": "minor", "drum": "lofi", "swing": 0.08, "brightness": 0.28}, "jazz": {"tempo": 96, "scale": "major", "drum": "brush", "swing": 0.16, "brightness": 0.42}, "cinematic": {"tempo": 76, "scale": "minor", "drum": "cinematic", "swing": 0.00, "brightness": 0.50}, } NOTE_ROOTS = { "C": 261.63, "C#": 277.18, "D": 293.66, "D#": 311.13, "E": 329.63, "F": 349.23, "F#": 369.99, "G": 392.00, "G#": 415.30, "A": 440.00, "A#": 466.16, "B": 493.88, } MAJOR = np.array([0, 2, 4, 5, 7, 9, 11]) MINOR = 
np.array([0, 2, 3, 5, 7, 8, 10]) CUSTOM_CSS = """ .gradio-container { max-width: 1180px !important; margin: 0 auto !important; font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif !important; } .hero { min-height: 340px; padding: 2.4rem 1.4rem 1.2rem; border-radius: 8px; color: #f8fafc; background-image: linear-gradient(rgba(8, 12, 18, 0.58), rgba(8, 12, 18, 0.78)), url('https://images.unsplash.com/photo-1493225457124-a3eb161ffa5f?auto=format&fit=crop&w=1800&q=80'); background-size: cover; background-position: center; border: 1px solid rgba(248, 113, 113, 0.28); margin-bottom: 1rem; display: flex; flex-direction: column; justify-content: flex-end; } .hero h1 { margin: 0 0 0.45rem 0 !important; color: #fff !important; font-size: 2.7rem !important; letter-spacing: 0; line-height: 1.05; } .hero p { margin: 0 !important; color: #dbeafe !important; font-size: 1rem; max-width: 760px; } .hero-row { display: flex; flex-wrap: wrap; gap: 0.55rem; margin-top: 1rem; align-items: center; } .badge { border-radius: 8px; border: 1px solid rgba(255, 255, 255, 0.24); background: rgba(255, 255, 255, 0.12); padding: 0.38rem 0.72rem; color: #fff; font-size: 0.82rem; } .brand-link { color: #ffffff !important; background: #dc2626; border-radius: 8px; padding: 0.48rem 0.82rem; font-weight: 800; text-decoration: none !important; } .panel { border-radius: 8px; border: 1px solid rgba(148, 163, 184, 0.18); padding: 1rem; } .status textarea { font-family: 'JetBrains Mono', Consolas, monospace !important; font-size: 0.86rem !important; } button, .gr-button { border-radius: 8px !important; } .notice { border: 1px solid rgba(14, 165, 233, 0.30); background: rgba(14, 165, 233, 0.08); padding: 1rem; border-radius: 8px; line-height: 1.55; } @media (max-width: 640px) { .hero { min-height: 300px; padding: 1.4rem 1rem 1rem; } .hero h1 { font-size: 2rem !important; } } """ @dataclass class Section: name: str text: str @dataclass class SongPlan: tempo: 
@dataclass
class SongPlan:
    """Complete arrangement plan derived from lyrics, tags, and settings."""

    tempo: int          # BPM, clamped to a sane range by choose_plan
    key: str            # root note name, e.g. "F#"
    scale: str          # "major" or "minor"
    drum: str           # drum pattern identifier consumed by render_drums
    swing: float        # swing offset as a fraction of one beat
    brightness: float   # 0..1 brightness driving low-pass cutoffs
    seed: int           # deterministic RNG seed derived from the inputs
    sections: List[Section]


def tmp_path(suffix: str) -> str:
    """Create an empty temporary file and return its filesystem path.

    ``delete=False`` keeps the file on disk for the caller to write into.
    Fix over the original: the NamedTemporaryFile handle is now closed via a
    context manager instead of being left dangling (a resource leak).
    """
    prefix = f"beatforge_{uuid.uuid4().hex[:10]}_"
    with tempfile.NamedTemporaryFile(prefix=prefix, suffix=suffix, delete=False) as handle:
        return handle.name


def normalize(audio: np.ndarray, peak: float = 0.94) -> np.ndarray:
    """Replace non-finite samples with 0 and scale down so ``|audio| <= peak``.

    Signals already under the peak are left untouched (no make-up gain).
    Returns float32; an empty array is returned as-is.
    """
    audio = np.nan_to_num(audio.astype(np.float32), nan=0.0, posinf=0.0, neginf=0.0)
    if audio.size == 0:
        return audio
    max_val = float(np.max(np.abs(audio)))
    if max_val > peak:
        audio = audio / max_val * peak
    return audio.astype(np.float32)


def parse_sections(lyrics: str) -> List[Section]:
    """Split lyrics into ``[Header]``-delimited sections.

    Fallbacks: empty input yields a single instrumental section; input with
    no headers becomes one "Verse".
    """
    lyrics = (lyrics or "").strip()
    if not lyrics:
        return [Section("Instrumental", "Open instrumental theme")]
    matches = list(SECTION_RE.finditer(lyrics))
    if not matches:
        return [Section("Verse", lyrics)]
    sections: List[Section] = []
    for idx, match in enumerate(matches):
        start = match.end()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(lyrics)
        text = lyrics[start:end].strip()
        # A header with an empty body falls back to the header text itself
        # so every section still contributes material to the arrangement.
        sections.append(Section(match.group(1).strip().title(), text or match.group(1).strip()))
    return sections or [Section("Verse", lyrics)]


def stable_seed(*parts: str) -> int:
    """Deterministic 32-bit FNV-1a-style hash of the joined inputs.

    Used instead of ``hash()`` so seeds are stable across processes.
    """
    data = "|".join(parts)
    value = 2166136261
    for ch in data:
        value ^= ord(ch)
        value = (value * 16777619) & 0xFFFFFFFF
    return value


def choose_plan(lyrics: str, tags: str, duration: int, creativity: float) -> SongPlan:
    """Build a SongPlan from free-form style tags and the lyric text.

    The first preset whose name appears in the tags wins; mood/speed words
    then override scale and tempo. The key and tempo jitter are derived
    deterministically from a stable seed so identical inputs reproduce the
    same track.
    """
    tag_text = (tags or "").lower()
    preset = STYLE_PRESETS["pop"].copy()
    for name, values in STYLE_PRESETS.items():
        if name in tag_text:
            preset.update(values)
            break
    if "happy" in tag_text or "bright" in tag_text or "uplifting" in tag_text:
        preset["scale"] = "major"
    if "dark" in tag_text or "sad" in tag_text or "moody" in tag_text:
        preset["scale"] = "minor"
    if "slow" in tag_text:
        preset["tempo"] = max(64, int(preset["tempo"]) - 18)
    if "fast" in tag_text or "energetic" in tag_text:
        preset["tempo"] = min(150, int(preset["tempo"]) + 16)
    # Only the first 800 chars of lyrics feed the seed: enough to be unique,
    # cheap to hash.
    seed = stable_seed(lyrics[:800], tags, str(duration), str(creativity))
    keys = list(NOTE_ROOTS.keys())
    key = keys[seed % len(keys)]
    tempo_jitter = int((creativity - 1.0) * 10)
    return SongPlan(
        tempo=int(np.clip(int(preset["tempo"]) + tempo_jitter, 62, 156)),
        key=key,
        scale=str(preset["scale"]),
        drum=str(preset["drum"]),
        swing=float(preset["swing"]),
        brightness=float(preset["brightness"]),
        seed=seed,
        sections=parse_sections(lyrics),
    )
def note_freq(root: str, semitone: int, octave_shift: int = 0) -> float:
    """Frequency in Hz of *semitone* steps above *root*, shifted by octaves."""
    steps = semitone + 12 * octave_shift
    return NOTE_ROOTS[root] * (2 ** (steps / 12.0))


def envelope(length: int, attack: float, release: float) -> np.ndarray:
    """Linear fade-in/fade-out gain curve of *length* samples."""
    rise = min(length, max(1, int(attack * SR)))
    fall = min(length, max(1, int(release * SR)))
    shape = np.ones(length, dtype=np.float32)
    shape[:rise] = np.linspace(0, 1, rise)
    shape[-fall:] *= np.linspace(1, 0, fall)
    return shape


def osc(freq: float, seconds: float, kind: str = "sine", phase: float = 0.0) -> np.ndarray:
    """Render one oscillator cycle train; unknown kinds fall back to sine."""
    samples = max(1, int(seconds * SR))
    t = np.arange(samples, dtype=np.float32) / SR
    arg = 2 * np.pi * freq * t + phase
    if kind == "saw":
        wave = signal.sawtooth(arg)
    elif kind == "square":
        wave = signal.square(arg)
    elif kind == "tri":
        # A sawtooth with 50% rise time is a triangle wave.
        wave = signal.sawtooth(arg, width=0.5)
    else:
        wave = np.sin(arg)
    return wave.astype(np.float32)


def add_at(track: np.ndarray, start: int, audio: np.ndarray, gain: float = 1.0) -> None:
    """Mix *audio* into *track* in place at sample *start*, clipped to fit."""
    if start >= len(track):
        return
    stop = min(len(track), start + len(audio))
    track[start:stop] += audio[: stop - start] * gain


def kick() -> np.ndarray:
    """Pitch-swept sine kick with a short seeded noise click on top."""
    n = int(0.34 * SR)
    t = np.arange(n) / SR
    sweep = 92 * np.exp(-t * 18) + 38
    body = np.sin(2 * np.pi * np.cumsum(sweep) / SR) * np.exp(-t * 9)
    click = np.random.default_rng(7).normal(0, 0.018, n) * np.exp(-t * 85)
    return normalize((body + click).astype(np.float32), 0.95)


def snare() -> np.ndarray:
    """Band-passed noise burst layered with a decaying 190 Hz tone."""
    n = int(0.22 * SR)
    decay = np.arange(n) / SR
    rattle = np.random.default_rng(11).normal(0, 1, n).astype(np.float32)
    band = signal.butter(2, [1400, 7200], btype="bandpass", fs=SR, output="sos")
    rattle = signal.sosfilt(band, rattle) * np.exp(-decay * 16)
    body = osc(190, n / SR, "sine") * np.exp(-decay * 22)
    return normalize(rattle * 0.7 + body * 0.35, 0.8)


def hat() -> np.ndarray:
    """Short high-passed noise tick used as the hi-hat."""
    n = int(0.08 * SR)
    tick = np.random.default_rng(19).normal(0, 1, n).astype(np.float32)
    hp = signal.butter(2, 7000, btype="highpass", fs=SR, output="sos")
    tick = signal.sosfilt(hp, tick) * np.exp(-np.arange(n) / SR * 55)
    return normalize(tick, 0.45)


def render_drums(length: int, tempo: int, pattern: str, swing: float, rng: np.random.Generator) -> np.ndarray:
    """Schedule kick/snare/hat hits for *pattern* over *length* samples.

    The hi-hat layer draws from *rng* in a fixed order, so a given generator
    state always reproduces the same groove.
    """
    out = np.zeros(length, dtype=np.float32)
    beat = 60.0 / tempo
    kick_hit, snare_hit, hat_hit = kick(), snare(), hat()
    total = int((length / SR) / beat) + 2
    for idx in range(total):
        t0 = idx * beat
        pos = idx % 4
        at = int(t0 * SR)
        if pattern in {"four", "electronic"}:
            add_at(out, at, kick_hit, 0.9)
            if pos in {1, 3}:
                add_at(out, at, snare_hit, 0.62)
        elif pattern == "rock":
            if pos in {0, 2}:
                add_at(out, at, kick_hit, 0.95)
            if pos in {1, 3}:
                add_at(out, at, snare_hit, 0.78)
            if pos == 2:
                # Syncopated pickup kick on the "and" of beat three.
                add_at(out, int((t0 + beat * 0.5) * SR), kick_hit, 0.55)
        elif pattern == "trap":
            if pos in {0, 2}:
                add_at(out, at, kick_hit, 0.9)
            if pos == 3:
                add_at(out, at, snare_hit, 0.7)
        elif pattern == "cinematic":
            # Sparse hits every two bars for a cinematic pulse.
            if idx % 8 == 0:
                add_at(out, at, kick_hit, 0.85)
            if idx % 8 == 6:
                add_at(out, at, snare_hit, 0.45)
        else:
            # Soft default (lofi/brush/soft): quiet backbeat.
            if pos in {0, 2}:
                add_at(out, at, kick_hit, 0.42)
            if pos == 3:
                add_at(out, at, snare_hit, 0.34)
        # Two eighth-note hats per beat; the off-beat one is swung.
        for half in range(2):
            hit = t0 + half * beat * 0.5
            if half == 1:
                hit += beat * swing
            level = 0.22 + 0.10 * rng.random()
            # NOTE: short-circuit keeps rng call order identical for
            # non-trap patterns (second random() only drawn for trap).
            if pattern == "trap" and rng.random() < 0.35:
                add_at(out, int((hit + beat * 0.25) * SR), hat_hit, level * 0.75)
            add_at(out, int(hit * SR), hat_hit, level)
    return normalize(out, 0.85)


def render_chord(freqs: Iterable[float], seconds: float, brightness: float) -> np.ndarray:
    """Stack saw + octave-triangle partials per chord tone, then low-pass."""
    tones = list(freqs)
    n = max(1, int(seconds * SR))
    mix = np.zeros(n, dtype=np.float32)
    for rank, tone in enumerate(tones):
        mix += osc(tone, seconds, "saw", phase=rank * 0.2) * (0.32 / (rank + 1))
        mix += osc(tone * 2, seconds, "tri", phase=rank * 0.1) * 0.08
    lowpass = signal.butter(2, 900 + brightness * 3200, btype="lowpass", fs=SR, output="sos")
    mix = signal.sosfilt(lowpass, mix)
    return mix * envelope(n, 0.05, 0.18)
def render_tone(freq: float, seconds: float, kind: str, gain: float) -> np.ndarray:
    """Render one note: oscillator, optional low-pass, short envelope, gain."""
    n = max(1, int(seconds * SR))
    tone = osc(freq, seconds, kind)
    if kind != "sine":
        # Non-sine waves are harmonically rich; tame the highs.
        sos = signal.butter(2, 2400, btype="lowpass", fs=SR, output="sos")
        tone = signal.sosfilt(sos, tone)
    return tone * envelope(n, 0.01, 0.05) * gain


def section_weight(name: str) -> float:
    """Relative energy/length weight for a lyric section by its header name."""
    low = name.lower()
    if "chorus" in low or "hook" in low:
        return 1.28
    if "bridge" in low:
        return 1.10
    if "intro" in low or "outro" in low:
        return 0.72
    return 1.0


def render_track(lyrics: str, tags: str, duration: int, creativity: float, diversity: int, cfg: float) -> Tuple[str, str]:
    """Compose and export a full track; returns (audio_path, stats_text).

    Layers drums, a chord progression, a bass line, and a sparse melody per
    lyric section, all driven by one seeded RNG so output is deterministic
    for identical inputs. NOTE(review): statement order matters — the RNG is
    consumed sequentially, so reordering draws changes the result.
    """
    duration = int(np.clip(duration, 15, MAX_SECONDS))
    plan = choose_plan(lyrics, tags, duration, creativity)
    rng = np.random.default_rng(plan.seed)
    length = duration * SR
    scale = MAJOR if plan.scale == "major" else MINOR
    root = plan.key
    beat = 60.0 / plan.tempo
    track = np.zeros(length, dtype=np.float32)
    # cfg scales the drum level slightly (0.36..0.36 + 0.06*cfg).
    drums = render_drums(length, plan.tempo, plan.drum, plan.swing, rng) * (0.36 + 0.06 * cfg)
    track += drums
    # Split the duration among sections proportionally to their weight,
    # with a 4-second floor per section.
    weights = np.array([section_weight(s.name) for s in plan.sections], dtype=np.float32)
    sec_lengths = np.maximum(4.0, duration * weights / weights.sum())
    starts = np.cumsum(np.concatenate([[0.0], sec_lengths[:-1]]))
    # I-vi-IV-V-ish loop for major, i-VI-VII-iv-ish for minor (scale degrees).
    progression = [0, 5, 3, 4] if plan.scale == "major" else [0, 5, 6, 3]
    for sec_idx, (section, sec_start, sec_len) in enumerate(zip(plan.sections, starts, sec_lengths)):
        energy = section_weight(section.name)
        words = re.findall(r"[A-Za-z']+", section.text)
        # Rough syllable count (len//4 per word) seeds melody note choice.
        syllable_proxy = max(4, sum(max(1, len(w) // 4) for w in words))
        bars = max(1, int(sec_len / (beat * 4)))
        for bar in range(bars + 1):
            t0 = sec_start + bar * beat * 4
            if t0 >= duration:
                break
            degree = progression[(bar + sec_idx) % len(progression)]
            # Stack a triad: root, third, fifth (as scale degrees mod 7).
            chord_degrees = [degree, (degree + 2) % 7, (degree + 4) % 7]
            chord_freqs = [note_freq(root, int(scale[d]), octave_shift=-1) for d in chord_degrees]
            chord = render_chord(chord_freqs, min(beat * 3.8, duration - t0), plan.brightness)
            add_at(track, int(t0 * SR), chord, 0.24 * energy)
            bass_degree = int(scale[degree])
            for step in range(4):
                bt = t0 + step * beat
                if bt >= duration:
                    continue
                bass_freq = note_freq(root, bass_degree, octave_shift=-2)
                bass = render_tone(bass_freq, beat * 0.82, "sine", 0.28 * energy)
                add_at(track, int(bt * SR), bass, 1.0)
        # Sparse eighth-note melody; density rises with creativity.
        melody_steps = min(int(sec_len / (beat * 0.5)), 96)
        for m in range(melody_steps):
            if rng.random() > 0.72 + (creativity - 1.0) * 0.25:
                continue
            # Off-beat melody notes get the same swing as the hats.
            mt = sec_start + m * beat * 0.5 + (beat * plan.swing if m % 2 else 0)
            if mt >= duration:
                continue
            idx = (m + syllable_proxy + sec_idx * 2 + int(rng.integers(0, max(2, diversity // 15)))) % len(scale)
            octave = 0 if rng.random() < 0.75 else 1
            mf = note_freq(root, int(scale[idx]), octave_shift=octave)
            lead = render_tone(mf, beat * (0.34 + 0.20 * rng.random()), "tri", 0.17 * energy)
            add_at(track, int(mt * SR), lead, 1.0)
    # Tag-driven post effects: vinyl crackle and a simple stereo-ish widener.
    if "vinyl" in tags.lower() or "lofi" in tags.lower():
        noise = rng.normal(0, 0.008, length).astype(np.float32)
        sos = signal.butter(2, 5000, btype="lowpass", fs=SR, output="sos")
        track += signal.sosfilt(sos, noise) * 0.5
    if "wide" in tags.lower() or "ambient" in tags.lower() or "cinematic" in tags.lower():
        pad = np.roll(track, int(0.028 * SR)) * 0.12 + np.roll(track, int(0.061 * SR)) * 0.08
        track += pad
    track = master(track, plan.brightness)
    out = export_audio(track, "mp3")
    stats = (
        "Generated on BeatForge CPU Composer.\n\n"
        f"Key: {plan.key} {plan.scale}\n"
        f"Tempo: {plan.tempo} BPM\n"
        f"Sections: {', '.join(s.name for s in plan.sections)}\n"
        f"Duration: {duration}s\n"
        f"Engine: free-tier procedural composer\n\n"
        "For neural HeartMuLa quality, upgrade this Space to "
        "GPU and connect the HeartMuLa backend."
    )
    return out, stats


def master(audio: np.ndarray, brightness: float) -> np.ndarray:
    """Final bus: DC/rumble high-pass, optional presence boost, soft clip."""
    sos_hp = signal.butter(2, 32, btype="highpass", fs=SR, output="sos")
    audio = signal.sosfilt(sos_hp, audio)
    if brightness > 0.55:
        # First-difference filter adds a touch of high-frequency emphasis.
        audio += signal.lfilter([1, -0.96], [1], audio) * 0.04
    # tanh soft-clips peaks before the final normalize.
    audio = np.tanh(audio * 1.45) * 0.82
    return normalize(audio, 0.94)


def export_audio(audio: np.ndarray, output_format: str) -> str:
    """Write audio to a temp WAV; transcode to MP3 if pydub is available.

    Falls back to returning the WAV path if MP3 export fails or pydub is
    missing (best-effort, logged to stdout).
    """
    wav = tmp_path(".wav")
    sf.write(wav, normalize(audio), SR, subtype="PCM_16")
    if output_format == "mp3" and AudioSegment is not None:
        try:
            mp3 = tmp_path(".mp3")
            AudioSegment.from_wav(wav).export(mp3, format="mp3", bitrate="192k")
            return mp3
        except Exception as exc:
            print(f"MP3 export failed, returning WAV: {exc}")
    return wav


def generate_track(lyrics: str, tags: str, duration: int, creativity: float, diversity: int, cfg: float, progress=gr.Progress()):
    """Gradio generator callback: yields (audio_path_or_None, status_text).

    Yields intermediate status messages so the UI updates while rendering;
    gr.Progress() as a default argument is the Gradio progress-bar idiom.
    """
    if not lyrics or not lyrics.strip():
        yield None, "Add lyrics or section notes before generating."
        return
    if duration > MAX_SECONDS:
        yield None, f"Free CPU mode is capped at {MAX_SECONDS}s to keep the Space responsive."
        return
    try:
        start = time.time()
        progress(0.05, desc="Parsing lyrics and style tags")
        yield None, "Planning arrangement from lyrics and style tags..."
        time.sleep(0.1)
        progress(0.28, desc="Composing drums, bass, chords, and lead")
        yield None, "Composing section-aware arrangement..."
        audio_path, stats = render_track(lyrics, tags, duration, creativity, diversity, cfg)
        elapsed = time.time() - start
        progress(1.0, desc="Track ready")
        yield audio_path, stats + f"\nRender time: {elapsed:.1f}s"
    except Exception as exc:
        # Top-level UI boundary: surface the failure instead of crashing.
        yield None, f"Generation failed: {exc}"


# NOTE(review): create_app continues beyond this chunk of the file; only its
# visible start is reproduced here, unchanged.
def create_app() -> gr.Blocks:
    with gr.Blocks(
        css=CUSTOM_CSS,
        title="BeatForge by Bilal Ansari",
        theme=gr.themes.Soft(
            primary_hue=gr.themes.colors.red,
            secondary_hue=gr.themes.colors.blue,
            neutral_hue=gr.themes.colors.slate,
        ),
    ) as app:
        gr.HTML(
            """
Lyrics-to-music studio for Hugging Face free tier. Built by Bilal Ansari with a CPU-native composer and a clean upgrade path to HeartMuLa 3B.