audio-mastering-suite / analysis.py
AnimalMonk's picture
6-band EQ for Super AI + bass protection prompts
bd0be42 verified
# Build: 2026-03-21T15:25:58.145163+00:00
"""AI-powered audio analysis using Gemini Pro β€” feature extraction and recommendations."""
import json
import os
import numpy as np
from scipy.signal import welch
# ---------------------------------------------------------------------------
# Audio feature extraction
# ---------------------------------------------------------------------------
# Coarse six-band split consumed by extract_features().
# Each entry is (label, low_hz, high_hz); the consumer treats a band as the
# half-open range low_hz <= f < high_hz.
_BANDS = [
    (label, low_hz, high_hz)
    for label, (low_hz, high_hz) in {
        "Sub-bass": (20, 60),
        "Bass": (60, 250),
        "Low-Mids": (250, 500),
        "Mids": (500, 2000),
        "Upper-Mids": (2000, 6000),
        "Highs": (6000, 20000),
    }.items()
]
def extract_features(audio, sample_rate):
    """Extract audio features for AI analysis (basic — used by Auto Master).

    Args:
        audio: numpy array, shape (samples,) or (samples, channels).
        sample_rate: int, sampling rate in Hz.

    Returns:
        dict of JSON-friendly values:
            spectral_centroid_hz / spectral_rolloff_hz: from a Welch PSD of
                the mono mix (0.0 when the PSD carries no energy).
            band_energy: {label: dB} for every entry of _BANDS; -100.0 when a
                band has no bins or no energy.
            rms_db / peak_db / crest_factor_db / dynamic_range_db: sample-domain
                levels of the mono mix (dynamic_range_db equals the crest factor).
            stereo_correlation: correlation of channels 0 and 1, None for mono
                input, 0.0 when it is undefined (a channel is constant).
            lufs / true_peak_dbtp: values from loudness.measure_loudness and
                loudness.measure_true_peak; non-finite results become -100.0.
            is_mono: True for 1-D input or a single-column 2-D array.

    Raises:
        ValueError: if *audio* contains no samples.
    """
    is_mono = audio.ndim == 1 or audio.shape[1] == 1

    # Spectral and level measurements all work on the mono mix.
    mono = audio.mean(axis=1) if audio.ndim == 2 else audio
    if mono.size == 0:
        # Fail early with a clear message instead of an obscure reduction error.
        raise ValueError("audio contains no samples")

    # --- Spectral analysis via Welch ---
    nperseg = min(8192, len(mono))
    freqs, psd = welch(mono, fs=sample_rate, nperseg=nperseg)

    total_energy = np.sum(psd)
    if total_energy > 0:
        # Centroid: energy-weighted mean frequency.
        spectral_centroid = float(np.sum(freqs * psd) / total_energy)
        # Rolloff: first bin at which the cumulative energy reaches 85 %.
        rolloff_idx = np.searchsorted(np.cumsum(psd), 0.85 * total_energy)
        spectral_rolloff = float(freqs[min(rolloff_idx, len(freqs) - 1)])
    else:
        spectral_centroid = 0.0
        spectral_rolloff = 0.0

    # --- Band energy distribution (dB) ---
    band_energy = {}
    for name, lo, hi in _BANDS:
        mask = (freqs >= lo) & (freqs < hi)
        # float() keeps numpy float32 scalars out of the result.
        band_rms = float(np.sqrt(np.mean(psd[mask]))) if np.any(mask) else 0.0
        if band_rms > 0:
            band_energy[name] = round(20.0 * np.log10(band_rms), 1)
        else:
            band_energy[name] = -100.0

    # --- Dynamics (cast to Python float for JSON serialization) ---
    rms = float(np.sqrt(np.mean(mono ** 2)))
    peak = float(np.max(np.abs(mono)))
    rms_db = round(20.0 * np.log10(rms), 1) if rms > 0 else -100.0
    peak_db = round(20.0 * np.log10(peak), 1) if peak > 0 else -100.0
    crest_factor = round(peak_db - rms_db, 1)
    dynamic_range = crest_factor  # simplified: same as crest factor for full-file

    # --- Stereo correlation ---
    if is_mono:
        stereo_correlation = None
    else:
        # np.corrcoef yields NaN (plus a RuntimeWarning) when either channel is
        # constant, e.g. digital silence. NaN is not valid JSON, and None is
        # reserved for mono input, so report 0.0 (no measurable correlation).
        with np.errstate(divide="ignore", invalid="ignore"):
            correlation = float(np.corrcoef(audio[:, 0], audio[:, 1])[0, 1])
        stereo_correlation = round(correlation, 3) if np.isfinite(correlation) else 0.0

    # --- Loudness (reuse existing functions, lazy import) ---
    from loudness import measure_loudness, measure_true_peak

    lufs = float(measure_loudness(audio, sample_rate))
    true_peak = float(measure_true_peak(audio, sample_rate))

    return {
        "spectral_centroid_hz": round(float(spectral_centroid), 1),
        "spectral_rolloff_hz": round(float(spectral_rolloff), 1),
        "band_energy": band_energy,
        "rms_db": float(rms_db),
        "peak_db": float(peak_db),
        "crest_factor_db": float(crest_factor),
        "dynamic_range_db": float(dynamic_range),
        "stereo_correlation": stereo_correlation,
        # -100.0 is this module's floor for "no signal"; it also keeps inf/NaN
        # out of the JSON-bound result.
        "lufs": round(lufs, 1) if np.isfinite(lufs) else -100.0,
        "true_peak_dbtp": true_peak if np.isfinite(true_peak) else -100.0,
        "is_mono": is_mono,
    }
# ---------------------------------------------------------------------------
# Detailed feature extraction — Super AI mode
# ---------------------------------------------------------------------------
# 21 contiguous analysis bands (20 Hz to 20 kHz) for the fine-grained spectral
# view, stored as (label, low_hz, high_hz). Every band starts where the previous
# one ends, so the table is derived from a single list of edges. The feature key
# filled from this table is still named "spectral_profile_24band".
_DETAIL_EDGES_HZ = (
    20, 40, 60, 100, 150, 200, 300, 400, 600, 800, 1000,
    1500, 2000, 3000, 4000, 5000, 6000, 8000, 10000, 12000, 16000, 20000,
)


def _hz_label(hz):
    """Format a band edge for a label: 800 -> "800", 1500 -> "1.5k"."""
    return str(hz) if hz < 1000 else f"{hz / 1000:g}k"


_DETAIL_BANDS = [
    (f"{_hz_label(low_hz)}-{_hz_label(high_hz)}", low_hz, high_hz)
    for low_hz, high_hz in zip(_DETAIL_EDGES_HZ, _DETAIL_EDGES_HZ[1:])
]
# Three analysis bands mirroring the compressor's default crossover split
# (200 Hz and 4 kHz), stored as (label, low_hz, high_hz).
_COMP_BANDS = [
    (label, low_hz, high_hz)
    for label, (low_hz, high_hz) in {
        "low": (20, 200),
        "mid": (200, 4000),
        "high": (4000, 20000),
    }.items()
]
def extract_features_detailed(audio, sample_rate):
    """Extract rich spectral + dynamic features for Super AI mode.

    Builds on extract_features() and adds these keys to its result:
        spectral_profile_24band: dB level per entry of _DETAIL_BANDS (the table
            currently holds 21 bands; the key name is kept for compatibility).
        resonances: up to 8 most prominent PSD peaks at or above 30 Hz, each
            with freq_hz, level_db and prominence_db.
        spectral_flatness: geometric / arithmetic mean of the positive PSD bins
            (1.0 = white noise, towards 0.0 = pure tone).
        spectral_tilt_db_per_decade: slope of a straight-line fit of PSD (dB)
            against log10(frequency); negative = bass-heavy, positive = bright.
        comp_band_dynamics: rms_db / peak_db / crest_db per entry of _COMP_BANDS,
            derived from the PSD bins of each band.
        dynamic_variation: statistics of the RMS level of consecutive ~4 s chunks.
        stereo_band_correlation: per-band channel correlation, or None for mono
            input; a single band is None when it cannot be measured.

    Args:
        audio: numpy array, shape (samples,) or (samples, channels).
        sample_rate: int, sampling rate in Hz.

    Returns:
        The dict from extract_features() extended with the keys above.
    """
    from scipy.signal import find_peaks

    base = extract_features(audio, sample_rate)
    mono = audio.mean(axis=1) if audio.ndim == 2 else audio

    # --- High-resolution spectral analysis ---
    nperseg = min(16384, len(mono))
    freqs, psd = welch(mono, fs=sample_rate, nperseg=nperseg)

    # --- Fine-grained spectral profile (dB) ---
    spectral_profile = {}
    for name, lo, hi in _DETAIL_BANDS:
        mask = (freqs >= lo) & (freqs < hi)
        if np.any(mask):
            band_rms = float(np.sqrt(np.mean(psd[mask])))
            spectral_profile[name] = round(20.0 * np.log10(max(band_rms, 1e-12)), 1)
        else:
            spectral_profile[name] = -100.0

    # --- Spectral peaks / resonances ---
    # Prominence is measured on the raw dB spectrum. (A smoothed copy used to be
    # computed here but was never consulted, so it has been dropped.)
    psd_db = 10.0 * np.log10(np.maximum(psd, 1e-20))
    resonances = []
    if len(freqs) > 1:  # need at least two bins to know the bin spacing
        bin_hz = freqs[1] - freqs[0]
        peak_indices, peak_props = find_peaks(
            psd_db,
            prominence=3.0,  # at least 3 dB above neighbors
            distance=max(1, int(50 / bin_hz)),  # at least 50 Hz apart
        )
        prominences = peak_props["prominences"]
        # Walk peaks from most to least prominent and skip sub-bass noise
        # *before* counting, so peaks below 30 Hz no longer use up top-8 slots.
        for idx in np.argsort(prominences)[::-1]:
            pi = peak_indices[idx]
            if freqs[pi] < 30:
                continue
            resonances.append({
                "freq_hz": round(float(freqs[pi]), 1),
                "level_db": round(float(psd_db[pi]), 1),
                "prominence_db": round(float(prominences[idx]), 1),
            })
            if len(resonances) == 8:
                break

    # --- Spectral flatness (Wiener entropy) ---
    # 1.0 = white noise, 0.0 = pure tone
    psd_pos = psd[psd > 0]
    if len(psd_pos) > 0:
        geo_mean = np.exp(np.mean(np.log(psd_pos)))
        arith_mean = np.mean(psd_pos)
        spectral_flatness = round(float(geo_mean / arith_mean), 4)
    else:
        spectral_flatness = 0.0

    # --- Spectral tilt (slope of energy across frequency) ---
    # Negative = bass-heavy, positive = bright
    if len(freqs) > 1 and np.any(psd > 0):
        log_freqs = np.log10(np.maximum(freqs[1:], 1.0))  # skip DC
        log_psd = 10.0 * np.log10(np.maximum(psd[1:], 1e-20))
        coeffs = np.polyfit(log_freqs, log_psd, 1)
        spectral_tilt = round(float(coeffs[0]), 2)  # dB/decade
    else:
        spectral_tilt = 0.0

    # --- Per-compression-band dynamics ---
    comp_band_dynamics = {}
    for name, lo, hi in _COMP_BANDS:
        mask = (freqs >= lo) & (freqs < hi)
        if np.any(mask):
            band_psd = psd[mask]
            band_rms = float(np.sqrt(np.mean(band_psd)))
            band_peak = float(np.sqrt(np.max(band_psd)))
            rms_db = round(20.0 * np.log10(max(band_rms, 1e-12)), 1)
            peak_db = round(20.0 * np.log10(max(band_peak, 1e-12)), 1)
            comp_band_dynamics[name] = {
                "rms_db": rms_db,
                "peak_db": peak_db,
                "crest_db": round(peak_db - rms_db, 1),
            }
        else:
            comp_band_dynamics[name] = {"rms_db": -100.0, "peak_db": -100.0, "crest_db": 0.0}

    # --- Short-time dynamic variation ---
    # Whole ~4-second chunks only; a shorter clip counts as one chunk and any
    # trailing partial chunk of a longer clip is ignored.
    chunk_samples = int(4.0 * sample_rate)
    n_chunks = max(1, len(mono) // chunk_samples)
    chunk_levels = []
    for i in range(n_chunks):
        chunk = mono[i * chunk_samples:(i + 1) * chunk_samples]
        c_rms = float(np.sqrt(np.mean(chunk ** 2)))
        chunk_levels.append(20.0 * np.log10(c_rms) if c_rms > 0 else -100.0)
    chunk_arr = np.array(chunk_levels)
    # One code path for any chunk count: a single chunk gives range 0.0 and
    # std 0.0, and its min/max are rounded like the multi-chunk case.
    dynamic_variation = {
        "min_rms_db": round(float(np.min(chunk_arr)), 1),
        "max_rms_db": round(float(np.max(chunk_arr)), 1),
        "range_db": round(float(np.max(chunk_arr) - np.min(chunk_arr)), 1),
        "std_db": round(float(np.std(chunk_arr)), 2),
        "n_chunks": n_chunks,
    }

    # --- Per-band stereo correlation ---
    is_mono = audio.ndim == 1 or audio.shape[1] == 1
    stereo_band_corr = None
    if not is_mono:
        from scipy.signal import butter, sosfilt

        left = audio[:, 0].astype(np.float64)
        right = audio[:, 1].astype(np.float64)
        # Keep the top edge below Nyquist for low sample rates.
        top_hz = min(20000, sample_rate // 2 - 1)
        corr_bands = (
            ("low", 20, 200),
            ("low_mid", 200, 2000),
            ("high_mid", 2000, 8000),
            ("high", 8000, top_hz),
        )
        stereo_band_corr = {}
        for bname, lo, hi in corr_bands:
            try:
                sos = butter(4, [lo, hi], btype="band", fs=sample_rate, output="sos")
                l_filt = sosfilt(sos, left)
                r_filt = sosfilt(sos, right)
                with np.errstate(divide="ignore", invalid="ignore"):
                    corr = float(np.corrcoef(l_filt, r_filt)[0, 1])
            except Exception:
                # Best effort: an optional metric (e.g. a band that does not fit
                # below Nyquist) must not abort the whole analysis.
                stereo_band_corr[bname] = None
                continue
            # NaN (a constant/silent band) is reported as "not measurable".
            stereo_band_corr[bname] = round(corr, 3) if np.isfinite(corr) else None

    # --- Merge into base features ---
    base["spectral_profile_24band"] = spectral_profile
    base["resonances"] = resonances
    base["spectral_flatness"] = spectral_flatness
    base["spectral_tilt_db_per_decade"] = spectral_tilt
    base["comp_band_dynamics"] = comp_band_dynamics
    base["dynamic_variation"] = dynamic_variation
    base["stereo_band_correlation"] = stereo_band_corr
    return base
# ---------------------------------------------------------------------------
# Gemini API wrapper
# ---------------------------------------------------------------------------
def _call_gemini(system_prompt, user_prompt):
    """Call Gemini via the REST API (no SDK needed).

    Args:
        system_prompt: text sent as the system instruction.
        user_prompt: text sent as the single user turn.

    Returns:
        None when the GOOGLE_API_KEY environment variable is unset or empty.
        The text of the first candidate's first part on success.
        A markdown string "*AI analysis unavailable: <error>*" when the request
        fails or the response does not have the expected shape.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        return None

    # Imported only once a key is present, so the "AI disabled" path does not
    # need the requests package at all.
    import requests as _requests

    url = (
        "https://generativelanguage.googleapis.com/v1beta/models/"
        "gemini-2.5-pro:generateContent"
    )
    # Send the key as a header rather than a "?key=" query parameter: request
    # errors include the URL in their message, and that message is returned to
    # the caller below, so a key in the URL would leak into user-visible text.
    headers = {"x-goog-api-key": api_key}
    payload = {
        "system_instruction": {"parts": [{"text": system_prompt}]},
        "contents": [{"role": "user", "parts": [{"text": user_prompt}]}],
    }
    try:
        resp = _requests.post(url, headers=headers, json=payload, timeout=60)
        resp.raise_for_status()
        data = resp.json()
        return data["candidates"][0]["content"]["parts"][0]["text"]
    except Exception as e:
        # Deliberate best effort: any transport, HTTP or response-shape problem
        # is reported as text instead of propagating.
        return f"*AI analysis unavailable: {e}*"
# ---------------------------------------------------------------------------
# Phase 1: AI-recommended settings
# ---------------------------------------------------------------------------
_SIGNAL_FLOW = """
SIGNAL FLOW (fixed processing order):
1. PRE-GAIN DROP β€” Input is normalized to -18 LUFS internal working level (stepped attenuator). This prevents EQ clipping on hot masters.
2. HPF 15 Hz β€” Always-on 2nd-order Butterworth high-pass filter (12 dB/oct). Subsonic cleanup only; -3 dB at 15 Hz, negligible loss above 35 Hz. There is NO low-pass filter β€” source is band-limited by sample rate (Nyquist).
3. 4-BAND PARAMETRIC EQ (user-adjustable):
- Bass Boost: Peak filter (Q=2.0), variable center 40-100 Hz, range 0 to +3.0 dB, step 0.5 dB
- Lows: Low shelf at 200 Hz (Q=1.0), range -3.0 to +3.0 dB, step 0.5 dB
- Highs: High shelf at 10 kHz (Q=0.7, gentle slope), range -3.0 to +3.0 dB, step 0.5 dB β€” this is the "air" band, no LPF to fight it
- Mids: Peak filter at 1.2 kHz (Q=1.0, wide bell), range -3.0 to +3.0 dB, step 0.1 dB
4. MULTIBAND COMPRESSION β€” 3-band dynamics processing (no makeup gain):
- Placed before stereo width so the compressor sees EQ'd audio without M/S side-channel energy affecting per-band behaviour.
- Two Linkwitz-Riley 4th-order crossovers split the signal at 200 Hz and 4 kHz.
- LOGARITHMIC SLIDER CURVE: The 0-100 slider uses a quadratic (tΒ²) mapping β€” the bottom half of the slider (0-50) covers the transparent-to-light range, while aggressive compression is concentrated in the top 30% (70-100). This gives fine control in the musical "sweet spot."
- LOW band (< 200 Hz): Firmest control. Attack scales 80β†’20ms (lets kick breathe at low settings, catches bass transients at high). Ratio 1.2:1β†’3.0:1, threshold -16β†’-24 dB, release 200β†’120ms.
- MID band (200 Hz – 4 kHz): Musical peak control. Attack scales 30β†’10ms (transparent at low settings, tames snare/vocal peaks at high). Ratio 1.1:1β†’2.5:1, threshold -14β†’-24 dB, release 150β†’100ms.
- HIGH band (> 4 kHz): Transient control. Attack scales 10β†’3ms (light de-essing at low, catches cymbal/click transients at high). Ratio 1.05:1β†’2.0:1, threshold -12β†’-22 dB, release 80β†’40ms.
- TRUE BYPASS when slider = 0 (no compressor in chain at all).
- Higher compression values actively reduce the crest factor (peak-to-loudness ratio), which allows LUFS normalization to push louder without the true peak ceiling pulling level back down.
- Bands are summed back to full-range after compression. No makeup gain β€” LUFS normalization handles level.
5. STEREO WIDTH β€” Frequency-selective M/S matrix (after dynamics for stable imaging):
- Linkwitz-Riley 4th-order crossover at 200 Hz splits signal into low band and high band
- Low band (< 200 Hz): untouched β€” keeps bass mono-safe
- High band (β‰₯ 200 Hz): M/S encode β†’ width scaling β†’ M/S decode
- Energy-preserving: mid_scale = sqrt(2/(1+wΒ²)), side_scale = w Γ— mid_scale
- Range: 80% (narrow) to 150% (wide). 100% = no change. Clip protection after summing.
6. LUFS NORMALIZATION β€” Measures integrated loudness (ITU-R BS.1770-4, K-weighted, gated) and applies uniform linear gain to hit the target LUFS exactly. Targets: -14 (streaming), -11 (CD), or custom.
7. SOFT CLIPPER β€” Piecewise tanh saturation after LUFS normalization. A knee sits 2 dB below the -0.1 dBTP ceiling. Everything below the knee is perfectly linear (zero processing). Only the tips of peaks above the knee are shaped with a tanh curve that asymptotes to the ceiling. This is NOT a limiter β€” it's analog-style waveshaping that typically affects only the top 1-2 dB of the loudest transients. LUFS is preserved (transient tips contribute almost nothing to integrated loudness) while true-peak is reduced significantly.
8. TRUE PEAK CEILING (safety net) β€” After the soft clipper, the true peak (4x oversampled, ITU-R BS.1770) is measured. If any residual inter-sample peaks still exceed -0.1 dBTP, the entire signal is scaled down by exactly the overshoot. This rarely engages thanks to the soft clipper, but guarantees compliance.
"""
_RECOMMEND_SYSTEM = f"""You are an expert audio mastering engineer. Analyze the audio measurements below and recommend optimal mastering settings for this tool.
{_SIGNAL_FLOW}
AVAILABLE CONTROLS (these are the ONLY parameters you can recommend):
- lows_db: Low shelf at 200 Hz, -3.0 to +3.0 dB
- mid_boost_db: Peak at 1.2 kHz (Q=1.0), -3.0 to +3.0 dB
- highs_db: High shelf at 10 kHz (Q=0.7), -3.0 to +3.0 dB
- bass_boost_db: Peak (Q=2.0), 0 to +3.0 dB
- bass_freq_hz: Center freq for bass boost, 40-100 Hz
- compression: 0 (bypass/off) to 100 (heavy). 0 = true bypass (no processing)
- stereo_width: 80-150%. 100 = no change. Only affects frequencies above 200 Hz.
IMPORTANT CONTEXT:
- The 15 Hz HPF is always active and cannot be adjusted β€” do not try to compensate for it.
- There is no LPF, so the 10 kHz high shelf has full authority over the air band with no interference.
- LUFS normalization at the end restores loudness automatically β€” do not worry about overall level, focus on spectral balance and dynamics.
- TRUE PEAK CEILING: A -0.1 dBTP ceiling is enforced after LUFS normalization. If the audio has a high crest factor (large peaks relative to loudness), the ceiling will pull the final level below the target LUFS. To allow the track to hit the target LUFS, recommend enough compression to reduce the crest factor. Look at the crest_factor_db measurement β€” values above ~10 dB suggest compression in the 40-70 range; above ~14 dB may need 60-85.
- The compression slider uses a LOGARITHMIC (quadratic) curve. Slider values 0-50 cover subtle/transparent compression. Values 50-75 are moderate. Values 75-100 are aggressive. Recommend accordingly β€” a slider value of 30 is very light, 50 is moderate, 70+ is firm.
- If the audio already sounds well-balanced, recommend conservative or zero settings. Not everything needs processing.
Return ONLY a valid JSON object with these exact keys. The "reasoning" field must contain your actual markdown explanation (3-5 bullet points explaining why you chose these values):
{{
"lows_db": number,
"mid_boost_db": number,
"highs_db": number,
"bass_boost_db": number,
"bass_freq_hz": integer,
"compression": integer,
"stereo_width": integer,
"reasoning": "### AI Analysis\\n- **Lows:** reason for lows_db choice\\n- **Highs:** reason for highs_db choice\\n- ... (write your actual analysis here, do NOT return this template literally)"
}}
Keep values within the valid ranges. Be conservative β€” subtle moves are better than aggressive ones."""
def _clamp_settings(d):
    """Clamp AI-returned slider values to valid ranges in-place and return *d*.

    The seven slider keys are always (re)written; other keys are left alone.
    A missing, None, non-numeric or NaN value falls back to the neutral default
    for that slider instead of raising, and infinite values clamp to the
    nearest bound. Numeric strings such as "1.5" are accepted.

    Args:
        d: dict parsed from the model's JSON reply.

    Returns:
        The same dict. lows_db, mid_boost_db, highs_db and bass_boost_db are
        floats; bass_freq_hz, compression and stereo_width are ints
        (fractions are truncated).
    """
    def _bounded(key, default, lo, hi):
        # Coerce to float first so strings like "62.0" and infinities are
        # handled before any int() conversion.
        try:
            value = float(d.get(key, default))
        except (TypeError, ValueError):
            value = float(default)
        if value != value:  # NaN compares unequal to itself
            value = float(default)
        return max(lo, min(hi, value))

    d["lows_db"] = _bounded("lows_db", 0.0, -3.0, 3.0)
    d["mid_boost_db"] = _bounded("mid_boost_db", 0.0, -3.0, 3.0)
    d["highs_db"] = _bounded("highs_db", 0.0, -3.0, 3.0)
    # Float lower bound so the result is 0.0, never the int 0.
    d["bass_boost_db"] = _bounded("bass_boost_db", 0.0, 0.0, 3.0)
    d["bass_freq_hz"] = int(_bounded("bass_freq_hz", 60, 40, 100))
    d["compression"] = int(_bounded("compression", 50, 0, 100))
    d["stereo_width"] = int(_bounded("stereo_width", 100, 80, 150))
    return d
def _strip_json(text):
    """Parse JSON from a model reply, tolerating markdown code fences.

    Handles, in order:
      1. a reply that starts with a ``` fence (the closing fence is removed
         only when it is actually present);
      2. as a fallback, the span from the first "{" to the last "}" of the
         reply, which covers prose placed before or after the JSON object.

    Args:
        text: raw reply string.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: if no parseable JSON is found.
    """
    cleaned = text.strip()
    candidate = cleaned
    if cleaned.startswith("```"):
        body = cleaned.split("\n")[1:]  # drop the opening ``` / ```json line
        if body and body[-1].strip().startswith("```"):
            body = body[:-1]  # drop the closing fence, if there is one
        candidate = "\n".join(body)
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(cleaned[start:end + 1])
def recommend_settings(audio_path):
    """Analyze raw audio and return AI-recommended mastering settings.

    Args:
        audio_path: path to the uploaded audio file.

    Returns:
        None when _call_gemini() returns None (no API key configured).
        On success, a dict with the seven clamped slider values plus a
        "reasoning" markdown string.
        Otherwise {"reasoning": <raw reply text>, "parse_error": True}, which
        also carries the "*AI analysis unavailable ...*" message produced by
        _call_gemini() when the request itself failed.
    """
    from dsp import load_audio

    audio, sr = load_audio(audio_path)
    features = extract_features(audio, sr)

    user_prompt = f"""Analyze this audio and recommend mastering settings:
**Audio Measurements:**
- Integrated Loudness: {features['lufs']} LUFS
- True Peak: {features['true_peak_dbtp']} dBTP
- RMS Level: {features['rms_db']} dB
- Crest Factor: {features['crest_factor_db']} dB
- Spectral Centroid: {features['spectral_centroid_hz']} Hz
- Spectral Rolloff (85%): {features['spectral_rolloff_hz']} Hz
- Stereo Correlation: {features['stereo_correlation'] if features['stereo_correlation'] is not None else 'N/A (mono)'}
- Mono: {features['is_mono']}
**Band Energy (dB):**
{chr(10).join(f'- {k}: {v} dB' for k, v in features['band_energy'].items())}
Return the JSON object with recommended settings."""

    response = _call_gemini(_RECOMMEND_SYSTEM, user_prompt)
    if response is None:
        return None

    # Parse JSON from response (Gemini may wrap it in markdown code fence).
    try:
        result = _strip_json(response)
        if not isinstance(result, dict):
            # Valid JSON but not an object (e.g. a list or bare string).
            raise TypeError("expected a JSON object")
        _clamp_settings(result)
    except (ValueError, KeyError, TypeError, AttributeError, OverflowError):
        # ValueError covers json.JSONDecodeError as well as non-numeric slider
        # values; OverflowError covers infinite values passed to int().
        return {"reasoning": response, "parse_error": True}

    result.setdefault("reasoning", "*No explanation provided.*")
    return result
# ---------------------------------------------------------------------------
# Phase 2: Post-master comparison report
# ---------------------------------------------------------------------------
_COMPARE_SYSTEM = f"""You are an expert audio mastering engineer reviewing a completed master. You are evaluating the output of a specific mastering tool with the following architecture:
{_SIGNAL_FLOW}
IMPORTANT β€” When assessing the master:
- The 15 Hz HPF is always active. Any sub-bass roll-off below ~30 Hz is intentional subsonic cleanup, NOT a problem. Do not flag it.
- There is no LPF. The full spectrum above the HPF is passed through, so the 10 kHz high shelf has full authority over the air band.
- LUFS normalization is the final stage β€” it applies uniform linear gain. Loudness differences between original and mastered are intentional (target LUFS). Focus on spectral shape and dynamics, not absolute level.
- Compression at slider=0 means TRUE BYPASS (compressor was not in the chain at all). Do not comment on compression characteristics if it was bypassed.
- Stereo width only affects frequencies above 200 Hz (Linkwitz-Riley crossover). Bass mono-compatibility is always preserved.
- When suggesting improvements, ONLY recommend changes to the 7 available controls (lows_db, mid_boost_db, highs_db, bass_boost_db, bass_freq_hz, compression 0-100, stereo_width 80-150%). Do not suggest changes the tool cannot make (e.g., adjusting per-band attack times, changing crossover frequencies, changing the HPF frequency). The multiband compression is automatic β€” the user only controls the single 0-100 slider.
- TRUE PEAK CEILING: If the mastered true peak is at -0.1 dBTP and the LUFS is below target, the peak ceiling pulled the level down. The fix is more compression (higher slider value) to reduce crest factor, NOT removing the ceiling. Mention this trade-off when relevant.
- The compression slider uses a LOGARITHMIC (quadratic) curve: 0-50 = subtle/transparent, 50-75 = moderate, 75-100 = aggressive. Factor this into your slider recommendations.
Format your response as markdown with these sections:
### Overall Assessment
(1-2 sentences β€” was the mastering effective for the material?)
### What Worked Well
(bullet points referencing specific measurement changes)
### Suggested Improvements
(bullet points with specific slider value recommendations using the 7 available controls. If the master is good, say so β€” not every master needs changes.)
### Technical Notes
(any concerns about dynamics, phase coherence, or frequency balance that the available controls could address)
Be concise and specific. Reference actual measurement deltas between original and mastered."""
def compare_master(original, mastered, sample_rate, settings_dict, history=None):
    """Produce an AI-written quality report comparing a master to its source.

    Features are extracted from both signals, the per-band compressor
    parameters implied by the compression slider are looked up, and everything
    is sent to Gemini together with any earlier analysis passes.

    Args:
        original: numpy array of original audio.
        mastered: numpy array of mastered audio.
        sample_rate: int.
        settings_dict: dict with the mastering settings that were applied.
        history: list of dicts from previous analyses (optional).

    Returns:
        str: the text returned by _call_gemini(), or a hint to set
        GOOGLE_API_KEY when it returns None.
    """
    from dsp import map_multiband_compression

    orig_features = extract_features(original, sample_rate)
    mast_features = extract_features(mastered, sample_rate)

    # Translate the single slider value into the per-band compressor parameters.
    comp_val = settings_dict.get("compression", 50)
    band_params = map_multiband_compression(comp_val)

    def _band_summary(params):
        # params is indexed as [threshold, ratio, attack, release].
        template = ("threshold {:.1f} dB, ratio {:.2f}:1, "
                    "attack {:.0f} ms, release {:.0f} ms")
        return template.format(params[0], params[1], params[2], params[3])

    def _measurement_lines(feats):
        # The five measurement bullets shared by both audio sections.
        corr = feats['stereo_correlation']
        corr_text = corr if corr is not None else 'N/A (mono)'
        return (
            f"- Loudness: {feats['lufs']} LUFS | True Peak: {feats['true_peak_dbtp']} dBTP\n"
            f"- RMS: {feats['rms_db']} dB | Crest Factor: {feats['crest_factor_db']} dB\n"
            f"- Spectral Centroid: {feats['spectral_centroid_hz']} Hz | Rolloff: {feats['spectral_rolloff_hz']} Hz\n"
            f"- Stereo Correlation: {corr_text}\n"
            f"- Band Energy: {json.dumps(feats['band_energy'])}"
        )

    history_text = _format_history(history or [])
    setting = settings_dict.get

    user_prompt = f"""Compare the original and mastered audio:
**ORIGINAL Audio:**
{_measurement_lines(orig_features)}
**MASTERED Audio:**
{_measurement_lines(mast_features)}
**Settings Applied:**
- Lows (200 Hz shelf): {setting('lows_db', 0)} dB
- Mids (1.2 kHz peak): {setting('mid_boost_db', 0)} dB
- Highs (10 kHz shelf): {setting('highs_db', 0)} dB
- Bass Boost: {setting('bass_boost_db', 0)} dB @ {setting('bass_freq_hz', 60)} Hz
- Compression: slider {comp_val}/100 (multiband, 3 bands)
- Low (< 200 Hz): {_band_summary(band_params['low'])}
- Mid (200 Hz-4 kHz): {_band_summary(band_params['mid'])}
- High (> 4 kHz): {_band_summary(band_params['high'])}
- Stereo Width: {setting('stereo_width', 100)}%
- Target LUFS: {setting('target_lufs', -14)}{history_text}"""

    report = _call_gemini(_COMPARE_SYSTEM, user_prompt)
    if report is not None:
        return report
    return "*Set GOOGLE_API_KEY to enable AI comparison report.*"
# ---------------------------------------------------------------------------
# Phase 3: Structured comparison (for Auto Master loop)
# ---------------------------------------------------------------------------
_COMPARE_STRUCTURED_SYSTEM = f"""You are an expert audio mastering engineer reviewing a completed master. You are evaluating the output of a specific mastering tool with the following architecture:
{_SIGNAL_FLOW}
IMPORTANT β€” When assessing the master:
- The 15 Hz HPF is always active. Any sub-bass roll-off below ~30 Hz is intentional subsonic cleanup, NOT a problem. Do not flag it.
- There is no LPF. The full spectrum above the HPF is passed through, so the 10 kHz high shelf has full authority over the air band.
- LUFS normalization is the final stage β€” it applies uniform linear gain. Loudness differences between original and mastered are intentional (target LUFS). Focus on spectral shape and dynamics, not absolute level.
- Compression at slider=0 means TRUE BYPASS (compressor was not in the chain at all). Do not comment on compression characteristics if it was bypassed.
- Stereo width only affects frequencies above 200 Hz (Linkwitz-Riley crossover). Bass mono-compatibility is always preserved.
- When suggesting improvements, ONLY recommend changes to the 7 available controls. Do not suggest changes the tool cannot make (e.g., adjusting per-band attack times, changing crossover frequencies, changing the HPF frequency). The multiband compression is automatic β€” the user only controls the single 0-100 slider.
- TRUE PEAK CEILING: If the mastered true peak is at -0.1 dBTP and the LUFS is below target, the peak ceiling pulled the level down. The fix is more compression (higher slider value) to reduce crest factor, NOT removing the ceiling. Adjust your revised compression value accordingly.
- The compression slider uses a LOGARITHMIC (quadratic) curve: 0-50 = subtle/transparent, 50-75 = moderate, 75-100 = aggressive. Factor this into your slider recommendations.
Return ONLY a valid JSON object (no markdown fences, no extra text) with these exact keys:
{{
"lows_db": <number, -3.0 to +3.0>,
"mid_boost_db": <number, -3.0 to +3.0>,
"highs_db": <number, -3.0 to +3.0>,
"bass_boost_db": <number, 0 to +3.0>,
"bass_freq_hz": <integer, 40 to 100>,
"compression": <integer, 0 to 100>,
"stereo_width": <integer, 80 to 150>,
"report": "<your full markdown comparison report here β€” Overall Assessment, What Worked Well, Suggested Improvements, Technical Notes>"
}}
The numeric values should be your REVISED recommended settings for a re-master based on what you hear in the measurements.
The "report" field should contain the full markdown analysis.
Be concise and specific. Reference actual measurement deltas.
Do NOT return the template above literally β€” fill in your actual analysis and values."""
def _format_history(history):
    """Format analysis history for inclusion in prompts.

    Args:
        history: list of dicts, oldest first. Keys read per entry: "settings"
            (dict; its "target_lufs" item is omitted from the output; a missing
            or None value is treated as empty), "lufs", "true_peak",
            "crest_factor" (shown as "?" when absent) and an optional "summary".

    Returns:
        "" for an empty or None history; otherwise a text block that starts
        with two newlines so it can be appended directly to a prompt.
    """
    if not history:
        return ""

    # The header uses a real em dash (it was previously a mis-encoded sequence).
    lines = ["\n\n**PREVIOUS ANALYSIS HISTORY** (oldest first — use this to avoid recommending settings that already failed or oscillating between values):"]
    for i, entry in enumerate(history, 1):
        # "or {}" also covers an explicit None, which used to raise AttributeError.
        settings = entry.get("settings") or {}
        tried = {k: v for k, v in settings.items() if k != "target_lufs"}
        lines.append(f"\n--- Pass {i} ---")
        lines.append(f"Settings tried: {json.dumps(tried)}")
        lines.append(f"Result: LUFS={entry.get('lufs', '?')}, True Peak={entry.get('true_peak', '?')} dBTP, Crest Factor={entry.get('crest_factor', '?')} dB")
        if entry.get("summary"):
            lines.append(f"AI assessment: {entry['summary']}")
    lines.append("\nIMPORTANT: Do NOT oscillate. If a previous pass moved a setting in one direction and it helped, continue refining in that direction. If it didn't help, try a DIFFERENT approach rather than reverting to a value that was already tried.")
    return "\n".join(lines)
def compare_master_structured(original, mastered, sample_rate, settings_dict,
                              history=None):
    """Compare original vs mastered and return structured values + report.

    Same analysis as compare_master() but returns a dict with revised slider
    values and a markdown report, for use in the Auto Master loop.

    Args:
        original: audio before mastering, passed to extract_features().
        mastered: audio after mastering, passed to extract_features().
        sample_rate: sample rate passed to extract_features() for both signals.
        settings_dict: dict of the slider settings that were applied; read
            with defaults for lows_db, mid_boost_db, highs_db, bass_boost_db,
            bass_freq_hz, compression, stereo_width and target_lufs.
        history: list of dicts from previous analyses (optional). Each entry
            has keys: settings, lufs, true_peak, crest_factor, summary.

    Returns:
        dict with keys: lows_db, mid_boost_db, highs_db, bass_boost_db,
        bass_freq_hz, compression, stereo_width, report.
        On parse error: {"report": raw_text, "parse_error": True}.
        On API failure: None.
    """
    orig_features = extract_features(original, sample_rate)
    mast_features = extract_features(mastered, sample_rate)

    from dsp import map_multiband_compression
    comp_val = settings_dict.get("compression", 50)
    band_params = map_multiband_compression(comp_val)

    def _fmt_band(params):
        # params is indexed as (threshold, ratio, attack, release).
        return (f"threshold {params[0]:.1f} dB, ratio {params[1]:.2f}:1, "
                f"attack {params[2]:.0f} ms, release {params[3]:.0f} ms")

    history_text = _format_history(history or [])
    user_prompt = f"""Compare the original and mastered audio and return your revised settings as JSON:
**ORIGINAL Audio:**
- Loudness: {orig_features['lufs']} LUFS | True Peak: {orig_features['true_peak_dbtp']} dBTP
- RMS: {orig_features['rms_db']} dB | Crest Factor: {orig_features['crest_factor_db']} dB
- Spectral Centroid: {orig_features['spectral_centroid_hz']} Hz | Rolloff: {orig_features['spectral_rolloff_hz']} Hz
- Stereo Correlation: {orig_features['stereo_correlation'] if orig_features['stereo_correlation'] is not None else 'N/A (mono)'}
- Band Energy: {json.dumps(orig_features['band_energy'])}
**MASTERED Audio:**
- Loudness: {mast_features['lufs']} LUFS | True Peak: {mast_features['true_peak_dbtp']} dBTP
- RMS: {mast_features['rms_db']} dB | Crest Factor: {mast_features['crest_factor_db']} dB
- Spectral Centroid: {mast_features['spectral_centroid_hz']} Hz | Rolloff: {mast_features['spectral_rolloff_hz']} Hz
- Stereo Correlation: {mast_features['stereo_correlation'] if mast_features['stereo_correlation'] is not None else 'N/A (mono)'}
- Band Energy: {json.dumps(mast_features['band_energy'])}
**Settings Applied:**
- Lows (200 Hz shelf): {settings_dict.get('lows_db', 0)} dB
- Mids (1.2 kHz peak): {settings_dict.get('mid_boost_db', 0)} dB
- Highs (10 kHz shelf): {settings_dict.get('highs_db', 0)} dB
- Bass Boost: {settings_dict.get('bass_boost_db', 0)} dB @ {settings_dict.get('bass_freq_hz', 60)} Hz
- Compression: slider {comp_val}/100 (multiband, 3 bands)
- Low (< 200 Hz): {_fmt_band(band_params['low'])}
- Mid (200 Hz-4 kHz): {_fmt_band(band_params['mid'])}
- High (> 4 kHz): {_fmt_band(band_params['high'])}
- Stereo Width: {settings_dict.get('stereo_width', 100)}%
- Target LUFS: {settings_dict.get('target_lufs', -14)}{history_text}
Return the JSON object with your revised settings and comparison report."""

    response = _call_gemini(_COMPARE_STRUCTURED_SYSTEM, user_prompt)
    if response is None:
        return None

    try:
        result = _strip_json(response)
        _clamp_settings(result)
        result.setdefault("report", "*No report provided.*")
    except (ValueError, KeyError, TypeError, AttributeError):
        # ValueError also covers json.JSONDecodeError (its subclass).
        # ValueError/AttributeError are included so a reply that is valid
        # JSON but has the wrong shape (a list, a non-numeric value) is
        # reported as a parse error instead of crashing the Auto Master loop.
        # NOTE(review): _clamp_settings is defined elsewhere — confirm it can
        # raise these on malformed values.
        return {"report": response, "parse_error": True}
    return result
# ---------------------------------------------------------------------------
# Super AI mode — full parametric control
# ---------------------------------------------------------------------------
# Shared prompt fragment interpolated into every Super AI system prompt below.
# It describes the measurements the model receives, the fixed processing
# order, the adjustable parameter ranges, and the hard constraints.
# NOTE(review): the text says "24-Band Spectral Profile" but then "across 21
# frequency bands" — confirm the real band count against
# extract_features_detailed() and make the wording agree.
_SUPER_SIGNAL_FLOW = """
AUDIO ANALYSIS DATA YOU RECEIVE:
You will receive detailed measurements for each audio file, including:
- Standard: LUFS, true peak, RMS, crest factor, spectral centroid, spectral rolloff, stereo correlation
- 24-Band Spectral Profile: Fine-grained energy (dB) across 21 frequency bands from 20 Hz to 20 kHz.
USE THIS to make precise EQ decisions — you can see exactly where energy buildups, dips, and imbalances are.
- Spectral Resonances: Top 8 most prominent spectral peaks with frequency, level, and prominence (dB above neighbors).
USE THIS to identify harsh or ringing frequencies that need surgical EQ cuts.
- Spectral Flatness: 0.0 = pure tonal, 1.0 = white noise. Tells you how tonal vs noisy the material is.
- Spectral Tilt: dB/decade slope. Negative = bass-heavy, positive = bright. Guides overall tonal balance decisions.
- Per-Compression-Band Dynamics: RMS, peak, and crest factor for each of the 3 compression bands (low/mid/high).
USE THIS to set compression thresholds and ratios per band — you can see which bands need taming.
- Dynamic Variation: Min/max/range/std of RMS across 4-second chunks of the track.
Tells you how much the track varies (quiet verse vs loud chorus). High range = preserve dynamics. Low range = already compressed.
- Per-Band Stereo Correlation: Correlation for low, low-mid, high-mid, and high frequency bands.
USE THIS to make stereo width decisions — low correlation = wide, high = narrow/mono.
SIGNAL FLOW (fixed processing order — you control ALL parameters):
1. PRE-GAIN DROP — Input is normalized to -18 LUFS (automatic, not adjustable).
2. HIGH-PASS FILTER — Adjustable cutoff (10-80 Hz). Default 15 Hz, 12 dB/oct Butterworth.
3. 6-BAND FULLY PARAMETRIC EQ — Each band is independently configurable:
- band1 through band6: type (peak/low_shelf/high_shelf), frequency (20-20000 Hz), gain (-6 to +6 dB), Q (0.1-10.0)
You can use any combination of shelf and peak filters at any frequency. The normal UI locks these to fixed frequencies and ±3 dB — you are NOT limited to that. You have full parametric EQ control.
Set gain_db to 0 on any band you don't need — unused bands are bypassed automatically.
4. MULTIBAND COMPRESSION — 3-band dynamics with per-band control:
- crossover_low: adjustable crossover frequency for low/mid split (default 200 Hz)
- crossover_high: adjustable crossover frequency for mid/high split (default 4000 Hz)
- Each band (low, mid, high) has independently adjustable: threshold (-20 to 0 dB), ratio (1.0-20.0), attack_ms (0.1-200 ms), release_ms (10-500 ms)
- Ratio 1.0 = bypass for that band
- No makeup gain — LUFS normalization restores level.
5. STEREO WIDTH — M/S matrix, frequency-selective (crossover at 200 Hz, bass stays mono). Range 80-150%.
6. LUFS NORMALIZATION — Automatic to target LUFS (fixed by user, DO NOT change).
7. SOFT CLIPPER — Piecewise tanh saturation, knee 2 dB below -0.1 dBTP ceiling. Always active. Linear below knee, tanh above. This is a safety net — NOT a creative tool.
8. TRUE PEAK CEILING — Safety net at -0.1 dBTP. Scales signal down if residual peaks exceed ceiling.
CONSTRAINTS (DO NOT VIOLATE):
- Target LUFS is fixed by the user. Do not change it.
- The soft clipper and true peak ceiling must remain as-is (automatic safety nets).
- You cannot add new processing stages — only adjust the parameters described above.
TRUE PEAK GUIDANCE (IMPORTANT):
- True peak between -1.0 and -0.1 dBTP is the IDEAL goal, but it is NOT always achievable.
- Source material that is already heavily limited or compressed (e.g., AI-generated tracks from Suno, Udio, etc.) has a very low crest factor (peak-to-loudness ratio). When such material is normalized DOWN to a streaming LUFS target (e.g., -14 LUFS), the true peak will naturally drop well below -1.0 dBTP. This is correct and expected behavior.
- DO NOT over-compress or crush dynamics to try to force the true peak higher. Dynamics preservation is MORE important than hitting a specific true peak number.
- If the source material has a low crest factor, acknowledge this in your analysis and accept the true peak wherever it naturally lands after LUFS normalization.
- Only use compression for tonal shaping and dynamic control — NEVER to artificially raise the true peak.
"""
# System prompt for the first Super AI pass (super_ai_recommend): asks the
# model for a complete parameter set plus a "reasoning" markdown string.
# Literal JSON braces are doubled because this is an f-string.
# NOTE(review): the philosophy section cites "-35 dB or lower" as an example
# threshold, while the signal-flow text limits thresholds to -20..0 dB —
# confirm the example is intentional.
_SUPER_RECOMMEND_SYSTEM = f"""You are a world-class audio mastering engineer with decades of experience. You have FULL control over every parameter in the mastering chain. Analyze the audio measurements and recommend optimal settings.
{_SUPER_SIGNAL_FLOW}
MASTERING PHILOSOPHY:
- LESS IS MORE. A great master sounds like a better version of the original, not a different song.
- Most EQ moves should be ±1 to ±2 dB. Moves beyond ±3 dB are RARE and require strong justification.
- If the source audio already sounds good in a frequency range, LEAVE IT ALONE. Do not EQ for the sake of EQ.
- Use surgical EQ moves — small cuts are often more effective than boosts.
- Compression thresholds should be set so compression only engages on peaks, NOT constantly. A threshold of -35 dB or lower means the compressor is always compressing — that destroys dynamics. Typical mastering thresholds are -15 to -8 dB.
- Match compression to the genre and dynamic character of the material.
- Preserve the artist's intent — enhance, don't transform.
- True peak between -1.0 and -0.1 dBTP is ideal, but do NOT sacrifice dynamics to achieve it. If the source is already heavily compressed, the true peak may land below -1.0 dBTP at the target LUFS — that is acceptable.
TONAL DIRECTION (apply to all masters):
- Aim for a slightly WARM overall tone — a SUBTLE richness in the low-mids (200-500 Hz) and smooth, non-harsh highs. This means maybe +0.5 to +1.5 dB shelf, NOT +3 dB or more.
- High shelf boosts above +1.5 dB will make the master sound harsh and brittle — avoid this.
HEAVY, UNCONSTRAINED BASS — Protect low-end punch and transient impact (40-100 Hz) at all costs:
- Rule 1: Prioritize additive EQ (Band 1/Band 2) for bass weight. DO NOT cut any frequencies below 150 Hz with EQ. The HPF already handles rumble removal. Any EQ band targeting frequencies below 150 Hz should have POSITIVE gain (boost) or be bypassed (0 dB). Cutting sub-bass removes the punch and weight from the track.
- Rule 2: DO NOT over-compress the < 200 Hz band. If the source crest factor is already low, default the Low-Band Compressor to BYPASS (Ratio 1:1) or use a very slow attack (>60 ms) so the kick drum transient escapes untouched.
- Rule 3: The sub-bass should feel physical, anchored, and wide open.
- Rule 4: HPF cutoff must stay at or below 25 Hz for bass-heavy material. Only raise it above 30 Hz if measurements show significant rumble below 20 Hz.
Return ONLY a valid JSON object with this exact structure (no markdown fences):
{{
"hpf_freq": <float, 10-80>,
"eq": {{
"band1": {{"type": "<peak|low_shelf|high_shelf>", "freq": <float Hz>, "gain_db": <float>, "q": <float>}},
"band2": {{"type": "<peak|low_shelf|high_shelf>", "freq": <float Hz>, "gain_db": <float>, "q": <float>}},
"band3": {{"type": "<peak|low_shelf|high_shelf>", "freq": <float Hz>, "gain_db": <float>, "q": <float>}},
"band4": {{"type": "<peak|low_shelf|high_shelf>", "freq": <float Hz>, "gain_db": <float>, "q": <float>}},
"band5": {{"type": "<peak|low_shelf|high_shelf>", "freq": <float Hz>, "gain_db": <float>, "q": <float>}},
"band6": {{"type": "<peak|low_shelf|high_shelf>", "freq": <float Hz>, "gain_db": <float>, "q": <float>}}
}},
"compression": {{
"low": {{"threshold": <float dB>, "ratio": <float>, "attack_ms": <float>, "release_ms": <float>}},
"mid": {{"threshold": <float dB>, "ratio": <float>, "attack_ms": <float>, "release_ms": <float>}},
"high": {{"threshold": <float dB>, "ratio": <float>, "attack_ms": <float>, "release_ms": <float>}}
}},
"crossover_low": <float Hz>,
"crossover_high": <float Hz>,
"stereo_width": <int, 80-150>,
"reasoning": "### AI Analysis\\n- **EQ:** reason for EQ choices\\n- **Dynamics:** reason for compression settings\\n- **Stereo:** reason for width choice\\n(write your ACTUAL analysis — do NOT return this template literally)"
}}
Be musical and intentional. Every parameter should have a reason."""
# System prompt for Super AI revision passes (super_ai_compare): asks the
# model for a revised full parameter set plus a markdown "report".
# Literal JSON braces are doubled because this is an f-string.
_SUPER_COMPARE_SYSTEM = f"""You are a world-class audio mastering engineer reviewing a completed master. You have FULL control over every parameter and can make surgical adjustments.
{_SUPER_SIGNAL_FLOW}
REVIEW GUIDELINES:
- Compare original vs mastered measurements carefully.
- Make VERY SMALL adjustments — typically ±0.5 dB EQ tweaks or 1-2 dB threshold changes. If you changed a parameter by more than ±1 dB on the previous pass, do NOT change it again unless the measurements clearly show a problem.
- If something sounds good, LEAVE IT ALONE. The best revision is often the smallest one.
- Compression thresholds should be -15 to -8 dB for mastering. If you see a threshold below -20 dB, raise it — that compressor is over-compressing.
- Focus on what the measurements tell you: spectral balance, dynamics, stereo image.
- True peak between -1.0 and -0.1 dBTP is ideal, but do NOT over-compress to force it. If the source has a low crest factor, accept the true peak wherever it lands naturally.
- LUFS target is fixed — do not try to change it.
- Reference the previous analysis history to avoid oscillating between settings.
- Each revision should be a refinement, not a reset. Aim for 1-2 parameter changes per pass, not 5+.
TONAL DIRECTION (maintain across all revisions):
- The master should have a slightly WARM overall tone — SUBTLE richness in the low-mids (200-500 Hz) and smooth, non-harsh highs. Avoid clinical or brittle sound.
- High shelf boosts above +1.5 dB will make the master harsh — pull them back if present.
HEAVY, UNCONSTRAINED BASS — Protect low-end punch and transient impact (40-100 Hz) at all costs:
- Rule 1: Prioritize additive EQ (Band 1/Band 2) for bass weight. DO NOT cut any frequencies below 150 Hz with EQ. If a previous pass cut sub-bass, UNDO that cut (set gain to 0 or positive). Cutting sub-bass removes punch and weight.
- Rule 2: DO NOT over-compress the < 200 Hz band. If the source crest factor is already low, default the Low-Band Compressor to BYPASS (Ratio 1:1) or use a very slow attack (>60 ms) so the kick drum transient escapes untouched.
- Rule 3: The sub-bass should feel physical, anchored, and wide open.
- Rule 4: HPF cutoff must stay at or below 25 Hz for bass-heavy material. Only raise it above 30 Hz if measurements show significant rumble below 20 Hz.
CREST FACTOR CHECK (passes 2-4):
- If the crest factor in the < 200 Hz range decreases between passes, you have over-compressed the kick drum. BACK OFF the Low-Band compressor ratio or lengthen the attack time. Do not lose the punch.
- Sidechain Emulation: Treat the Low-Band compressor as if it has a 100 Hz HPF on its sidechain detector. Do not let sustained sub-bass notes clamp down on the rhythmic transients.
Return ONLY a valid JSON object with this exact structure (no markdown fences):
{{
"hpf_freq": <float, 10-80>,
"eq": {{
"band1": {{"type": "<peak|low_shelf|high_shelf>", "freq": <float Hz>, "gain_db": <float>, "q": <float>}},
"band2": {{"type": "<peak|low_shelf|high_shelf>", "freq": <float Hz>, "gain_db": <float>, "q": <float>}},
"band3": {{"type": "<peak|low_shelf|high_shelf>", "freq": <float Hz>, "gain_db": <float>, "q": <float>}},
"band4": {{"type": "<peak|low_shelf|high_shelf>", "freq": <float Hz>, "gain_db": <float>, "q": <float>}},
"band5": {{"type": "<peak|low_shelf|high_shelf>", "freq": <float Hz>, "gain_db": <float>, "q": <float>}},
"band6": {{"type": "<peak|low_shelf|high_shelf>", "freq": <float Hz>, "gain_db": <float>, "q": <float>}}
}},
"compression": {{
"low": {{"threshold": <float dB>, "ratio": <float>, "attack_ms": <float>, "release_ms": <float>}},
"mid": {{"threshold": <float dB>, "ratio": <float>, "attack_ms": <float>, "release_ms": <float>}},
"high": {{"threshold": <float dB>, "ratio": <float>, "attack_ms": <float>, "release_ms": <float>}}
}},
"crossover_low": <float Hz>,
"crossover_high": <float Hz>,
"stereo_width": <int, 80-150>,
"report": "<your full markdown comparison report — Overall Assessment, What Worked Well, Suggested Improvements, Technical Notes>"
}}
The numeric values should be your REVISED settings for a re-master. Make small, targeted adjustments.
The "report" field must contain your actual markdown analysis."""
# System prompt for the closing Super AI pass (super_ai_final_report): asks
# for a markdown quality report only, with no revised settings.
_SUPER_FINAL_REPORT_SYSTEM = f"""You are a world-class audio mastering engineer writing a final quality report. You are evaluating whether a master meets professional distribution standards.
{_SUPER_SIGNAL_FLOW}
TONAL DIRECTION (evaluate against these goals):
- The desired outcome is a slightly WARM overall tone with smooth, non-harsh highs.
- HEAVY, UNCONSTRAINED BASS — the low end (40-100 Hz) should feel physical, punchy, and anchored. Evaluate whether the kick drum transients survived the compression stage. If sub-bass was cut by EQ or crushed by compression, flag it as a failure.
- HPF should be at or below 25 Hz for bass-heavy material.
- Evaluate whether the final master achieves this tonal character.
Write a comprehensive final report in markdown format covering:
### Overall Assessment
(Was the mastering effective? Does it meet professional standards? Does it achieve the desired warm tone with enhanced bass?)
### Spectral Balance
(Evaluate frequency balance — low end warmth, midrange richness, high end smoothness)
### Dynamics & Loudness
(LUFS, true peak compliance, crest factor, dynamic range preservation)
### Stereo Image
(Width, mono compatibility, balance)
### Processing Summary
(What the mastering chain did — EQ moves, compression character, etc.)
### Verdict
(Pass/fail for streaming distribution. Any remaining concerns?)
Be specific. Reference actual measurements. This is the FINAL report — no suggestions for changes, just an honest evaluation of the finished master."""
def _clamp_super_params(d):
    """Clamp Super AI parameters to safe ranges (in place).

    Missing keys get defaults. Values the model returned in an unusable form
    (null, non-numeric strings, NaN, or a non-object where an object is
    expected) also fall back to the default instead of raising, so one bad
    field does not discard the whole recommendation.

    Ranges (default):
        hpf_freq 10-80 (15), stereo_width 80-150 (100, int),
        crossover_low 80-500 (200), crossover_high 1000-10000 (4000);
        eq.band1..band6: freq 20-20000 (1000), gain_db -6..6 (0),
        q 0.1-10 (1.0), type peak/low_shelf/high_shelf ("peak");
        compression.low/mid/high: threshold -20..0 (-14), ratio 1-20 (1.0),
        attack_ms 0.1-200 (30), release_ms 10-500 (150).

    Args:
        d: dict parsed from the model's JSON reply. Mutated in place.

    Returns:
        The same dict ``d``.

    Raises:
        TypeError: if ``d`` is not a dict (e.g. the reply was a JSON list).
    """
    if not isinstance(d, dict):
        # TypeError is what the callers in this file already treat as a
        # parse error.
        raise TypeError(
            f"Super AI parameters must be a JSON object, got {type(d).__name__}"
        )

    def _number(value, default):
        """Return value as a float, or default when it is not a usable number."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return default
        # NaN is the only float unequal to itself; min()/max() would
        # otherwise turn it into the upper bound silently.
        return default if number != number else number

    def _clamped(source, key, lo, hi, default):
        """Read source[key], coerce it to float and clamp it into [lo, hi]."""
        return max(lo, min(hi, _number(source.get(key, default), default)))

    def _section(source, key):
        """Return source[key] when it is a dict, else a fresh empty dict."""
        value = source.get(key)
        return value if isinstance(value, dict) else {}

    d["hpf_freq"] = _clamped(d, "hpf_freq", 10.0, 80.0, 15.0)
    # Clamp as float first so +/-inf cannot reach int() (OverflowError).
    d["stereo_width"] = int(_clamped(d, "stereo_width", 80.0, 150.0, 100.0))
    d["crossover_low"] = _clamped(d, "crossover_low", 80.0, 500.0, 200.0)
    d["crossover_high"] = _clamped(d, "crossover_high", 1000.0, 10000.0, 4000.0)

    eq = _section(d, "eq")
    for bk in ("band1", "band2", "band3", "band4", "band5", "band6"):
        band = _section(eq, bk)
        band["freq"] = _clamped(band, "freq", 20.0, 20000.0, 1000.0)
        band["gain_db"] = _clamped(band, "gain_db", -6.0, 6.0, 0.0)
        band["q"] = _clamped(band, "q", 0.1, 10.0, 1.0)
        if band.get("type") not in ("peak", "low_shelf", "high_shelf"):
            band["type"] = "peak"
        eq[bk] = band
    d["eq"] = eq

    comp = _section(d, "compression")
    for bk in ("low", "mid", "high"):
        bp = _section(comp, bk)
        bp["threshold"] = _clamped(bp, "threshold", -20.0, 0.0, -14.0)
        bp["ratio"] = _clamped(bp, "ratio", 1.0, 20.0, 1.0)
        bp["attack_ms"] = _clamped(bp, "attack_ms", 0.1, 200.0, 30.0)
        bp["release_ms"] = _clamped(bp, "release_ms", 10.0, 500.0, 150.0)
        comp[bk] = bp
    d["compression"] = comp
    return d
def _format_super_settings(params):
    """Render a Super AI parameter dict as bulleted text for prompts.

    Emits one line for the HPF, one per EQ band (band1..band6), one for the
    crossovers, one per compression band (low/mid/high) and one for the
    stereo width. Missing keys are shown with their default values.

    Args:
        params: dict of Super AI parameters.

    Returns:
        str: the lines joined with newlines.
    """
    eq_bands = params.get("eq", {})
    comp_bands = params.get("compression", {})

    def eq_line(key):
        band = eq_bands.get(key, {})
        # A gain this close to zero is reported as a bypassed band.
        if abs(band.get("gain_db", 0)) < 0.01:
            return f"- EQ {key}: bypassed (0 dB)"
        return (f"- EQ {key}: {band.get('type','peak')} @ {band.get('freq',1000)} Hz, "
                f"{band.get('gain_db',0):+.1f} dB, Q={band.get('q',1.0):.2f}")

    def comp_line(key):
        band = comp_bands.get(key, {})
        return (f"- Comp {key}: threshold {band.get('threshold',-14):.1f} dB, "
                f"ratio {band.get('ratio',1.0):.2f}:1, "
                f"attack {band.get('attack_ms',30):.1f} ms, "
                f"release {band.get('release_ms',150):.1f} ms")

    out = [f"- HPF: {params.get('hpf_freq', 15)} Hz"]
    out.extend(eq_line(f"band{n}") for n in range(1, 7))
    out.append(f"- Crossovers: {params.get('crossover_low', 200)} Hz / {params.get('crossover_high', 4000)} Hz")
    out.extend(comp_line(key) for key in ("low", "mid", "high"))
    out.append(f"- Stereo Width: {params.get('stereo_width', 100)}%")
    return "\n".join(out)
def _format_detailed_features(features, label="Audio"):
    """Format detailed features from extract_features_detailed() for prompts.

    The core measurements and the 6-band energy summary are always emitted;
    the spectral profile, resonances, per-compression-band dynamics, dynamic
    variation and per-band stereo correlation sections are emitted only when
    the corresponding key is present and non-empty.

    Args:
        features: dict of measurements. The core keys (lufs, true_peak_dbtp,
            rms_db, crest_factor_db, spectral_centroid_hz,
            spectral_rolloff_hz, stereo_correlation) are required.
        label: heading prefix identifying the signal (e.g. "ORIGINAL").

    Returns:
        str: markdown-style text, lines joined with newlines.

    Raises:
        KeyError: if a required core key is missing.
    """
    lines = [
        f"**{label} — Core Measurements:**",
        f"- Loudness: {features['lufs']} LUFS | True Peak: {features['true_peak_dbtp']} dBTP",
        f"- RMS: {features['rms_db']} dB | Crest Factor: {features['crest_factor_db']} dB",
        f"- Spectral Centroid: {features['spectral_centroid_hz']} Hz | Rolloff: {features['spectral_rolloff_hz']} Hz",
        f"- Spectral Flatness: {features.get('spectral_flatness', 'N/A')} (0=tonal, 1=noise)",
        f"- Spectral Tilt: {features.get('spectral_tilt_db_per_decade', 'N/A')} dB/decade (negative=bass-heavy, positive=bright)",
        f"- Stereo Correlation: {features['stereo_correlation'] if features['stereo_correlation'] is not None else 'N/A (mono)'}",
    ]

    # 24-band spectral profile
    profile = features.get("spectral_profile_24band")
    if profile:
        lines.append(f"\n**{label} — 24-Band Spectral Profile (dB):**")
        for band, val in profile.items():
            # One block character per 2 dB above -60 dB; nothing at or below.
            bar = "█" * max(0, int((val + 60) / 2)) if val > -60 else ""
            lines.append(f" {band:>8s}: {val:>7.1f} dB {bar}")

    # Resonances
    resonances = features.get("resonances", [])
    if resonances:
        lines.append(f"\n**{label} — Spectral Resonances (peaks above neighbors):**")
        for r in resonances:
            lines.append(f" {r['freq_hz']:>8.1f} Hz: {r['level_db']:+.1f} dB "
                         f"(prominence: {r['prominence_db']:.1f} dB)")

    # Per-compression-band dynamics
    cbd = features.get("comp_band_dynamics")
    if cbd:
        lines.append(f"\n**{label} — Per-Compression-Band Dynamics:**")
        for band_name in ("low", "mid", "high"):
            bd = cbd.get(band_name, {})
            lines.append(f" {band_name:>4s}: RMS {bd.get('rms_db', '?')} dB, "
                         f"Peak {bd.get('peak_db', '?')} dB, "
                         f"Crest {bd.get('crest_db', '?')} dB")

    # Dynamic variation
    dv = features.get("dynamic_variation")
    if dv:
        lines.append(f"\n**{label} — Dynamic Variation (4-sec chunks, {dv.get('n_chunks', '?')} chunks):**")
        lines.append(f" RMS range: {dv.get('min_rms_db', '?')} to {dv.get('max_rms_db', '?')} dB "
                     f"(span: {dv.get('range_db', '?')} dB, σ: {dv.get('std_db', '?')} dB)")

    # Per-band stereo correlation
    sbc = features.get("stereo_band_correlation")
    if sbc:
        lines.append(f"\n**{label} — Per-Band Stereo Correlation:**")
        for band_name in ("low", "low_mid", "high_mid", "high"):
            val = sbc.get(band_name)
            lines.append(f" {band_name:>8s}: {val if val is not None else 'N/A'}")

    # Original 6-band energy for backward compat
    lines.append(f"\n**{label} — 6-Band Energy Summary:**")
    for k, v in features.get("band_energy", {}).items():
        lines.append(f" {k}: {v} dB")

    return "\n".join(lines)
def _format_super_history(history):
    """Format Super AI analysis history for prompts.

    Args:
        history: list of dicts, one per previous pass, read via the keys
            ``params``, ``lufs``, ``true_peak``, ``crest_factor`` and
            ``summary``. May be empty or None.

    Returns:
        str: a text section starting with two newlines so it can be appended
        directly to a prompt, or "" when there is no history.
    """
    if not history:
        return ""

    lines = ["\n\n**PREVIOUS ANALYSIS HISTORY** (use this to avoid oscillating — refine, don't reset):"]
    for i, entry in enumerate(history, 1):
        lines.append(f"\n--- Pass {i} ---")
        # `or {}` tolerates an entry whose params were stored as None.
        lines.append(f"Settings:\n{_format_super_settings(entry.get('params') or {})}")
        lines.append(f"Result: LUFS={entry.get('lufs', '?')}, True Peak={entry.get('true_peak', '?')} dBTP, "
                     f"Crest Factor={entry.get('crest_factor', '?')} dB")
        if entry.get("summary"):
            lines.append(f"AI assessment: {entry['summary']}")
    lines.append("\nIMPORTANT: Do NOT oscillate. Refine incrementally. If a setting helped, keep it and fine-tune. "
                 "If it didn't help, try a DIFFERENT approach rather than reverting.")
    return "\n".join(lines)
def super_ai_recommend(audio_path):
    """Analyze raw audio and return full-parametric AI mastering settings.

    Args:
        audio_path: path passed to dsp.load_audio().

    Returns:
        dict with the clamped full parameter set + "reasoning".
        On parse error: {"reasoning": raw_text, "parse_error": True}.
        None when _call_gemini() returns None.
    """
    from dsp import load_audio

    audio, sr = load_audio(audio_path)
    features = extract_features_detailed(audio, sr)
    user_prompt = f"""Analyze this audio and recommend full mastering parameters:
{_format_detailed_features(features, "INPUT")}
Return the JSON object with your recommended full parameter set.
Use the 24-band spectral profile to make precise EQ decisions.
Use the resonances to identify frequencies that need surgical cuts.
Use the per-band dynamics to set compression thresholds and ratios.
Use the dynamic variation to decide how aggressively to compress."""

    response = _call_gemini(_SUPER_RECOMMEND_SYSTEM, user_prompt)
    if response is None:
        return None

    try:
        result = _strip_json(response)
        _clamp_super_params(result)
        result.setdefault("reasoning", "*No explanation provided.*")
    except (ValueError, KeyError, TypeError, AttributeError):
        # ValueError also covers json.JSONDecodeError (its subclass).
        # ValueError/AttributeError are included because a reply that is
        # valid JSON but has the wrong shape (a list, "eq": null, a
        # non-numeric value) makes clamping raise them; report that as a
        # parse error instead of crashing.
        return {"reasoning": response, "parse_error": True}
    return result
def super_ai_compare(original, mastered, sample_rate, params, target_lufs,
                     history=None):
    """Compare original vs mastered with full-parametric revision.

    Args:
        original: audio before mastering, passed to extract_features_detailed().
        mastered: audio after mastering, passed to extract_features_detailed().
        sample_rate: sample rate passed along for both signals.
        params: Super AI parameter dict that produced ``mastered``.
        target_lufs: loudness target, echoed into the prompt.
        history: optional list of previous-pass dicts (see
            _format_super_history()).

    Returns:
        dict with the clamped revised full params + "report".
        On parse error: {"report": raw_text, "parse_error": True}.
        None when _call_gemini() returns None.
    """
    orig_features = extract_features_detailed(original, sample_rate)
    mast_features = extract_features_detailed(mastered, sample_rate)
    history_text = _format_super_history(history or [])
    user_prompt = f"""Compare original vs mastered audio and return revised full parameters:
{_format_detailed_features(orig_features, "ORIGINAL")}
{_format_detailed_features(mast_features, "MASTERED")}
**Settings Applied:**
{_format_super_settings(params)}
- Target LUFS: {target_lufs}{history_text}
Return the JSON with your REVISED full parameter set and comparison report.
Make SMALL, incremental adjustments — refine what's working, fix what isn't.
Compare the 24-band profiles to see exactly where the EQ moved things.
Compare per-band dynamics to evaluate compression effectiveness.
Check if resonances were tamed or if new ones were introduced."""

    response = _call_gemini(_SUPER_COMPARE_SYSTEM, user_prompt)
    if response is None:
        return None

    try:
        result = _strip_json(response)
        _clamp_super_params(result)
        result.setdefault("report", "*No report provided.*")
    except (ValueError, KeyError, TypeError, AttributeError):
        # ValueError also covers json.JSONDecodeError (its subclass).
        # ValueError/AttributeError are included because a reply that is
        # valid JSON but has the wrong shape (a list, "eq": null, a
        # non-numeric value) makes clamping raise them; report that as a
        # parse error instead of crashing.
        return {"report": response, "parse_error": True}
    return result
def super_ai_final_report(original, mastered, sample_rate, params, target_lufs,
                          history=None):
    """Generate a final quality assessment report (no new settings).

    Args:
        original: audio before mastering, passed to extract_features_detailed().
        mastered: audio after mastering, passed to extract_features_detailed().
        sample_rate: sample rate passed along for both signals.
        params: final Super AI parameter dict, rendered into the prompt.
        target_lufs: loudness target, echoed into the prompt.
        history: optional list of previous-pass dicts (see
            _format_super_history()).

    Returns:
        str: markdown report, or a fixed placeholder string when
        _call_gemini() returns None.
    """
    orig_features = extract_features_detailed(original, sample_rate)
    mast_features = extract_features_detailed(mastered, sample_rate)
    history_text = _format_super_history(history or [])
    user_prompt = f"""Write a final mastering quality report for this completed master:
{_format_detailed_features(orig_features, "ORIGINAL")}
{_format_detailed_features(mast_features, "MASTERED")}
**Final Settings Applied:**
{_format_super_settings(params)}
- Target LUFS: {target_lufs}{history_text}
Write the final quality report. No suggestions — just an honest assessment of whether this master meets professional standards.
Reference specific frequency bands and measurements from the detailed analysis above."""

    response = _call_gemini(_SUPER_FINAL_REPORT_SYSTEM, user_prompt)
    if response is None:
        return "*AI final report unavailable.*"
    return response
# v4.2