"""
Stable Preprocessing Pipeline - v6.6.1
SINGLE source of truth for:
- Canonical SR
- Onset window lengths
- Transient/tail slicing
Key principle: Same audio → Same representation (deterministic)
"""
import os
import io
import numpy as np
import librosa
import soundfile as sf
try:
import soxr
SOXR_AVAILABLE = True
except ImportError:
SOXR_AVAILABLE = False
print("[preprocessing] Warning: soxr not available, using librosa (less deterministic)")
# =========================
# CANONICAL SETTINGS
# =========================
# Indexing and querying must agree on every value below; change them only
# together with a full reindex (same audio must map to the same representation).
CANONICAL_SR: int = 48000  # Hz; all audio is resampled to this rate
# Window around onset (ms)
ONSET_PRE_MS: int = 15
ONSET_POST_MS: int = 735 # total window = 750ms
# View presets (all in ms, relative to START OF WINDOW)
# NOTE: start of window is ONSET_PRE_MS before onset.
VIEW_PRESETS: dict[str, dict[str, int]] = {
    # Optimized for short one-shots / CNN-style event models
    "hits": {
        "TRANS_END_MS": 85, # first 85ms of window (15ms pre + 70ms post)
        "TAIL_START_MS": 30, # skip earliest transient region
        "TAIL_END_MS": 650, # capture body/decay
    },
    # Sometimes helps transformer encoders on micro-clips (requires reindex to compare)
    "transformer": {
        "TRANS_END_MS": 140, # longer transient context
        "TAIL_START_MS": 40,
        "TAIL_END_MS": 700,
    }
}
# Preset chosen via env var SCOUT_VIEW_PRESET; unknown names fall back to "hits".
DEFAULT_VIEW_PRESET: str = os.getenv("SCOUT_VIEW_PRESET", "hits").strip().lower()
if DEFAULT_VIEW_PRESET not in VIEW_PRESETS:
    DEFAULT_VIEW_PRESET = "hits"
# Export these so scout.py can't drift
TRANS_END_MS: int = VIEW_PRESETS[DEFAULT_VIEW_PRESET]["TRANS_END_MS"]
TAIL_START_MS: int = VIEW_PRESETS[DEFAULT_VIEW_PRESET]["TAIL_START_MS"]
TAIL_END_MS: int = VIEW_PRESETS[DEFAULT_VIEW_PRESET]["TAIL_END_MS"]
def canonicalize_audio(audio: np.ndarray, sr: int) -> tuple[np.ndarray, int]:
    """
    Deterministic audio canonicalization.

    Steps, in execution order:
      1) Downmix to mono (channel average)
      2) Resample to CANONICAL_SR (soxr "HQ" when available, else librosa)
      3) Remove DC offset
      4) Peak normalize to +/-1

    FIX: DC removal now runs BEFORE peak normalization. Previously it ran
    after, so subtracting the mean could push samples past +/-1 and the
    "peak normalize to +/-1" contract did not actually hold.

    Parameters
    ----------
    audio : np.ndarray
        1-D mono signal, or 2-D assumed (frames, channels) — soundfile's
        layout; confirm if callers feed arrays from other loaders.
    sr : int
        Native sample rate of `audio`.

    Returns
    -------
    tuple[np.ndarray, int]
        (float32 canonical signal, CANONICAL_SR).
    """
    if audio.ndim > 1:
        # Average channels; assumes (frames, channels) layout.
        audio = np.mean(audio, axis=1)
    audio = audio.astype(np.float32, copy=False)
    if sr != CANONICAL_SR:
        if SOXR_AVAILABLE:
            # soxr is bit-deterministic across platforms, preferred.
            audio = soxr.resample(audio, sr, CANONICAL_SR, quality="HQ")
        else:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=CANONICAL_SR, res_type="kaiser_best")
        sr = CANONICAL_SR
    # Remove DC first so the peak scan below sees the final waveform and
    # the +/-1 guarantee holds on the returned signal.
    if audio.size:
        audio = audio - float(np.mean(audio))
    peak = float(np.max(np.abs(audio))) if audio.size else 0.0
    if peak > 1e-6:  # skip normalization for (near-)silence
        audio = audio / peak
    return audio.astype(np.float32, copy=False), CANONICAL_SR
def detect_primary_onset_stable(audio: np.ndarray, sr: int) -> int:
    """
    Locate the primary (strongest) onset deterministically.

    Builds a median-aggregated onset-strength envelope, picks the candidate
    frame with the largest envelope value, then snaps the resulting sample
    index to the nearest zero crossing within +/-100 samples so tiny
    envelope jitter cannot move the cut point.

    Returns the onset as a sample index; falls back to 100 ms into the
    clip when no peak is detected.
    """
    hop = 256
    envelope = librosa.onset.onset_strength(
        y=audio,
        sr=sr,
        hop_length=hop,
        aggregate=np.median,
        center=False
    )
    candidates = librosa.util.peak_pick(
        envelope,
        pre_max=3,
        post_max=3,
        pre_avg=3,
        post_avg=5,
        delta=0.05,
        wait=10
    )
    if len(candidates) == 0:
        # No detectable onset: assume it sits 100 ms in.
        return int(0.1 * sr)
    best_frame = int(candidates[int(np.argmax(envelope[candidates]))])
    onset = int(librosa.frames_to_samples(best_frame, hop_length=hop))
    # Zero-crossing refinement in a small neighbourhood around the onset.
    half_width = 100
    lo = max(0, onset - half_width)
    hi = min(len(audio), onset + half_width)
    if hi > lo + 2:
        neighbourhood = audio[lo:hi]
        crossings = np.where(np.diff(np.sign(neighbourhood)))[0]
        if crossings.size:
            # Position of the raw onset inside the neighbourhood slice.
            target = half_width if onset >= half_width else onset - lo
            onset = lo + int(crossings[int(np.argmin(np.abs(crossings - target)))])
    return int(onset)
def extract_canonical_window(audio: np.ndarray, sr: int, onset_sample: int) -> np.ndarray:
    """
    Cut a fixed-length window around the onset.

    The window covers ONSET_PRE_MS before and ONSET_POST_MS after the
    onset and always comes back at exactly that length for the given sr:
    short extracts are right-padded with zeros, long ones truncated.
    """
    pre = int(ONSET_PRE_MS * sr / 1000.0)
    post = int(ONSET_POST_MS * sr / 1000.0)
    target_len = pre + post
    lo = max(0, onset_sample - pre)
    hi = min(len(audio), onset_sample + post)
    window = audio[lo:hi].astype(np.float32, copy=False)
    if window.size < target_len:
        # Onset near a clip edge: pad the tail with silence.
        window = np.pad(window, (0, target_len - window.size), mode="constant")
    else:
        window = window[:target_len]
    return window.astype(np.float32, copy=False)
def preprocess_audio_stable(audio_bytes: bytes) -> dict:
    """
    MASTER preprocessing entry point for QUERY uploads (raw file bytes).

    Pipeline: decode -> canonicalize -> onset detection -> fixed window.
    Returns a dict carrying the windowed audio, the canonical sample rate,
    and the detected onset in both seconds and samples.
    """
    raw, native_sr = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=False)
    canon, sr = canonicalize_audio(raw, native_sr)
    onset = detect_primary_onset_stable(canon, sr)
    return {
        "audio": extract_canonical_window(canon, sr, onset),
        "sr": sr,
        "onset_time": onset / sr,
        "onset_sample": onset
    }
def slice_views_stable(processed: dict, view_preset: str | None = None) -> dict:
    """
    Build the full/trans/tail views from a canonical window.

    view_preset may be "hits", "transformer", or None. None falls back to
    DEFAULT_VIEW_PRESET (driven by the SCOUT_VIEW_PRESET env var); any
    unrecognized name silently degrades to "hits".
    """
    window = processed["audio"]
    sr = processed["sr"]
    name = (view_preset or DEFAULT_VIEW_PRESET).strip().lower()
    if name not in VIEW_PRESETS:
        name = "hits"
    preset = VIEW_PRESETS[name]

    def to_index(ms_key: str) -> int:
        # Millisecond mark -> sample index, clamped to the window bounds.
        return max(0, min(int(preset[ms_key] * sr / 1000.0), window.size))

    return {
        "full": window,
        "trans": window[:to_index("TRANS_END_MS")],
        "tail": window[to_index("TAIL_START_MS"):to_index("TAIL_END_MS")],
    }
def verify_stability() -> bool:
    """
    Determinism smoke test.

    Runs the full pipeline five times over the same in-memory WAV (a
    decaying 200 Hz sine) and checks that every extracted window matches
    the first run to within 1e-6. Returns True on success, False if any
    run diverges.
    """
    print("[preprocessing] Running stability test...")
    sr = 48000
    t = np.linspace(0, 1.0, int(sr * 1.0), endpoint=False)
    tone = (np.sin(2 * np.pi * 200 * t) * np.exp(-t * 5)).astype(np.float32)
    buf = io.BytesIO()
    sf.write(buf, tone, sr, format="WAV")
    payload = buf.getvalue()
    windows = [preprocess_audio_stable(payload)["audio"] for _ in range(5)]
    reference = windows[0]
    for candidate in windows[1:]:
        diff = float(np.max(np.abs(reference - candidate)))
        if diff > 1e-6:
            print(f"[preprocessing] ⚠️ Instability detected: {diff}")
            return False
    print("[preprocessing] ✓ Stability test passed")
    return True
# Allow running this module directly as a determinism smoke test.
if __name__ == "__main__":
    verify_stability()