% =============================================================================
% Spatial-BEATs v13d hyperparameter tables for the NeurIPS appendix
% =============================================================================
% Required packages (add to preamble if not already loaded):
%   \usepackage{booktabs}   % \toprule / \midrule / \bottomrule
%   \usepackage{multirow}   % \multirow group labels
%   \usepackage{amsmath}    % \text, \lvert / \rvert, equation
%   \usepackage{amssymb}    % \mathbb (used for \mathbb{R} in the head table)
%   \usepackage{array}
%
% Conventions used below:
%   * Epochs are 0-indexed (25 epochs -> ep 0--24).
%   * Empty cells are typeset as `---' (ASCII ligature, engine-independent).
% =============================================================================

% -----------------------------------------------------------------------------
% Table 1: FOA input and feature extraction
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
  \centering
  \caption{FOA input and feature extraction parameters.}
  \label{tab:v13d-foa-input}
  \small
  \begin{tabular}{lll}
    \toprule
    \textbf{Parameter} & \textbf{Value} & \textbf{Notes} \\
    \midrule
    Sample rate & 16\,kHz & 4-channel FOA, order [W, X, Y, Z] \\
    Clip duration & 10\,s & Waveform shape $[B, 4, 160000]$ \\
    STFT $n_{\text{fft}}$ & 400 & Aligned with Qwen-2.5-Omni \\
    STFT hop length & 160 & 10\,ms hop \\
    STFT window length & 400 & 25\,ms window \\
    Window function & Hann & --- \\
    Mel filterbank size & 128 & $f_{\min}=0$, $f_{\max}=8000$ \\
    Time frames $T_f$ & 1000 & 100 frames/s $\times$ 10\,s \\
    Input channels & 7 & 4 mel (W/X/Y/Z) + 3 IV (x/y/z) \\
    IV formula & $\mathrm{IV}_d = \mathrm{Re}[W \cdot \overline{X_d}] / (\lvert W\rvert^2 + \varepsilon)$ & $\varepsilon=10^{-8}$, clamp $\pm 10$ after mel \\
    W-channel mean & 15.41663 & BEATs pretrain statistic \\
    W-channel std & 6.55582 & BEATs pretrain statistic \\
    SpecAugment (W only) & 2$\times$time mask (100), 2$\times$freq mask (27) & Training only \\
    \bottomrule
  \end{tabular}
\end{table*}

% -----------------------------------------------------------------------------
% Table 2: Model architecture, prediction heads, and inference (merged 2+3+9)
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
  \centering
  \caption{Spatial-BEATs v13d architecture, prediction heads, and inference / matching configuration. Architecture is identical to v12.}
  \label{tab:v13d-architecture}
  \small
  \begin{tabular}{llc}
    \toprule
    \textbf{Module} & \textbf{Parameter} & \textbf{Value} \\
    \midrule
    \multicolumn{3}{l}{\emph{Encoder backbone}} \\
    \midrule
    \multirow{4}{*}{SpatialDeltaPatchAdapter (v1)}
      & Input / hidden / output channels & 7 / 32 / 512 \\
      & Patch size, stride & $16{\times}16$, 16 \\
      & Residual scale $\alpha$ (init) & 0.1 (learnable) \\
      % Single math span instead of back-to-back $...$$...$ toggles.
      & Layers & $\text{Conv}(7{\to}32, 1{\times}1) \to \text{DW}(3{\times}3) \to \text{Conv}(32{\to}512, 16{\times}16)$ \\
    \midrule
    \multirow{2}{*}{SpatialPatchEmbedding}
      & Patch tokens (10\,s clip) & 496 \\
      & Embed dim & 512 (projected to 768) \\
    \midrule
    \multirow{5}{*}{BEATs Transformer Trunk}
      & Layers & 12 \\
      & Hidden / FFN dim & 768 / 3072 \\
      & Attention heads & 12 \\
      & Position encoding & Sinusoidal relative + GRU gating \\
      & Trunk adapter & 1-layer spectral demixer (zero-gated init) \\
    \midrule
    \multirow{8}{*}{LocalSpatialEncoder}
      & Input shape & $[B, 7, T_f, 128]$ \\
      & CNN stage 1 & Conv2d$(7{\to}64, 3{\times}3)$ + GN(8) + GELU \\
      & CNN stage 2 & Conv2d$(64{\to}128, 3{\times}3, s{=}(1,2))$ + GN(8) + GELU \\
      & CNN stage 3 & Conv2d$(128{\to}256, 3{\times}3, s{=}(1,2))$ + GN(16) + GELU \\
      & Frequency reduction & Mean over freq axis $\to [B, T_f, 256]$ \\
      & Transformer layers / heads & 2 / 4 (pre-LN, $d{=}256$) \\
      & Dropout & 0.1 \\
      & Output projection & Linear$(256 \to 768)$ \\
    \midrule
    \multirow{2}{*}{Frequency Pool + Resampler}
      & Frequency pool & $[B, 496, 768] \to [B, 62, 768]$ (mean) \\
      & Token rate (output) & 10\,Hz, $T_s = 100$ \\
    \midrule
    \multirow{6}{*}{LocalSpatialCrossFuser}
      & Mode & \texttt{cross\_attn\_gated} \\
      & Layers / heads / dim & 2 / 8 / 768 \\
      % Parenthesised value is the sigmoid of the bias, i.e. the initial gate opening.
      & Gate bias (init) & $-2.0$ \quad ($\sigma(-2.0){\approx}0.119$) \\
      & Direct gate bias (init) & $-1.5$ \quad ($\sigma(-1.5){\approx}0.182$) \\
      & Readout & 1-layer Transformer + LayerNorm \\
      & Output & $[B, T_s{=}100, 768]$ \\
    \midrule
    \multirow{5}{*}{SourceQueryDecoder}
      & Track queries $K$ & 4 \\
      & Stage-1 layers & 2 (TransformerDecoder) \\
      & Stage-2 layers & 1 (per-frame refinement + LN) \\
      & Heads / FFN dim & 8 / 3072 \\
      & Output & $[B, K{=}4, T_s{=}100, 768]$ \\
    \midrule
    \multicolumn{3}{l}{\emph{Per-(track, frame) prediction heads}} \\
    \midrule
    Activity head & Structure / output & LayerNorm + Linear$(768{\to}1)$ $\to$ logit $\ell \in \mathbb{R}$ \\
    Class head & Structure / output & MLP + residual + spectral demixer $\to$ 63 logits \\
    Direction head & Structure / output & MLP$(768{\to}768{\to}3)$ + L2-norm $\to$ unit vector $\in \mathbb{R}^3$ \\
    Distance head & Structure / output & MLP$(768{\to}768{\to}1)$ + softplus $\to$ distance (m) \\
    \midrule
    \multicolumn{3}{l}{\emph{Inference and Hungarian matching}} \\
    \midrule
    Matching granularity & --- & Segment-level (stable within same-active-set window) \\
    Matching cost & --- & Activity + class CE + direction cosine + distance $\ell_1$ \\
    Active track selection & --- & Top-$\hat{K}$ (DCASE SELD evaluator) \\
    Train-eval alignment & --- & Top-$K$ rank loss aligned with top-$\hat{K}$ selection \\
    \bottomrule
  \end{tabular}
\end{table*}

% -----------------------------------------------------------------------------
% Table 3: Loss + training schedule + optimizer + scale (merged 4+5+6+7)
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
  \centering
  \caption{Spatial-BEATs v13d training configuration: loss weights, two-phase schedule, optimizer, learning rate, distributed setup, and EMA. The Top-$K$ rank activity loss (D-2) is the v13d core innovation; D-1/D-5/D-6 are stabilizers.}
  \label{tab:v13d-training}
  \small
  \begin{tabular}{lll}
    \toprule
    \textbf{Group} & \textbf{Parameter} & \textbf{Value} \\
    \midrule
    \multirow{5}{*}{Loss weights}
      & $\lambda_{\text{frame\_class}}$ & 1.0 \quad (63-way cross-entropy) \\
      & $\lambda_{\text{frame\_activity}}$ & 1.0 \quad (\textbf{Top-$K$ rank}, replaces BCE) \\
      & $\lambda_{\text{frame\_direction}}$ & 1.0 \quad ($1 - \cos(\hat{\mathbf{d}}, \mathbf{d})$) \\
      & $\lambda_{\text{frame\_distance}}$ & 1.0 \quad (smooth-$\ell_1$) \\
      & $\lambda_{\text{frame\_hemisphere}}$ & 1.0 \quad (hemisphere BCE, from v11a) \\
    \midrule
    \multirow{3}{*}{Top-$K$ rank loss (D-2)}
      & \texttt{frame\_activity\_loss\_type} & \texttt{topk\_rank} \\
      & Margin $m$ & 2.0 \\
      & BCE anchor weight & 0.1 \\
    \midrule
    % Epoch ranges are 0-indexed and inclusive: 3 + 5 + 2 + 15 = 25 epochs.
    \multirow{4}{*}{Two-phase schedule (D-1)}
      & LR linear warmup (ep 0--2) & spatial weight $0$, LR $0 \to 1.5{\times}10^{-5}$ \\
      & Cls-only warmup cont. (ep 3--7) & spatial weight $0$, cosine decay \\
      & Spatial linear ramp (ep 8--9) & spatial weight $0 \to 1$, cosine decay \\
      & Full joint training (ep 10--24) & spatial weight $1$, cosine $\to 7.5{\times}10^{-7}$ \\
    \midrule
    \multirow{5}{*}{Optimizer}
      & Optimizer & AdamW \\
      & $(\beta_1, \beta_2)$ & $(0.9, 0.999)$ \\
      & $\varepsilon$ & $10^{-8}$ \\
      & Weight decay & 0.01 \\
      & Gradient clipping (global $\ell_2$) & 1.0 \\
    \midrule
    \multirow{5}{*}{LR schedule}
      & LR schedule & Linear warmup + cosine decay \\
      & Peak LR & $1.5 \times 10^{-5}$ \\
      & LR warmup epochs & 3 \\
      & Cosine decay epochs & 22 \\
      & Min LR ratio & 0.05 ($\to 7.5\times 10^{-7}$) \\
    \midrule
    \multirow{6}{*}{Training scale}
      & Total epochs & 25 \\
      & GPUs & 8 $\times$ A100 \\
      & Batch size (per GPU / total) & 8 / 64 \\
      & Numerical precision & fp32 \\
      & Data loader workers / GPU & 8 \\
      & Distributed framework & torchrun + DDP \\
    \midrule
    \multirow{2}{*}{Hot-start (D-5)}
      & Resume checkpoint & v12 \texttt{best.pt} (\texttt{strict=False}, missing/unexpected = 0) \\
      & Resume optimizer state & True (Adam moments preserved from v12) \\
    \midrule
    \multirow{4}{*}{EMA (D-6)}
      & \texttt{use\_ema} & True \\
      & EMA decay & 0.9995 \\
      & EMA start epoch & 3 \\
      & EMA application & Validation \& \texttt{best.pt}; restored before next training step \\
    \bottomrule
  \end{tabular}
\end{table*}

% Pairwise hinge on activity logits: every active slot i should out-score every
% inactive slot j by at least the margin m; the 0.1-weighted BCE term matches
% the "BCE anchor weight" row of the training table.
\noindent The Top-$K$ rank activity loss is
\begin{equation}
  \mathcal{L}_{\text{rank}}
    = \frac{1}{|P|} \sum_{(i,j) \in P} \max\!\left(0,\; m + \ell_j - \ell_i\right),
  \qquad
  \mathcal{L}_{\text{act}}
    = \mathcal{L}_{\text{rank}} + 0.1 \cdot \mathcal{L}_{\text{BCE}},
  \label{eq:topk-rank}
\end{equation}
where $P$ is the set of (active slot $i$, inactive slot $j$) pairs within each frame.

% -----------------------------------------------------------------------------
% Table 4: Dataset
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
  \centering
  \caption{Training and validation data composition.}
  \label{tab:v13d-data}
  \small
  \begin{tabular}{lrl}
    \toprule
    \textbf{Split} & \textbf{Clips} & \textbf{Notes} \\
    \midrule
    % TODO(review): the three sub-rows sum to 304 + 74 + 20 = 398 K, which does
    % not match the stated total of ~329 K. Confirm the per-source counts (or
    % the total) against train.jsonl before submission.
    Training (\texttt{unified\_spatial\_foa\_fsd63\_all/train.jsonl}) & $\sim$329\,K & --- \\
    \quad sim\_static & 304\,K & Simulated static sources \\
    \quad qa\_sim & 74\,K & QA-derived simulated \\
    \quad dcase\_real & 20\,K & DCASE real recordings \\
    Manifest replication & $(1,)$ & No real-data oversampling in v13d \\
    Vocabulary & 63 classes & FSD50K-derived (\texttt{final\_vocabulary.csv}) \\
    \midrule
    Validation & $\sim$35\,K + ov\{1,2,3\} sim/real + DCASE STARSS valid & Multi-subset evaluation \\
    \bottomrule
  \end{tabular}
\end{table*}

% -----------------------------------------------------------------------------
% Table 5: Training trajectory (representative checkpoints)
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
  \centering
  % Relative jump is computed from the table rows: 0.397 / 0.193 - 1 = +105.7%.
  \caption{Representative v13d training trajectory. Note the expected F20 dip during cls-only warmup (ep 1--7), followed by the roughly $+106\%$ jump (0.193 $\to$ 0.397) when spatial loss activates at ep 8.}
  \label{tab:v13d-trajectory}
  \small
  \begin{tabular}{lccc}
    \toprule
    \textbf{Epoch} & \textbf{F20} & \textbf{oracle\_cls} & \textbf{azi MAE} \\
    \midrule
    0 & 0.311 & 0.650 & 28.6$^\circ$ \\
    7 (end of cls warmup) & 0.193 & 0.786 & 31.0$^\circ$ \\
    8 (spatial loss activates) & 0.397 & 0.876 & 18.5$^\circ$ \\
    10 (current best) & 0.402 & 0.864 & 17.2$^\circ$ \\
    % Final epoch index is 24 (25 epochs, 0-indexed), matching the schedule table.
    24 (projected) & 0.43--0.46 & $\sim$0.88 & 17--19$^\circ$ \\
    \midrule
    v12 best (reference) & 0.378 & 0.834 & 19.7$^\circ$ \\
    \bottomrule
  \end{tabular}
\end{table*}