% =============================================================================
% Spatial-BEATs v13d Hyperparameter tables for NeurIPS Appendix
% =============================================================================
% Required packages (add to preamble if not already loaded):
%   \usepackage{booktabs}
%   \usepackage{multirow}
%   \usepackage{amsmath}
%   \usepackage{amssymb}   % provides \mathbb (used for \mathbb{R} in the prediction-head rows)
%   \usepackage{array}
% =============================================================================


% -----------------------------------------------------------------------------
% Table 1: FOA input and feature extraction
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
\centering
\caption{FOA input and feature extraction parameters.}
\label{tab:v13d-foa-input}
\small
\begin{tabular}{lll}
\toprule
\textbf{Parameter} & \textbf{Value} & \textbf{Notes} \\
\midrule
Sample rate & 16 kHz & 4-channel FOA, order [W, X, Y, Z] \\
Clip duration & 10 s & Waveform shape $[B, 4, 160000]$ \\
STFT $n_{\text{fft}}$ & 400 & Aligned with Qwen-2.5-Omni \\
STFT hop length & 160 & 10 ms hop \\
STFT window length & 400 & 25 ms window \\
Window function & Hann & — \\
Mel filterbank size & 128 & $f_{\min}=0$\,Hz, $f_{\max}=8000$\,Hz \\
Time frames $T_f$ & 1000 & 100 frames/s $\times$ 10 s \\
Input channels & 7 & 4 mel (W/X/Y/Z) + 3 IV (x/y/z) \\
IV formula & $\mathrm{IV}_d = \mathrm{Re}[W \cdot \overline{X_d}] / (|W|^2 + \varepsilon)$ & $\varepsilon=10^{-8}$, clamp $\pm 10$ after mel \\
W-channel mean & 15.41663 & BEATs pretrain statistic \\
W-channel std & 6.55582 & BEATs pretrain statistic \\
SpecAugment (W only) & 2$\times$time mask (100), 2$\times$freq mask (27) & Training only \\
\bottomrule
\end{tabular}
\end{table*}


% -----------------------------------------------------------------------------
% Table 2: Model architecture, prediction heads, and inference (merged 2+3+9)
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
\centering
\caption{Spatial-BEATs v13d architecture, prediction heads, and inference / matching configuration. Architecture is identical to v12.}
\label{tab:v13d-architecture}
\small
\begin{tabular}{llc}
\toprule
\textbf{Module} & \textbf{Parameter} & \textbf{Value} \\
\midrule
\multicolumn{3}{l}{\emph{Encoder backbone}} \\
\midrule
\multirow{4}{*}{SpatialDeltaPatchAdapter (v1)}
 & Input / hidden / output channels & 7 / 32 / 512 \\
 & Patch size, stride & $16{\times}16$, 16 \\
 & Residual scale $\alpha$ (init) & 0.1 (learnable) \\
 & Layers & Conv$(7{\to}32, 1{\times}1)$$\to$DW$(3{\times}3)$$\to$Conv$(32{\to}512, 16{\times}16)$ \\
\midrule
\multirow{2}{*}{SpatialPatchEmbedding}
 & Patch tokens (10 s clip) & 496 \\
 & Embed dim & 512 (projected to 768) \\
\midrule
\multirow{5}{*}{BEATs Transformer Trunk}
 & Layers & 12 \\
 & Hidden / FFN dim & 768 / 3072 \\
 & Attention heads & 12 \\
 & Position encoding & Sinusoidal relative + GRU gating \\
 & Trunk adapter & 1-layer spectral demixer (zero-gated init) \\
\midrule
\multirow{8}{*}{LocalSpatialEncoder}
 & Input shape & $[B, 7, T_f, 128]$ \\
 & CNN stage 1 & Conv2d$(7{\to}64, 3{\times}3)$ + GN(8) + GELU \\
 & CNN stage 2 & Conv2d$(64{\to}128, 3{\times}3, s{=}(1,2))$ + GN(8) + GELU \\
 & CNN stage 3 & Conv2d$(128{\to}256, 3{\times}3, s{=}(1,2))$ + GN(16) + GELU \\
 & Frequency reduction & Mean over freq axis $\to [B, T_f, 256]$ \\
 & Transformer layers / heads & 2 / 4 (pre-LN, $d{=}256$) \\
 & Dropout & 0.1 \\
 & Output projection & Linear$(256 \to 768)$ \\
\midrule
\multirow{2}{*}{Frequency Pool + Resampler}
 & Frequency pool & $[B, 496, 768] \to [B, 62, 768]$ (mean) \\
 & Token rate (output) & 10 Hz, $T_s = 100$ \\
\midrule
\multirow{6}{*}{LocalSpatialCrossFuser}
 & Mode & \texttt{cross\_attn\_gated} \\
 & Layers / heads / dim & 2 / 8 / 768 \\
 & Gate bias (init) & $-2.0$ \quad ($\sigma{\approx}0.119$) \\
 & Direct gate bias (init) & $-1.5$ \quad ($\sigma{\approx}0.182$) \\
 & Readout & 1-layer Transformer + LayerNorm \\
 & Output & $[B, T_s{=}100, 768]$ \\
\midrule
\multirow{5}{*}{SourceQueryDecoder}
 & Track queries $K$ & 4 \\
 & Stage-1 layers & 2 (TransformerDecoder) \\
 & Stage-2 layers & 1 (per-frame refinement + LN) \\
 & Heads / FFN dim & 8 / 3072 \\
 & Output & $[B, K{=}4, T_s{=}100, 768]$ \\
\midrule
\multicolumn{3}{l}{\emph{Per-(track, frame) prediction heads}} \\
\midrule
Activity head  & Structure / output & LayerNorm + Linear$(768{\to}1)$ $\to$ logit $\ell \in \mathbb{R}$ \\
Class head     & Structure / output & MLP + residual + spectral demixer $\to$ 63 logits \\
Direction head & Structure / output & MLP$(768{\to}768{\to}3)$ + L2-norm $\to$ unit vector $\in \mathbb{R}^3$ \\
Distance head  & Structure / output & MLP$(768{\to}768{\to}1)$ + softplus $\to$ distance (m) \\
\midrule
\multicolumn{3}{l}{\emph{Inference and Hungarian matching}} \\
\midrule
Matching granularity   & — & Segment-level (stable within same-active-set window) \\
Matching cost          & — & Activity + class CE + direction cosine + distance $\ell_1$ \\
Active track selection & — & Top-$\hat{K}$ (DCASE SELD evaluator) \\
Train-eval alignment   & — & Top-$K$ rank loss aligned with top-$\hat{K}$ selection \\
\bottomrule
\end{tabular}
\end{table*}


% -----------------------------------------------------------------------------
% Table 3: Loss + training schedule + optimizer + scale (merged 4+5+6+7)
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
\centering
\caption{Spatial-BEATs v13d training configuration: loss weights, two-phase schedule, optimizer, learning rate, distributed setup, and EMA. The Top-$K$ rank activity loss (D-2) is the v13d core innovation; D-1/D-5/D-6 are stabilizers.}
\label{tab:v13d-training}
\small
\begin{tabular}{lll}
\toprule
\textbf{Group} & \textbf{Parameter} & \textbf{Value} \\
\midrule
\multirow{5}{*}{Loss weights}
 & $\lambda_{\text{frame\_class}}$       & 1.0 \quad (63-way cross-entropy) \\
 & $\lambda_{\text{frame\_activity}}$    & 1.0 \quad (\textbf{Top-$K$ rank}, replaces BCE) \\
 & $\lambda_{\text{frame\_direction}}$   & 1.0 \quad ($1 - \cos(\hat{\mathbf{d}}, \mathbf{d})$) \\
 & $\lambda_{\text{frame\_distance}}$    & 1.0 \quad (smooth-$\ell_1$) \\
 & $\lambda_{\text{frame\_hemisphere}}$  & 1.0 \quad (hemisphere BCE, from v11a) \\
\midrule
\multirow{3}{*}{Top-$K$ rank loss (D-2)}
 & \texttt{frame\_activity\_loss\_type}  & \texttt{topk\_rank} \\
 & Margin $m$                            & 2.0 \\
 & BCE anchor weight                     & 0.1 \\
\midrule
\multirow{4}{*}{Two-phase schedule (D-1)}
 & LR linear warmup (ep 0 -- 2)          & spatial weight $0$, LR $0 \to 1.5{\times}10^{-5}$ \\
 & Cls-only warmup cont. (ep 3 -- 7)     & spatial weight $0$, cosine decay \\
 & Spatial linear ramp (ep 8 -- 9)       & spatial weight $0 \to 1$, cosine decay \\
 & Full joint training (ep 10 -- 24)     & spatial weight $1$, cosine $\to 7.5{\times}10^{-7}$ \\
\midrule
\multirow{5}{*}{Optimizer}
 & Optimizer                             & AdamW \\
 & $(\beta_1, \beta_2)$                  & $(0.9, 0.999)$ \\
 & $\varepsilon$                         & $10^{-8}$ \\
 & Weight decay                          & 0.01 \\
 & Gradient clipping (global $\ell_2$)   & 1.0 \\
\midrule
\multirow{5}{*}{LR schedule}
 & LR schedule                           & Linear warmup + cosine decay \\
 & Peak LR                               & $1.5 \times 10^{-5}$ \\
 & LR warmup epochs                      & 3 \\
 & Cosine decay epochs                   & 22 \\
 & Min LR ratio                          & 0.05 ($\to 7.5\times 10^{-7}$) \\
\midrule
\multirow{6}{*}{Training scale}
 & Total epochs                          & 25 \\
 & GPUs                                  & 8 $\times$ A100 \\
 & Batch size (per GPU / total)          & 8 / 64 \\
 & Numerical precision                   & fp32 \\
 & Data loader workers / GPU             & 8 \\
 & Distributed framework                 & torchrun + DDP \\
\midrule
\multirow{2}{*}{Hot-start (D-5)}
 & Resume checkpoint                     & v12 best.pt (strict=False, missing/unexpected = 0) \\
 & Resume optimizer state                & True (Adam moments preserved from v12) \\
\midrule
\multirow{4}{*}{EMA (D-6)}
 & \texttt{use\_ema}                     & True \\
 & EMA decay                             & 0.9995 \\
 & EMA start epoch                       & 3 \\
 & EMA application                       & Validation \& best.pt; restored before next training step \\
\bottomrule
\end{tabular}
\end{table*}

\noindent The Top-$K$ rank activity loss is
\begin{equation}
\mathcal{L}_{\text{rank}} = \frac{1}{|P|} \sum_{(i,j) \in P} \max\!\left(0,\; m + \ell_j - \ell_i\right),\qquad
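\mathcal{L}_{\text{act}} = \mathcal{L}_{\text{rank}} + 0.1 \cdot \mathcal{L}_{\text{BCE}},
\label{eq:topk-rank}
\end{equation}
where $P$ is the set of (active slot $i$, inactive slot $j$) pairs within each frame, $\ell_i$ and $\ell_j$ are the activity-head logits of those slots, $m = 2.0$ is the margin, and $0.1$ is the BCE anchor weight (Table~\ref{tab:v13d-training}).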


% -----------------------------------------------------------------------------
% Table 4: Dataset
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
\centering
\caption{Training and validation data composition.}
\label{tab:v13d-data}
\small
\begin{tabular}{lrl}
\toprule
\textbf{Split} & \textbf{Clips} & \textbf{Notes} \\
\midrule
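% NOTE(review): the three sub-rows below sum to 304 K + 74 K + 20 K = 398 K, not ~329 K.
% Confirm the total or the per-source counts (or state that the subsets overlap).
Training (\texttt{unified\_spatial\_foa\_fsd63\_all/train.jsonl}) & $\sim$329 K & — \\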
\quad sim\_static       & 304 K & Simulated static sources \\
\quad qa\_sim           & 74 K  & QA-derived simulated \\
\quad dcase\_real       & 20 K  & DCASE real recordings \\
Manifest replication    & $(1,)$ & No real-data oversampling in v13d \\
Vocabulary              & 63 classes & FSD50K-derived (\texttt{final\_vocabulary.csv}) \\
\midrule
Validation              & $\sim$35 K & Multi-subset evaluation: + ov\{1,2,3\} sim/real + DCASE STARSS valid \\
\bottomrule
\end{tabular}
\end{table*}


% -----------------------------------------------------------------------------
% Table 5: Training trajectory (representative checkpoints)
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
\centering
\caption{Representative v13d training trajectory. Note the expected F20 dip during cls-only warmup (ep 0--7), followed by the +106\% jump (0.193 $\to$ 0.397) when spatial loss activates at ep 8.}
\label{tab:v13d-trajectory}
\small
\begin{tabular}{lccc}
\toprule
\textbf{Epoch} & \textbf{F20} & \textbf{oracle\_cls} & \textbf{azi MAE} \\
\midrule
0                                & 0.311 & 0.650 & 28.6$^\circ$ \\
7 (end of cls warmup)            & 0.193 & 0.786 & 31.0$^\circ$ \\
8 (spatial loss activates)       & 0.397 & 0.876 & 18.5$^\circ$ \\
10 (current best)                & 0.402 & 0.864 & 17.2$^\circ$ \\
24 (final, projected)            & 0.43 -- 0.46 & $\sim$0.88 & 17 -- 19$^\circ$ \\
\midrule
v12 best (reference)             & 0.378 & 0.834 & 19.7$^\circ$ \\
\bottomrule
\end{tabular}
\end{table*}