% File: Spatial-BEATs/docs/v13d_appendix_tables.tex
% (Repository page header -- uploader dieKarotte, commit 86cbd36, 10.9 kB --
% was captured with the source; kept as a comment so it is not typeset.)
% =============================================================================
% Spatial-BEATs v13d Hyperparameter tables for NeurIPS Appendix
% =============================================================================
% Required packages (add to preamble if not already loaded):
% \usepackage{booktabs}
% \usepackage{multirow}
% \usepackage{amsmath}
% \usepackage{amssymb}   % provides \mathbb (used in the prediction-head rows)
% \usepackage{array}
% =============================================================================
% -----------------------------------------------------------------------------
% Table 1: FOA input and feature extraction
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
\centering
\caption{FOA input and feature extraction parameters.}
\label{tab:v13d-foa-input}
\small
\begin{tabular}{lll}
\toprule
\textbf{Parameter} & \textbf{Value} & \textbf{Notes} \\
\midrule
Sample rate & 16 kHz & 4-channel FOA, order [W, X, Y, Z] \\
Clip duration & 10 s & Waveform shape $[B, 4, 160000]$ \\
STFT $n_{\text{fft}}$ & 400 & Aligned with Qwen-2.5-Omni \\
STFT hop length & 160 & 10 ms hop \\
STFT window length & 400 & 25 ms window \\
Window function & Hann &\\
Mel filterbank size & 128 & $f_{\min}=0$, $f_{\max}=8000$ \\
Time frames $T_f$ & 1000 & 100 frames/s $\times$ 10 s \\
Input channels & 7 & 4 mel (W/X/Y/Z) + 3 IV (x/y/z) \\
IV formula & $\mathrm{IV}_d = \mathrm{Re}[W \cdot \overline{X_d}] / (|W|^2 + \varepsilon)$ & $\varepsilon=10^{-8}$, clamp $\pm 10$ after mel \\
W-channel mean & 15.41663 & BEATs pretrain statistic \\
W-channel std & 6.55582 & BEATs pretrain statistic \\
SpecAugment (W only) & 2$\times$time mask (100), 2$\times$freq mask (27) & Training only \\
\bottomrule
\end{tabular}
\end{table*}
% -----------------------------------------------------------------------------
% Table 2: Model architecture, prediction heads, and inference (merged 2+3+9)
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
\centering
\caption{Spatial-BEATs v13d architecture, prediction heads, and inference / matching configuration. Architecture is identical to v12.}
\label{tab:v13d-architecture}
\small
\begin{tabular}{llc}
\toprule
\textbf{Module} & \textbf{Parameter} & \textbf{Value} \\
\midrule
\multicolumn{3}{l}{\emph{Encoder backbone}} \\
\midrule
\multirow{4}{*}{SpatialDeltaPatchAdapter (v1)}
& Input / hidden / output channels & 7 / 32 / 512 \\
& Patch size, stride & $16{\times}16$, 16 \\
& Residual scale $\alpha$ (init) & 0.1 (learnable) \\
& Layers & Conv$(7{\to}32, 1{\times}1)$$\to$DW$(3{\times}3)$$\to$Conv$(32{\to}512, 16{\times}16)$ \\
\midrule
\multirow{2}{*}{SpatialPatchEmbedding}
& Patch tokens (10 s clip) & 496 \\
& Embed dim & 512 (projected to 768) \\
\midrule
\multirow{5}{*}{BEATs Transformer Trunk}
& Layers & 12 \\
& Hidden / FFN dim & 768 / 3072 \\
& Attention heads & 12 \\
& Position encoding & Sinusoidal relative + GRU gating \\
& Trunk adapter & 1-layer spectral demixer (zero-gated init) \\
\midrule
\multirow{8}{*}{LocalSpatialEncoder}
& Input shape & $[B, 7, T_f, 128]$ \\
& CNN stage 1 & Conv2d$(7{\to}64, 3{\times}3)$ + GN(8) + GELU \\
& CNN stage 2 & Conv2d$(64{\to}128, 3{\times}3, s{=}(1,2))$ + GN(8) + GELU \\
& CNN stage 3 & Conv2d$(128{\to}256, 3{\times}3, s{=}(1,2))$ + GN(16) + GELU \\
& Frequency reduction & Mean over freq axis $\to [B, T_f, 256]$ \\
& Transformer layers / heads & 2 / 4 (pre-LN, $d{=}256$) \\
& Dropout & 0.1 \\
& Output projection & Linear$(256 \to 768)$ \\
\midrule
\multirow{2}{*}{Frequency Pool + Resampler}
& Frequency pool & $[B, 496, 768] \to [B, 62, 768]$ (mean) \\
& Token rate (output) & 10 Hz, $T_s = 100$ \\
\midrule
\multirow{6}{*}{LocalSpatialCrossFuser}
& Mode & \texttt{cross\_attn\_gated} \\
& Layers / heads / dim & 2 / 8 / 768 \\
& Gate bias (init) & $-2.0$ \quad ($\sigma{\approx}0.119$) \\
& Direct gate bias (init) & $-1.5$ \quad ($\sigma{\approx}0.182$) \\
& Readout & 1-layer Transformer + LayerNorm \\
& Output & $[B, T_s{=}100, 768]$ \\
\midrule
\multirow{5}{*}{SourceQueryDecoder}
& Track queries $K$ & 4 \\
& Stage-1 layers & 2 (TransformerDecoder) \\
& Stage-2 layers & 1 (per-frame refinement + LN) \\
& Heads / FFN dim & 8 / 3072 \\
& Output & $[B, K{=}4, T_s{=}100, 768]$ \\
\midrule
\multicolumn{3}{l}{\emph{Per-(track, frame) prediction heads}} \\
\midrule
Activity head & Structure / output & LayerNorm + Linear$(768{\to}1)$ $\to$ logit $\ell \in \mathbb{R}$ \\
Class head & Structure / output & MLP + residual + spectral demixer $\to$ 63 logits \\
Direction head & Structure / output & MLP$(768{\to}768{\to}3)$ + L2-norm $\to$ unit vector $\in \mathbb{R}^3$ \\
Distance head & Structure / output & MLP$(768{\to}768{\to}1)$ + softplus $\to$ distance (m) \\
\midrule
\multicolumn{3}{l}{\emph{Inference and Hungarian matching}} \\
\midrule
Matching granularity && Segment-level (stable within same-active-set window) \\
Matching cost && Activity + class CE + direction cosine + distance $\ell_1$ \\
Active track selection && Top-$\hat{K}$ (DCASE SELD evaluator) \\
Train-eval alignment && Top-$K$ rank loss aligned with top-$\hat{K}$ selection \\
\bottomrule
\end{tabular}
\end{table*}
% -----------------------------------------------------------------------------
% Table 3: Loss + training schedule + optimizer + scale (merged 4+5+6+7)
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
\centering
\caption{Spatial-BEATs v13d training configuration: loss weights, two-phase schedule, optimizer, learning rate, distributed setup, and EMA. The Top-$K$ rank activity loss (D-2) is the v13d core innovation; D-1/D-5/D-6 are stabilizers.}
\label{tab:v13d-training}
\small
\begin{tabular}{lll}
\toprule
\textbf{Group} & \textbf{Parameter} & \textbf{Value} \\
\midrule
\multirow{5}{*}{Loss weights}
& $\lambda_{\text{frame\_class}}$ & 1.0 \quad (63-way cross-entropy) \\
& $\lambda_{\text{frame\_activity}}$ & 1.0 \quad (\textbf{Top-$K$ rank}, replaces BCE) \\
& $\lambda_{\text{frame\_direction}}$ & 1.0 \quad ($1 - \cos(\hat{\mathbf{d}}, \mathbf{d})$) \\
& $\lambda_{\text{frame\_distance}}$ & 1.0 \quad (smooth-$\ell_1$) \\
& $\lambda_{\text{frame\_hemisphere}}$ & 1.0 \quad (hemisphere BCE, from v11a) \\
\midrule
\multirow{3}{*}{Top-$K$ rank loss (D-2)}
& \texttt{frame\_activity\_loss\_type} & \texttt{topk\_rank} \\
& Margin $m$ & 2.0 \\
& BCE anchor weight & 0.1 \\
\midrule
\multirow{4}{*}{Two-phase schedule (D-1)}
& LR linear warmup (ep 0 -- 2) & spatial weight $0$, LR $0 \to 1.5{\times}10^{-5}$ \\
& Cls-only warmup cont. (ep 3 -- 7) & spatial weight $0$, cosine decay \\
& Spatial linear ramp (ep 8 -- 9) & spatial weight $0 \to 1$, cosine decay \\
& Full joint training (ep 10 -- 24) & spatial weight $1$, cosine $\to 7.5{\times}10^{-7}$ \\
\midrule
\multirow{5}{*}{Optimizer}
& Optimizer & AdamW \\
& $(\beta_1, \beta_2)$ & $(0.9, 0.999)$ \\
& $\varepsilon$ & $10^{-8}$ \\
& Weight decay & 0.01 \\
& Gradient clipping (global $\ell_2$) & 1.0 \\
\midrule
\multirow{5}{*}{LR schedule}
& LR schedule & Linear warmup + cosine decay \\
& Peak LR & $1.5 \times 10^{-5}$ \\
& LR warmup epochs & 3 \\
& Cosine decay epochs & 22 \\
& Min LR ratio & 0.05 ($\to 7.5\times 10^{-7}$) \\
\midrule
\multirow{6}{*}{Training scale}
& Total epochs & 25 \\
& GPUs & 8 $\times$ A100 \\
& Batch size (per GPU / total) & 8 / 64 \\
& Numerical precision & fp32 \\
& Data loader workers / GPU & 8 \\
& Distributed framework & torchrun + DDP \\
\midrule
\multirow{2}{*}{Hot-start (D-5)}
& Resume checkpoint & v12 best.pt (strict=False, missing/unexpected = 0) \\
& Resume optimizer state & True (Adam moments preserved from v12) \\
\midrule
\multirow{4}{*}{EMA (D-6)}
& \texttt{use\_ema} & True \\
& EMA decay & 0.9995 \\
& EMA start epoch & 3 \\
& EMA application & Validation \& best.pt; restored before next training step \\
\bottomrule
\end{tabular}
\end{table*}
\noindent The Top-$K$ rank activity loss is
\begin{equation}
\mathcal{L}_{\text{rank}} = \frac{1}{|P|} \sum_{(i,j) \in P} \max\!\left(0,\; m + \ell_j - \ell_i\right),\qquad
\mathcal{L}_{\text{act}} = \mathcal{L}_{\text{rank}} + 0.1 \cdot \mathcal{L}_{\text{BCE}},
\label{eq:topk-rank}
\end{equation}
where $P$ is the set of (active slot $i$, inactive slot $j$) pairs within each frame.
% -----------------------------------------------------------------------------
% Table 4: Dataset
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
\centering
\caption{Training and validation data composition.}
\label{tab:v13d-data}
\small
\begin{tabular}{lrl}
\toprule
\textbf{Split} & \textbf{Clips} & \textbf{Notes} \\
\midrule
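% NOTE(review): the three subsets listed below sum to 398 K (304 + 74 + 20),
% not ~329 K -- confirm which figure is correct before submission.
Training (\texttt{unified\_spatial\_foa\_fsd63\_all/train.jsonl}) & $\sim$329 K &\\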
\quad sim\_static & 304 K & Simulated static sources \\
\quad qa\_sim & 74 K & QA-derived simulated \\
\quad dcase\_real & 20 K & DCASE real recordings \\
Manifest replication & $(1,)$ & No real-data oversampling in v13d \\
Vocabulary & 63 classes & FSD50K-derived (\texttt{final\_vocabulary.csv}) \\
\midrule
Validation & $\sim$35 K + ov\{1,2,3\} sim/real + DCASE STARSS valid & Multi-subset evaluation \\
\bottomrule
\end{tabular}
\end{table*}
% -----------------------------------------------------------------------------
% Table 5: Training trajectory (representative checkpoints)
% -----------------------------------------------------------------------------
\begin{table*}[htbp]
\centering
\caption{Representative v13d training trajectory. Note the expected F20 dip during cls-only warmup (ep 1--7), followed by the +106\% jump (0.193 $\to$ 0.397) when spatial loss activates at ep 8.}
\label{tab:v13d-trajectory}
\small
\begin{tabular}{lccc}
\toprule
\textbf{Epoch} & \textbf{F20} & \textbf{oracle\_cls} & \textbf{azi MAE} \\
\midrule
0 & 0.311 & 0.650 & 28.6$^\circ$ \\
7 (end of cls warmup) & 0.193 & 0.786 & 31.0$^\circ$ \\
8 (spatial loss activates) & 0.397 & 0.876 & 18.5$^\circ$ \\
10 (current best) & 0.402 & 0.864 & 17.2$^\circ$ \\
24 (final epoch, projected) & 0.43 -- 0.46 & $\sim$0.88 & 17 -- 19$^\circ$ \\
\midrule
v12 best (reference) & 0.378 & 0.834 & 19.7$^\circ$ \\
\bottomrule
\end{tabular}
\end{table*}