% Spatial-BEATs / docs / v13d_appendix_tables.tex
%
% Add files using upload-large-folder tool
%
% 86cbd36 verified 19 days ago
%
% 10.9 kB

	% =============================================================================
	% Spatial-BEATs v13d Hyperparameter tables for NeurIPS Appendix
	% =============================================================================
	% Required packages (add to preamble if not already loaded):
	% \usepackage{booktabs}
	% \usepackage{multirow}
	% \usepackage{amsmath}
	% \usepackage{amssymb}  % provides \mathbb, used in the prediction-head rows
	% \usepackage{array}
	% =============================================================================


	% -----------------------------------------------------------------------------
	% Table 1: FOA input and feature extraction
	% -----------------------------------------------------------------------------
	\begin{table*}[htbp]
	\centering
	\caption{FOA input and feature extraction parameters.}
	\label{tab:v13d-foa-input}
	\small
	\begin{tabular}{lll}
	\toprule
	\textbf{Parameter} & \textbf{Value} & \textbf{Notes} \\
	\midrule
	Sample rate & 16 kHz & 4-channel FOA, order [W, X, Y, Z] \\
	Clip duration & 10 s & Waveform shape $[B, 4, 160000]$ \\
	STFT $n_{\text{fft}}$ & 400 & Aligned with Qwen-2.5-Omni \\
	STFT hop length & 160 & 10 ms hop \\
	STFT window length & 400 & 25 ms window \\
	Window function & Hann & — \\
	Mel filterbank size & 128 & $f_{\min}=0$, $f_{\max}=8000$ \\
	Time frames $T_f$ & 1000 & 100 frames/s $\times$ 10 s \\
	Input channels & 7 & 4 mel (W/X/Y/Z) + 3 IV (x/y/z) \\
	IV formula & $\mathrm{IV}_d = \mathrm{Re}[W \cdot \overline{X_d}] / (\|W\|^2 + \varepsilon)$ & $\varepsilon=10^{-8}$, clamp $\pm 10$ after mel \\
	W-channel mean & 15.41663 & BEATs pretrain statistic \\
	W-channel std & 6.55582 & BEATs pretrain statistic \\
	SpecAugment (W only) & 2$\times$time mask (100), 2$\times$freq mask (27) & Training only \\
	\bottomrule
	\end{tabular}
	\end{table*}


	% -----------------------------------------------------------------------------
	% Table 2: Model architecture, prediction heads, and inference (merged 2+3+9)
	% -----------------------------------------------------------------------------
	\begin{table*}[htbp]
	\centering
	\caption{Spatial-BEATs v13d architecture, prediction heads, and inference / matching configuration. Architecture is identical to v12.}
	\label{tab:v13d-architecture}
	\small
	\begin{tabular}{llc}
	\toprule
	\textbf{Module} & \textbf{Parameter} & \textbf{Value} \\
	\midrule
	\multicolumn{3}{l}{\emph{Encoder backbone}} \\
	\midrule
	\multirow{4}{*}{SpatialDeltaPatchAdapter (v1)}
	& Input / hidden / output channels & 7 / 32 / 512 \\
	& Patch size, stride & $16{\times}16$, 16 \\
	& Residual scale $\alpha$ (init) & 0.1 (learnable) \\
	& Layers & Conv$(7{\to}32, 1{\times}1)$$\to$DW$(3{\times}3)$$\to$Conv$(32{\to}512, 16{\times}16)$ \\
	\midrule
	\multirow{2}{*}{SpatialPatchEmbedding}
	& Patch tokens (10 s clip) & 496 \\
	& Embed dim & 512 (projected to 768) \\
	\midrule
	\multirow{5}{*}{BEATs Transformer Trunk}
	& Layers & 12 \\
	& Hidden / FFN dim & 768 / 3072 \\
	& Attention heads & 12 \\
	& Position encoding & Sinusoidal relative + GRU gating \\
	& Trunk adapter & 1-layer spectral demixer (zero-gated init) \\
	\midrule
	\multirow{8}{*}{LocalSpatialEncoder}
	& Input shape & $[B, 7, T_f, 128]$ \\
	& CNN stage 1 & Conv2d$(7{\to}64, 3{\times}3)$ + GN(8) + GELU \\
	& CNN stage 2 & Conv2d$(64{\to}128, 3{\times}3, s{=}(1,2))$ + GN(8) + GELU \\
	& CNN stage 3 & Conv2d$(128{\to}256, 3{\times}3, s{=}(1,2))$ + GN(16) + GELU \\
	& Frequency reduction & Mean over freq axis $\to [B, T_f, 256]$ \\
	& Transformer layers / heads & 2 / 4 (pre-LN, $d{=}256$) \\
	& Dropout & 0.1 \\
	& Output projection & Linear$(256 \to 768)$ \\
	\midrule
	\multirow{2}{*}{Frequency Pool + Resampler}
	& Frequency pool & $[B, 496, 768] \to [B, 62, 768]$ (mean) \\
	& Token rate (output) & 10 Hz, $T_s = 100$ \\
	\midrule
	\multirow{6}{*}{LocalSpatialCrossFuser}
	& Mode & \texttt{cross\_attn\_gated} \\
	& Layers / heads / dim & 2 / 8 / 768 \\
	& Gate bias (init) & $-2.0$ \quad ($\sigma{\approx}0.119$) \\
	& Direct gate bias (init) & $-1.5$ \quad ($\sigma{\approx}0.182$) \\
	& Readout & 1-layer Transformer + LayerNorm \\
	& Output & $[B, T_s{=}100, 768]$ \\
	\midrule
	\multirow{5}{*}{SourceQueryDecoder}
	& Track queries $K$ & 4 \\
	& Stage-1 layers & 2 (TransformerDecoder) \\
	& Stage-2 layers & 1 (per-frame refinement + LN) \\
	& Heads / FFN dim & 8 / 3072 \\
	& Output & $[B, K{=}4, T_s{=}100, 768]$ \\
	\midrule
	\multicolumn{3}{l}{\emph{Per-(track, frame) prediction heads}} \\
	\midrule
	Activity head & Structure / output & LayerNorm + Linear$(768{\to}1)$ $\to$ logit $\ell \in \mathbb{R}$ \\
	Class head & Structure / output & MLP + residual + spectral demixer $\to$ 63 logits \\
	Direction head & Structure / output & MLP$(768{\to}768{\to}3)$ + L2-norm $\to$ unit vector $\in \mathbb{R}^3$ \\
	Distance head & Structure / output & MLP$(768{\to}768{\to}1)$ + softplus $\to$ distance (m) \\
	\midrule
	\multicolumn{3}{l}{\emph{Inference and Hungarian matching}} \\
	\midrule
	Matching granularity & — & Segment-level (stable within same-active-set window) \\
	Matching cost & — & Activity + class CE + direction cosine + distance $\ell_1$ \\
	Active track selection & — & Top-$\hat{K}$ (DCASE SELD evaluator) \\
	Train-eval alignment & — & Top-$K$ rank loss aligned with top-$\hat{K}$ selection \\
	\bottomrule
	\end{tabular}
	\end{table*}


	% -----------------------------------------------------------------------------
	% Table 3: Loss + training schedule + optimizer + scale (merged 4+5+6+7)
	% -----------------------------------------------------------------------------
	\begin{table*}[htbp]
	\centering
	\caption{Spatial-BEATs v13d training configuration: loss weights, two-phase schedule, optimizer, learning rate, distributed setup, and EMA. The Top-$K$ rank activity loss (D-2) is the v13d core innovation; D-1/D-5/D-6 are stabilizers.}
	\label{tab:v13d-training}
	\small
	\begin{tabular}{lll}
	\toprule
	\textbf{Group} & \textbf{Parameter} & \textbf{Value} \\
	\midrule
	\multirow{5}{*}{Loss weights}
	& $\lambda_{\text{frame\_class}}$ & 1.0 \quad (63-way cross-entropy) \\
	& $\lambda_{\text{frame\_activity}}$ & 1.0 \quad (\textbf{Top-$K$ rank}, replaces BCE) \\
	& $\lambda_{\text{frame\_direction}}$ & 1.0 \quad ($1 - \cos(\hat{\mathbf{d}}, \mathbf{d})$) \\
	& $\lambda_{\text{frame\_distance}}$ & 1.0 \quad (smooth-$\ell_1$) \\
	& $\lambda_{\text{frame\_hemisphere}}$ & 1.0 \quad (hemisphere BCE, from v11a) \\
	\midrule
	\multirow{3}{*}{Top-$K$ rank loss (D-2)}
	& \texttt{frame\_activity\_loss\_type} & \texttt{topk\_rank} \\
	& Margin $m$ & 2.0 \\
	& BCE anchor weight & 0.1 \\
	\midrule
	\multirow{4}{*}{Two-phase schedule (D-1)}
	& LR linear warmup (ep 0--2) & spatial weight $0$, LR $0 \to 1.5{\times}10^{-5}$ \\
	& Cls-only warmup cont.\ (ep 3--7) & spatial weight $0$, cosine decay \\
	& Spatial linear ramp (ep 8--9) & spatial weight $0 \to 1$, cosine decay \\
	& Full joint training (ep 10--24) & spatial weight $1$, cosine $\to 7.5{\times}10^{-7}$ \\
	\midrule
	\multirow{5}{*}{Optimizer}
	& Optimizer & AdamW \\
	& $(\beta_1, \beta_2)$ & $(0.9, 0.999)$ \\
	& $\varepsilon$ & $10^{-8}$ \\
	& Weight decay & 0.01 \\
	& Gradient clipping (global $\ell_2$) & 1.0 \\
	\midrule
	\multirow{5}{*}{LR schedule}
	& LR schedule & Linear warmup + cosine decay \\
	& Peak LR & $1.5 \times 10^{-5}$ \\
	& LR warmup epochs & 3 \\
	& Cosine decay epochs & 22 \\
	& Min LR ratio & 0.05 ($\to 7.5\times 10^{-7}$) \\
	\midrule
	\multirow{6}{*}{Training scale}
	& Total epochs & 25 \\
	& GPUs & 8 $\times$ A100 \\
	& Batch size (per GPU / total) & 8 / 64 \\
	& Numerical precision & fp32 \\
	& Data loader workers / GPU & 8 \\
	& Distributed framework & torchrun + DDP \\
	\midrule
	\multirow{2}{*}{Hot-start (D-5)}
	& Resume checkpoint & v12 best.pt (strict=False, missing/unexpected = 0) \\
	& Resume optimizer state & True (Adam moments preserved from v12) \\
	\midrule
	\multirow{4}{*}{EMA (D-6)}
	& \texttt{use\_ema} & True \\
	& EMA decay & 0.9995 \\
	& EMA start epoch & 3 \\
	& EMA application & Validation \& best.pt; restored before next training step \\
	\bottomrule
	\end{tabular}
	\end{table*}

	\noindent The Top-$K$ rank activity loss is
	\begin{equation}
	\mathcal{L}_{\text{rank}} = \frac{1}{\lvert P \rvert} \sum_{(i,j) \in P} \max\!\left(0,\; m + \ell_j - \ell_i\right),\qquad
	\mathcal{L}_{\text{act}} = \mathcal{L}_{\text{rank}} + 0.1 \cdot \mathcal{L}_{\text{BCE}},
	\label{eq:topk-rank}
	\end{equation}
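	where $P$ is the set of (active slot $i$, inactive slot $j$) pairs within each frame, $\ell_i$ and $\ell_j$ are the corresponding activity-head logits, $m = 2.0$ is the margin, and $0.1$ is the BCE anchor weight (Table~\ref{tab:v13d-training}).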


	% -----------------------------------------------------------------------------
	% Table 4: Dataset
	% -----------------------------------------------------------------------------
	\begin{table*}[htbp]
	\centering
	\caption{Training and validation data composition.}
	\label{tab:v13d-data}
	\small
	\begin{tabular}{lrl}
	\toprule
	\textbf{Split} & \textbf{Clips} & \textbf{Notes} \\
	\midrule
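	% NOTE(review): the three sub-splits below sum to 398 K (304 + 74 + 20), not ~329 K -- confirm which figure is correct.
	Training (\texttt{unified\_spatial\_foa\_fsd63\_all/train.jsonl}) & $\sim$329 K & — \\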
	\quad sim\_static & 304 K & Simulated static sources \\
	\quad qa\_sim & 74 K & QA-derived simulated \\
	\quad dcase\_real & 20 K & DCASE real recordings \\
	Manifest replication & $(1,)$ & No real-data oversampling in v13d \\
	Vocabulary & 63 classes & FSD50K-derived (\texttt{final\_vocabulary.csv}) \\
	\midrule
	Validation & $\sim$35 K + ov\{1,2,3\} sim/real + DCASE STARSS valid & Multi-subset evaluation \\
	\bottomrule
	\end{tabular}
	\end{table*}


	% -----------------------------------------------------------------------------
	% Table 5: Training trajectory (representative checkpoints)
	% -----------------------------------------------------------------------------
	\begin{table*}[htbp]
	\centering
	\caption{Representative v13d training trajectory. Note the expected F20 dip during cls-only warmup (ep 1--7), followed by the +106\% jump ($0.193 \to 0.397$) when spatial loss activates at ep 8.}
	\label{tab:v13d-trajectory}
	\small
	\begin{tabular}{lccc}
	\toprule
	\textbf{Epoch} & \textbf{F20} & \textbf{oracle\_cls} & \textbf{azi MAE} \\
	\midrule
	0 & 0.311 & 0.650 & 28.6$^\circ$ \\
	7 (end of cls warmup) & 0.193 & 0.786 & 31.0$^\circ$ \\
	8 (spatial loss activates) & 0.397 & 0.876 & 18.5$^\circ$ \\
	10 (current best) & 0.402 & 0.864 & 17.2$^\circ$ \\
	24 (final, projected) & 0.43--0.46 & $\sim$0.88 & 17--19$^\circ$ \\
	\midrule
	v12 best (reference) & 0.378 & 0.834 & 19.7$^\circ$ \\
	\bottomrule
	\end{tabular}
	\end{table*}