\documentclass{article}
\usepackage{amsmath, amssymb}
\usepackage[ruled,vlined]{algorithm2e}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage{setspace}
\usepackage{graphicx}
\usepackage[UTF8]{ctex}% required: the document body contains Chinese text
\usepackage[hidelinks]{hyperref}% hyperref loads last
\title{Efficient Large Language Model Inference with Neural Block Linearization (NBL)}
\author{Mete Erdogan, Francesco Tonin, Volkan Cevher \\ EPFL, 2025}
\date{}
\begin{document}
\maketitle
\doublespacing

\section*{1. Motivation}
Transformer 的 Multi-Head Self-Attention (MHSA) 在推理阶段具有二次复杂度:
\begin{equation}
O(n^2 d),
\end{equation}
其中 $n$ 为上下文长度,$d$ 为隐藏维度。
NBL 的核心思想:某些注意力层在输入输出关系上高度线性,可用一次线性变换近似。
因此可用一个线性层(矩阵乘 + 偏置加)替代整个 attention 层。

\section*{2. Method Overview}

\subsection*{Step 1. 核心替换公式}
原 Attention 层:
\begin{equation}
Y = \operatorname{softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right)V,
\end{equation}
其中 $Q = XW_Q$, $K = XW_K$, $V = XW_V$。
NBL 用线性层替代:
\begin{equation}
\hat{Y} = W X + b,
\end{equation}
其中 $W, b$ 通过\textbf{线性最小均方误差 (LMMSE)} 闭式求解。

\subsection*{Step 2. 数据采样(Calibration Data)}
采样少量无标注文本序列(如来自 C4 或 WikiText-2);
通常取 256 条文本,长度约 128--512;
对每层记录输入输出激活:
\begin{equation}
X_k^{(i)},\; Y_k^{(i)} \quad \text{for } i=1,\dots,s,
\end{equation}
叠加形成:
\begin{equation}
X \in \mathbb{R}^{(s\cdot t)\times d_{\mathrm{in}}}, \quad
Y \in \mathbb{R}^{(s\cdot t)\times d_{\mathrm{out}}}.
\end{equation}

\subsection*{Step 3. LMMSE 求解线性映射}
最小化均方误差:
\begin{equation}
\min_{W,b}\; \mathbb{E}\|Y - WX - b\|_2^2,
\end{equation}
闭式解为:
\begin{equation}
W = C_{YX} C_{XX}^{-1}, \quad b = \mathbb{E}[Y] - W\mathbb{E}[X].
\end{equation}

\subsection*{Step 4. 层选择标准(CCA-Based Criterion)}
计算输入输出典型相关系数 $\rho_1, \rho_2, \dots, \rho_r$,用于衡量线性对齐程度:
\begin{equation}
\mathrm{NMSE}(Y, \hat{Y}) \le (h_{\mathrm{out}}-r) + \sum_{i=1}^r (1 - \rho_i^2),
\end{equation}
选择 NMSE 上界最低的 $m$ 层作为替换目标。

\subsection*{Step 5. 替换算法(Algorithm 1)}
见下方 Algorithm~\ref{alg:nbl}.
\begin{algorithm}[H]
\caption{Neural Block Linearization (NBL)}
\label{alg:nbl}
\KwIn{Attention layer inputs $X$ and outputs $Y$ from calibration dataset $\mathcal{D}$, number of layers to linearize $m$.}
\KwOut{Compressed LLM with $m$ linearized attention layers.}
\textbf{Initialization:} Load the pre-trained LLM, extract all attention layers $\mathcal{A} = \{A_1, A_2, \dots, A_K\}$.\;
\For{each attention layer $A_k \in \mathcal{A}$}{
  Collect input-output pairs $(X_k, Y_k)$ from $\mathcal{D}$ corresponding to layer $A_k$.\;
  Compute the bound on the linearization NMSE (Theorem 3.2) using canonical correlations $\rho_{k,i}$ between $Y_k$ and $X_k$:
  \[
    \mathrm{NMSE}_k = \frac{\mathrm{MSE}(Y_k, \hat{Y}_k)}{\operatorname{Tr}(C_{Y_k Y_k})} \le \sum_i (1 - \rho_{k,i}^2).
  \]\;
}
Select the $m$ layers with the lowest NMSE bounds: $\mathcal{A}_{\text{low}} = \{A_{j_1}, \dots, A_{j_m}\} \subset \mathcal{A}$.\;
\For{each layer $A_j \in \mathcal{A}_{\text{low}}$}{
  Replace $A_j$ with its linear approximation (LMMSE linear layer):
  \[
    W_j = C_{Y_j X_j} C_{X_j X_j}^{-1}, \quad
    b_j = \mathbb{E}[Y_j] - W_j \mathbb{E}[X_j].
  \]
}
\end{algorithm}

\begin{algorithm}[H]
\caption{NBL Weight/Bias and CCA Bound Computation for a Given Self-Attention Layer}
\label{alg:cca-bound}
\KwIn{$X$ (attention layer input), $Y$ (attention layer output).}
\KwOut{Weight matrix $W$, bias $b$, and CCA bound.}
$Y_+ \gets Y + X$ \tcp*{Residual connection output}
\textbf{Mean Computations:} $\mathbb{E}[X] \gets \mathrm{Mean}(X)$, $\mathbb{E}[Y] \gets \mathrm{Mean}(Y)$, $\mathbb{E}[Y_+] \gets \mathrm{Mean}(Y_+)$.\;
\textbf{Cross-Covariance:} $C_{Y X} \gets \mathrm{Cov}(Y, X)$, $C_{Y_+ X} \gets \mathrm{Cov}(Y_+, X)$.\;
\textbf{Covariance Matrices:} $C_{X X} \gets \mathrm{Cov}(X, X)$, $C_{Y_+ Y_+} \gets \mathrm{Cov}(Y_+, Y_+)$.\;
\textbf{Eigen-Decomposition:} $(\Lambda_{Y_+ Y_+}, V_{Y_+ Y_+}) \gets \mathrm{Eigh}(C_{Y_+ Y_+})$, $(\Lambda_{X X}, V_{X X}) \gets \mathrm{Eigh}(C_{X X})$.\;
\textbf{Inverse Square Roots:} $C_{Y_+ Y_+}^{-1/2} \gets V_{Y_+ Y_+} \, \mathrm{diag}(\Lambda_{Y_+ Y_+}^{-1/2}) \, V_{Y_+ Y_+}^\top$, $C_{X X}^{-1/2} \gets V_{X X} \, \mathrm{diag}(\Lambda_{X X}^{-1/2}) \, V_{X X}^\top$.\;
\textbf{Correlation Matrix:} $C_W \gets C_{Y_+ Y_+}^{-1/2} \, C_{Y_+ X} \, C_{X X}^{-1/2}$.\;
\textbf{Singular Value Decomposition (SVD):} $(U, S, V) \gets \mathrm{SVD}(C_W)$; $\rho_i \gets S_{ii}$; CCA bound $\gets \sum_i (1 - \rho_i^2)$.\;
\textbf{Weight Matrix and Bias:} $W \gets C_{Y X} C_{X X}^{-1}$, $b \gets \mathbb{E}[Y] - W \mathbb{E}[X]$.\;
\end{algorithm}
\end{document}