\documentclass{article}
\usepackage{amsmath, amssymb}
\usepackage[ruled,vlined]{algorithm2e}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage{setspace}
\usepackage{graphicx}
\usepackage[hidelinks]{hyperref}
\title{Efficient Large Language Model Inference with Neural Block Linearization (NBL)}
\author{Mete Erdogan, Francesco Tonin, Volkan Cevher \\ EPFL, 2025}
\date{}
\begin{document}
\maketitle
\doublespacing
\section*{1. Motivation}
Transformer 的 Multi-Head Self-Attention (MHSA) 在推理阶段具有二次复杂度:
\begin{equation}
\mathcal{O}(n^2 d)
\end{equation}
其中 $n$ 为上下文长度,$d$ 为隐藏维度。
NBL 的核心思想:某些注意力层在输入输出关系上高度线性,可用一次线性变换近似。
因此可用一个线性层(矩阵乘 + 偏置加)替代整个 attention。
\section*{2. Method Overview}
\subsection*{Step 1. 核心替换公式}
原 Attention 层:
\begin{equation}
Y = \operatorname{softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right)V
\end{equation}
其中 $Q = XW_Q$,$K = XW_K$,$V = XW_V$。
NBL 用线性层替代:
\begin{equation}
\hat{Y} = W X + b
\end{equation}
其中 $W, b$ 通过 \textbf{线性最小均方误差 (LMMSE)} 闭式求解。
\subsection*{Step 2. 数据采样(Calibration Data)}
采样少量无标注文本序列(如来自 C4 或 WikiText-2);
通常取 256 条文本,长度约 128--512;
对每层记录输入输出激活:
\begin{equation}
X_k^{(i)}, Y_k^{(i)} \quad \text{for } i=1,\dots,s
\end{equation}
叠加形成:
\begin{equation}
X \in \mathbb{R}^{(s\cdot t)\times d_{in}}, \quad Y \in \mathbb{R}^{(s\cdot t)\times d_{out}}
\end{equation}
\subsection*{Step 3. LMMSE 求解线性映射}
最小化均方误差:
\begin{equation}
\min_{W,b} \mathbb{E}\|Y - WX - b\|_2^2
\end{equation}
闭式解为:
\begin{equation}
W = C_{YX} C_{XX}^{-1}, \quad b = \mathbb{E}[Y] - W\mathbb{E}[X]
\end{equation}
\subsection*{Step 4. 层选择标准(CCA-Based Criterion)}
计算输入输出典型相关系数 $\rho_1, \rho_2, \dots, \rho_r$,用于衡量线性对齐程度:
\begin{equation}
\text{NMSE}(Y, \hat{Y}) \le (h_{out}-r) + \sum_{i=1}^r (1 - \rho_i^2)
\end{equation}
选择 NMSE 最低的 $m$ 层作为替换目标。
\subsection*{Step 5. 替换算法(Algorithm 1)}
见下方 Algorithm~\ref{alg:nbl}。
\begin{algorithm}[H]
\caption{Neural Block Linearization (NBL)}
\label{alg:nbl}
\KwIn{Attention layer inputs $X$ and outputs $Y$ from calibration dataset $\mathcal{D}$, number of layers to linearize $m$.}
\KwOut{Compressed LLM with $m$ linearized attention layers.}
\textbf{Initialization:} Load the pre-trained LLM, extract all attention layers $\mathcal{A} = \{A_1, A_2, \dots, A_K\}$\;
\For{each attention layer $A_k \in \mathcal{A}$}{
  Collect input-output pairs $(X_k, Y_k)$ from $\mathcal{D}$ corresponding to layer $A_k$\;
  Compute the bound on the linearization NMSE (Theorem 3.2) using canonical correlations $\rho_{k,i}$ between $Y_k$ and $X_k$:
  \[
  \mathrm{NMSE}_k = \frac{\mathrm{MSE}(Y_k, \hat{Y}_k)}{\mathrm{Tr}(C_{Y_k Y_k})}
  \le \sum_i (1 - \rho_{k,i}^2)
  \]
}
Select $m$ layers with the lowest NMSE bounds: $\mathcal{A}_{\text{low}} = \{A_{j_1}, \dots, A_{j_m}\} \subset \mathcal{A}$\;
\For{each layer $A_j \in \mathcal{A}_{\text{low}}$}{
  Replace $A_j$ with its linear approximation (LMMSE linear layer):
  \[
  W_j = C_{Y_j X_j} C_{X_j X_j}^{-1}, \quad
  b_j = \mathbb{E}[Y_j] - W_j \mathbb{E}[X_j]
  \]
}
\end{algorithm}
\begin{algorithm}[H]
\caption{NBL Weight/Bias and CCA Bound Computation for a Given Self-Attention Layer}
\label{alg:cca-bound}
\KwIn{$X$ (attention layer input), $Y$ (attention layer output).}
\KwOut{Weight matrix $W$, bias $b$, and CCA-bound.}
$Y_+ \gets Y + X$ \tcp*{Residual connection output}
\textbf{Mean Computations:} $\mathbb{E}[X] \gets \mathrm{Mean}(X)$,
$\mathbb{E}[Y] \gets \mathrm{Mean}(Y)$,
$\mathbb{E}[Y_+] \gets \mathrm{Mean}(Y_+)$\;
\textbf{Cross-Covariance:} $C_{Y X} \gets \mathrm{Cov}(Y, X)$,
$C_{Y_+ X} \gets \mathrm{Cov}(Y_+, X)$\;
\textbf{Covariance Matrices:}
$C_{X X} \gets \mathrm{Cov}(X, X)$,
$C_{Y_+ Y_+} \gets \mathrm{Cov}(Y_+, Y_+)$\;
\textbf{Eigen-Decomposition:}
$(\Lambda_{Y_+ Y_+}, V_{Y_+ Y_+}) \gets \mathrm{Eigh}(C_{Y_+ Y_+})$,
$(\Lambda_{X X}, V_{X X}) \gets \mathrm{Eigh}(C_{X X})$\;
\textbf{Inverse Square Roots:}
$C_{Y_+ Y_+}^{-1/2} \gets V_{Y_+ Y_+} \, \mathrm{diag}(\Lambda_{Y_+ Y_+}^{-1/2}) \, V_{Y_+ Y_+}^\top$,
$C_{X X}^{-1/2} \gets V_{X X} \, \mathrm{diag}(\Lambda_{X X}^{-1/2}) \, V_{X X}^\top$\;
\textbf{Correlation Matrix:}
$C_W \gets C_{Y_+ Y_+}^{-1/2} \, C_{Y_+ X} \, C_{X X}^{-1/2}$\;
\textbf{Singular Value Decomposition (SVD):}
$(U, S, V) \gets \mathrm{SVD}(C_W)$;
$\rho_i \gets \mathrm{diag}(S)$;
CCA-bound $\gets \sum_i (1 - \rho_i^2)$\;
\textbf{Weight Matrix and Bias:}
$W \gets C_{Y X} C_{X X}^{-1}$,
$b \gets \mathbb{E}[Y] - W \mathbb{E}[X]$\;
\end{algorithm}
\end{document}