% Summary notes: Erdogan, Tonin, Cevher (EPFL, 2025), "Efficient LLM Inference with Neural Block Linearization (NBL)".
\documentclass{article}
\usepackage{amsmath, amssymb}
\usepackage[ruled,vlined]{algorithm2e}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage{setspace}
\usepackage{graphicx}
\usepackage[UTF8]{ctex} % CJK support: document body contains Chinese text
\usepackage[hidelinks]{hyperref}
\title{Efficient Large Language Model Inference with Neural Block Linearization (NBL)}
\author{Mete Erdogan, Francesco Tonin, Volkan Cevher \\ EPFL, 2025}
\date{}
\begin{document}
\maketitle
\doublespacing
\section*{1. Motivation}
Transformer 的 Multi-Head Self-Attention (MHSA) 在推理阶段具有二次复杂度:
\begin{equation}
O(n^2 d)
\end{equation}
其中 $n$ 为上下文长度,$d$ 为隐藏维度。
NBL 的核心思想:某些注意力层在输入输出关系上高度线性,可用一次线性变换近似。
因此可用一个线性层(矩阵乘 + 偏置加)替代整个 attention。
\section*{2. Method Overview}
\subsection*{Step 1. 核心替换公式}
原 Attention 层:
\begin{equation}
Y = \operatorname{softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right)V
\end{equation}
其中 $ Q = XW_Q, K = XW_K, V = XW_V $。
NBL 用线性层替代:
\begin{equation}
\hat{Y} = W X + b
\end{equation}
其中 $W, b$ 通过 \textbf{线性最小均方误差 (LMMSE)} 闭式求解。
\subsection*{Step 2. 数据采样(Calibration Data)}
采样少量无标注文本序列(如来自 C4 或 WikiText-2);
通常取 256 条文本,长度约 128–512;
对每层记录输入输出激活:
\begin{equation}
X_k^{(i)}, Y_k^{(i)} \quad \text{for } i=1,\dots,s
\end{equation}
叠加形成:
\begin{equation}
X \in \mathbb{R}^{(s\cdot t)\times d_{in}}, \quad Y \in \mathbb{R}^{(s\cdot t)\times d_{out}}
\end{equation}
\subsection*{Step 3. LMMSE 求解线性映射}
最小化均方误差:
\begin{equation}
\min_{W,b} \mathbb{E}\|Y - WX - b\|_2^2
\end{equation}
闭式解为:
\begin{equation}
W = C_{YX} C_{XX}^{-1}, \quad b = \mathbb{E}[Y] - W\mathbb{E}[X]
\end{equation}
\subsection*{Step 4. 层选择标准(CCA-Based Criterion)}
计算输入输出典型相关系数 $\rho_1, \rho_2, \dots, \rho_r$,用于衡量线性对齐程度:
\begin{equation}
\text{NMSE}(Y, \hat{Y}) \le (d_{out}-r) + \sum_{i=1}^r (1 - \rho_i^2)
\end{equation}
选择 NMSE 最低的 $m$ 层作为替换目标。
\subsection*{Step 5. 替换算法(Algorithm 1)}
见下方 Algorithm~\ref{alg:nbl}.
\begin{algorithm}[H]
\caption{Neural Block Linearization (NBL)}
\label{alg:nbl}
\KwIn{Attention layer inputs $X$ and outputs $Y$ from calibration dataset $\mathcal{D}$, number of layers to linearize $m$.}
\KwOut{Compressed LLM with $m$ linearized attention layers.}
\textbf{Initialization:} Load the pre-trained LLM, extract all attention layers $\mathcal{A} = \{A_1, A_2, \dots, A_K\}$.\;
\For{each attention layer $A_k \in \mathcal{A}$}{
Collect input-output pairs $(X_k, Y_k)$ from $\mathcal{D}$ corresponding to layer $A_k$.\;
Compute the bound on the linearization NMSE (Theorem 3.2) using canonical correlations $\rho_{k,i}$ between $Y_k$ and $X_k$:
\[
\mathrm{NMSE}_k = \frac{\mathrm{MSE}(Y_k, \hat{Y}_k)}{\mathrm{Tr}(C_{Y_k Y_k})}
\le \sum_i (1 - \rho_{k,i}^2)
\]\;
}
Select $m$ layers with the lowest NMSE bounds: $\mathcal{A}_{\text{low}} = \{A_{j_1}, \dots, A_{j_m}\} \subset \mathcal{A}$.\;
\For{each layer $A_j \in \mathcal{A}_{\text{low}}$}{
Replace $A_j$ with its linear approximation (LMMSE linear layer):
\[
W_j = C_{Y_j X_j} C_{X_j X_j}^{-1}, \quad
b_j = \mathbb{E}[Y_j] - W_j \mathbb{E}[X_j].
\]
}
\end{algorithm}
\begin{algorithm}[H]
\caption{NBL Weight/Bias and CCA Bound Computation for a Given Self-Attention Layer}
\label{alg:cca-bound}
\KwIn{$X$ (attention layer input), $Y$ (attention layer output).}
\KwOut{Weight matrix $W$, bias $b$, and CCA-bound.}
$Y_+ \gets Y + X$ \tcp*{Residual connection output}
\textbf{Mean Computations:} $\mathbb{E}[X] \gets \mathrm{Mean}(X)$,
$\mathbb{E}[Y] \gets \mathrm{Mean}(Y)$,
$\mathbb{E}[Y_+] \gets \mathrm{Mean}(Y_+)$.\;
\textbf{Cross-Covariance:} $C_{Y X} \gets \mathrm{Cov}(Y, X)$,
$C_{Y_+ X} \gets \mathrm{Cov}(Y_+, X)$.\;
\textbf{Covariance Matrices:}
$C_{X X} \gets \mathrm{Cov}(X, X)$,
$C_{Y_+ Y_+} \gets \mathrm{Cov}(Y_+, Y_+)$.\;
\textbf{Eigen-Decomposition:}
$(\Lambda_{Y_+ Y_+}, V_{Y_+ Y_+}) \gets \mathrm{Eigh}(C_{Y_+ Y_+})$,
$(\Lambda_{X X}, V_{X X}) \gets \mathrm{Eigh}(C_{X X})$.\;
\textbf{Inverse Square Roots:}
$C_{Y_+ Y_+}^{-1/2} \gets V_{Y_+ Y_+} \, \mathrm{diag}(\Lambda_{Y_+ Y_+}^{-1/2}) \, V_{Y_+ Y_+}^\top$,
$C_{X X}^{-1/2} \gets V_{X X} \, \mathrm{diag}(\Lambda_{X X}^{-1/2}) \, V_{X X}^\top$.\;
\textbf{Correlation Matrix:}
$C_W \gets C_{Y_+ Y_+}^{-1/2} \, C_{Y_+ X} \, C_{X X}^{-1/2}$.\;
\textbf{Singular Value Decomposition (SVD):}
$(U, S, V) \gets \mathrm{SVD}(C_W)$;
$\rho_i \gets \mathrm{diag}(S)$;
CCA-bound $\gets \sum_i (1 - \rho_i^2)$.\;
\textbf{Weight Matrix and Bias:}
$W \gets C_{Y X} C_{X X}^{-1}$,
$b \gets \mathbb{E}[Y] - W \mathbb{E}[X]$.\;
\end{algorithm}
\end{document}
% end of file