% nbl_try / LLM-Drop / NBL_algorithm.tex
% s1ghhh's picture
% Upload folder using huggingface_hub
% d73500e verified
% NOTE(review): the four lines above are repository-page metadata that leaked
% into the source; nothing may precede \documentclass, so they are commented out.
\documentclass{article}
\usepackage{amsmath, amssymb}
\usepackage[ruled,vlined]{algorithm2e}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage{setspace}
\usepackage{graphicx}
\usepackage{hyperref}
\title{Efficient Large Language Model Inference with Neural Block Linearization (NBL)}
\author{Mete Erdogan, Francesco Tonin, Volkan Cevher \\ EPFL, 2025}
\date{}
\begin{document}
\maketitle
\doublespacing
\section*{1. Motivation}
Transformer 的 Multi-Head Self-Attention (MHSA) 在推理阶段具有二次复杂度:
\begin{equation}
O(n^2 d)
\end{equation}
其中 $n$ 为上下文长度,$d$ 为隐藏维度。
NBL 的核心思想:某些注意力层在输入输出关系上高度线性,可用一次线性变换近似。
因此可用一个线性层(矩阵乘 + 偏置加)替代整个 attention。
\section*{2. Method Overview}
\subsection*{Step 1. 核心替换公式}
原 Attention 层:
\begin{equation}
Y = \operatorname{softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right)V
\end{equation}
其中 $Q = XW_Q$, $K = XW_K$, $V = XW_V$。
NBL 用线性层替代:
\begin{equation}
\hat{Y} = W X + b
\end{equation}
其中 $W, b$ 通过 \textbf{线性最小均方误差 (LMMSE)} 闭式求解。
\subsection*{Step 2. 数据采样(Calibration Data)}
采样少量无标注文本序列(如来自 C4 或 WikiText-2);
通常取 256 条文本,长度约 128–512;
对每层记录输入输出激活:
\begin{equation}
X_k^{(i)}, Y_k^{(i)} \quad \text{for } i=1,\dots,s
\end{equation}
叠加形成:
\begin{equation}
X \in \mathbb{R}^{(s\cdot t)\times d_{\mathrm{in}}}, \quad Y \in \mathbb{R}^{(s\cdot t)\times d_{\mathrm{out}}}
\end{equation}
\subsection*{Step 3. LMMSE 求解线性映射}
最小化均方误差:
\begin{equation}
\min_{W,b} \; \mathbb{E}\left[\|Y - WX - b\|_2^2\right]
\end{equation}
闭式解为:
\begin{equation}
W = C_{YX} C_{XX}^{-1}, \quad b = \mathbb{E}[Y] - W\mathbb{E}[X]
\end{equation}
\subsection*{Step 4. 层选择标准(CCA-Based Criterion)}
计算输入输出典型相关系数 $\rho_1, \rho_2, \dots, \rho_r$,用于衡量线性对齐程度:
\begin{equation}
\mathrm{NMSE}(Y, \hat{Y}) \le (h_{\mathrm{out}}-r) + \sum_{i=1}^r (1 - \rho_i^2)
\end{equation}
选择 NMSE 最低的 $m$ 层作为替换目标。
\subsection*{Step 5. 替换算法(Algorithm 1)}
见下方 Algorithm~\ref{alg:nbl}.
\begin{algorithm}[H]
\caption{Neural Block Linearization (NBL)}
\label{alg:nbl}
\KwIn{Attention layer inputs $X$ and outputs $Y$ from calibration dataset $\mathcal{D}$, number of layers to linearize $m$.}
\KwOut{Compressed LLM with $m$ linearized attention layers.}
\textbf{Initialization:} Load the pre-trained LLM, extract all attention layers $\mathcal{A} = \{A_1, A_2, \dots, A_K\}$.\;
\For{each attention layer $A_k \in \mathcal{A}$}{
Collect input-output pairs $(X_k, Y_k)$ from $\mathcal{D}$ corresponding to layer $A_k$.\;
Compute the bound on the linearization NMSE (Theorem 3.2) using canonical correlations $\rho_{k,i}$ between $Y_k$ and $X_k$:
\[
\mathrm{NMSE}_k = \frac{\mathrm{MSE}(Y_k, \hat{Y}_k)}{\mathrm{Tr}(C_{Y_k Y_k})}
\le (h_{\mathrm{out}} - r) + \sum_{i=1}^{r} (1 - \rho_{k,i}^2)
\]\;
}
Select $m$ layers with the lowest NMSE bounds: $\mathcal{A}_{\text{low}} = \{A_{j_1}, \dots, A_{j_m}\} \subset \mathcal{A}$.\;
\For{each layer $A_j \in \mathcal{A}_{\text{low}}$}{
Replace $A_j$ with its linear approximation (LMMSE linear layer):
\[
W_j = C_{Y_j X_j} C_{X_j X_j}^{-1}, \quad
b_j = \mathbb{E}[Y_j] - W_j \mathbb{E}[X_j].
\]
}
\end{algorithm}
\begin{algorithm}[H]
\caption{NBL Weight/Bias and CCA Bound Computation for a Given Self-Attention Layer}
\label{alg:cca-bound}
\KwIn{$X$ (attention layer input), $Y$ (attention layer output).}
\KwOut{Weight matrix $W$, bias $b$, and CCA-bound.}
$Y_+ \gets Y + X$ \tcp*{Residual connection output}
\textbf{Mean Computations:} $\mathbb{E}[X] \gets \mathrm{Mean}(X)$,
$\mathbb{E}[Y] \gets \mathrm{Mean}(Y)$,
$\mathbb{E}[Y_+] \gets \mathrm{Mean}(Y_+)$.\;
\textbf{Cross-Covariance:} $C_{Y X} \gets \mathrm{Cov}(Y, X)$,
$C_{Y_+ X} \gets \mathrm{Cov}(Y_+, X)$.\;
\textbf{Covariance Matrices:}
$C_{X X} \gets \mathrm{Cov}(X, X)$,
$C_{Y_+ Y_+} \gets \mathrm{Cov}(Y_+, Y_+)$.\;
\textbf{Eigen-Decomposition:}
$(\Lambda_{Y_+ Y_+}, V_{Y_+ Y_+}) \gets \mathrm{Eigh}(C_{Y_+ Y_+})$,
$(\Lambda_{X X}, V_{X X}) \gets \mathrm{Eigh}(C_{X X})$.\;
\textbf{Inverse Square Roots:}
$C_{Y_+ Y_+}^{-1/2} \gets V_{Y_+ Y_+} \, \mathrm{diag}(\Lambda_{Y_+ Y_+}^{-1/2}) \, V_{Y_+ Y_+}^\top$,
$C_{X X}^{-1/2} \gets V_{X X} \, \mathrm{diag}(\Lambda_{X X}^{-1/2}) \, V_{X X}^\top$.\;
\textbf{Correlation Matrix:}
$C_W \gets C_{Y_+ Y_+}^{-1/2} \, C_{Y_+ X} \, C_{X X}^{-1/2}$.\;
\textbf{Singular Value Decomposition (SVD):}
$(U, S, V) \gets \mathrm{SVD}(C_W)$;
$\rho_i \gets \mathrm{diag}(S)$;
CCA-bound $\gets \sum_i (1 - \rho_i^2)$.\;
\textbf{Weight Matrix and Bias:}
$W \gets C_{Y X} C_{X X}^{-1}$,
$b \gets \mathbb{E}[Y] - W \mathbb{E}[X]$.\;
\end{algorithm}
\end{document}