\documentclass{article}
\usepackage{amsmath, amssymb}
\usepackage[ruled,vlined]{algorithm2e}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage{setspace}
\usepackage{graphicx}
\usepackage[hidelinks]{hyperref}
\title{Efficient Large Language Model Inference with Neural Block Linearization (NBL)}
\author{Mete Erdogan, Francesco Tonin, Volkan Cevher \\ EPFL, 2025}
\date{}
\begin{document}
\maketitle
\doublespacing
\section*{1. Motivation}
Transformer 的 Multi-Head Self-Attention (MHSA) 在推理阶段具有二次复杂度:
\begin{equation}
\mathcal{O}(n^2 d)
\end{equation}
其中 $n$ 为上下文长度,$d$ 为隐藏维度。
NBL 的核心思想:某些注意力层在输入输出关系上高度线性,可用一次线性变换近似。
因此可用一个线性层(矩阵乘 + 偏置加)替代整个 attention。
\section*{2. Method Overview}
\subsection*{Step 1. 核心替换公式}
原 Attention 层:
\begin{equation}
Y = \operatorname{softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right)V
\end{equation}
其中 $Q = XW_Q$,$K = XW_K$,$V = XW_V$。
NBL 用线性层替代:
\begin{equation}
\hat{Y} = W X + b
\end{equation}
其中 $W, b$ 通过 \textbf{线性最小均方误差 (LMMSE)} 闭式求解。
\subsection*{Step 2. 数据采样(Calibration Data)}
采样少量无标注文本序列(如来自 C4 或 WikiText-2);
通常取 256 条文本,长度约 128--512;
对每层记录输入输出激活:
\begin{equation}
X_k^{(i)}, Y_k^{(i)} \quad \text{for } i=1,\dots,s
\end{equation}
叠加形成:
\begin{equation}
X \in \mathbb{R}^{(s\cdot t)\times d_{in}}, \quad Y \in \mathbb{R}^{(s\cdot t)\times d_{out}}
\end{equation}
\subsection*{Step 3. LMMSE 求解线性映射}
最小化均方误差:
\begin{equation}
\min_{W,b} \mathbb{E}\|Y - WX - b\|_2^2
\end{equation}
闭式解为:
\begin{equation}
W = C_{YX} C_{XX}^{-1}, \quad b = \mathbb{E}[Y] - W\mathbb{E}[X]
\end{equation}
\subsection*{Step 4. 层选择标准(CCA-Based Criterion)}
计算输入输出典型相关系数 $\rho_1, \rho_2, \dots, \rho_r$,用于衡量线性对齐程度:
\begin{equation}
\text{NMSE}(Y, \hat{Y}) \le (h_{out}-r) + \sum_{i=1}^r (1 - \rho_i^2)
\end{equation}
选择 NMSE 最低的 $m$ 层作为替换目标。
\subsection*{Step 5. 替换算法(Algorithm 1)}
见下方 Algorithm~\ref{alg:nbl}。
\begin{algorithm}[H]
\caption{Neural Block Linearization (NBL)}
\label{alg:nbl}
\KwIn{Attention layer inputs $X$ and outputs $Y$ from calibration dataset $\mathcal{D}$, number of layers to linearize $m$.}
\KwOut{Compressed LLM with $m$ linearized attention layers.}
\textbf{Initialization:} Load the pre-trained LLM, extract all attention layers $\mathcal{A} = \{A_1, A_2, \dots, A_K\}$\;
\For{each attention layer $A_k \in \mathcal{A}$}{
  Collect input-output pairs $(X_k, Y_k)$ from $\mathcal{D}$ corresponding to layer $A_k$\;
  Compute the bound on the linearization NMSE (Theorem 3.2) using canonical correlations $\rho_{k,i}$ between $Y_k$ and $X_k$:
  \[
  \mathrm{NMSE}_k = \frac{\mathrm{MSE}(Y_k, \hat{Y}_k)}{\mathrm{Tr}(C_{Y_k Y_k})}
  \le \sum_i (1 - \rho_{k,i}^2)
  \]
}
Select $m$ layers with the lowest NMSE bounds: $\mathcal{A}_{\text{low}} = \{A_{j_1}, \dots, A_{j_m}\} \subset \mathcal{A}$\;
\For{each layer $A_j \in \mathcal{A}_{\text{low}}$}{
  Replace $A_j$ with its linear approximation (LMMSE linear layer):
  \[
  W_j = C_{Y_j X_j} C_{X_j X_j}^{-1}, \quad
  b_j = \mathbb{E}[Y_j] - W_j \mathbb{E}[X_j]
  \]
}
\end{algorithm}
\begin{algorithm}[H]
\caption{NBL Weight/Bias and CCA Bound Computation for a Given Self-Attention Layer}
\label{alg:cca-bound}
\KwIn{$X$ (attention layer input), $Y$ (attention layer output).}
\KwOut{Weight matrix $W$, bias $b$, and CCA-bound.}
$Y_+ \gets Y + X$ \tcp*{Residual connection output}
\textbf{Mean Computations:} $\mathbb{E}[X] \gets \mathrm{Mean}(X)$,
$\mathbb{E}[Y] \gets \mathrm{Mean}(Y)$,
$\mathbb{E}[Y_+] \gets \mathrm{Mean}(Y_+)$\;
\textbf{Cross-Covariance:} $C_{Y X} \gets \mathrm{Cov}(Y, X)$,
$C_{Y_+ X} \gets \mathrm{Cov}(Y_+, X)$\;
\textbf{Covariance Matrices:}
$C_{X X} \gets \mathrm{Cov}(X, X)$,
$C_{Y_+ Y_+} \gets \mathrm{Cov}(Y_+, Y_+)$\;
\textbf{Eigen-Decomposition:}
$(\Lambda_{Y_+ Y_+}, V_{Y_+ Y_+}) \gets \mathrm{Eigh}(C_{Y_+ Y_+})$,
$(\Lambda_{X X}, V_{X X}) \gets \mathrm{Eigh}(C_{X X})$\;
\textbf{Inverse Square Roots:}
$C_{Y_+ Y_+}^{-1/2} \gets V_{Y_+ Y_+} \, \mathrm{diag}(\Lambda_{Y_+ Y_+}^{-1/2}) \, V_{Y_+ Y_+}^\top$,
$C_{X X}^{-1/2} \gets V_{X X} \, \mathrm{diag}(\Lambda_{X X}^{-1/2}) \, V_{X X}^\top$\;
\textbf{Correlation Matrix:}
$C_W \gets C_{Y_+ Y_+}^{-1/2} \, C_{Y_+ X} \, C_{X X}^{-1/2}$\;
\textbf{Singular Value Decomposition (SVD):}
$(U, S, V) \gets \mathrm{SVD}(C_W)$;
$\rho_i \gets \mathrm{diag}(S)$;
CCA-bound $\gets \sum_i (1 - \rho_i^2)$\;
\textbf{Weight Matrix and Bias:}
$W \gets C_{Y X} C_{X X}^{-1}$,
$b \gets \mathbb{E}[Y] - W \mathbb{E}[X]$\;
\end{algorithm}
\end{document}