% (removed extraction artifacts: file-size/hash header and bare line-number residue
%  that appeared before \documentclass and would prevent compilation)
\documentclass{article}
\usepackage[UTF8]{ctex} % 正文含中文,需要 CJK 支持才能编译
\usepackage{amsmath, amssymb}
\usepackage[ruled,vlined]{algorithm2e}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage{setspace}
\usepackage{graphicx}
\usepackage[hidelinks]{hyperref}

\title{Efficient Large Language Model Inference with Neural Block Linearization (NBL)}
\author{Mete Erdogan, Francesco Tonin, Volkan Cevher \\ EPFL, 2025}
\date{}

\begin{document}
\maketitle
\doublespacing

\section*{1. Motivation}
Transformer 的 Multi-Head Self-Attention (MHSA) 在推理阶段具有二次复杂度:
\begin{equation}
O(n^2 d)
\end{equation}
其中 $n$ 为上下文长度,$d$ 为隐藏维度。

NBL 的核心思想:某些注意力层的输入输出关系高度线性,可以用一次线性变换近似。
因此可以用一个线性层(矩阵乘加偏置)替代整个 attention 层。

\section*{2. Method Overview}

\subsection*{Step 1. 核心替换公式}
原 Attention 层:
\begin{equation}
Y = \operatorname{softmax}\!\left(\frac{QK^\top}{\sqrt{d_k}}\right)V
\end{equation}
其中 $Q = XW_Q$,$K = XW_K$,$V = XW_V$。

NBL 用线性层替代:
\begin{equation}
\hat{Y} = W X + b
\end{equation}
其中 $W, b$ 通过 \textbf{线性最小均方误差 (LMMSE)} 闭式求解。

\subsection*{Step 2. 数据采样(Calibration Data)}
采样少量无标注文本序列(如来自 C4 或 WikiText-2),通常取 256 条文本,长度约 128--512 个 token。
对每层记录输入输出激活:
\begin{equation}
X_k^{(i)}, Y_k^{(i)} \quad \text{for } i=1,\dots,s
\end{equation}
叠加形成:
\begin{equation}
X \in \mathbb{R}^{(s\cdot t)\times d_{in}}, \quad Y \in \mathbb{R}^{(s\cdot t)\times d_{out}}
\end{equation}

\subsection*{Step 3. LMMSE 求解线性映射}
最小化均方误差:
\begin{equation}
\min_{W,b} \mathbb{E}\|Y - WX - b\|_2^2
\end{equation}
闭式解为:
\begin{equation}
W = C_{YX} C_{XX}^{-1}, \quad b = \mathbb{E}[Y] - W\mathbb{E}[X]
\end{equation}

\subsection*{Step 4. 层选择标准(CCA-Based Criterion)}
计算输入输出的典型相关系数 $\rho_1, \rho_2, \dots, \rho_r$(其中 $h_{out}$ 为输出维度,$r \le h_{out}$ 为典型相关系数的个数),用于衡量线性对齐程度:
\begin{equation}
\operatorname{NMSE}(Y, \hat{Y}) \le (h_{out}-r) + \sum_{i=1}^r (1 - \rho_i^2)
\end{equation}
当 $r = h_{out}$ 时,该上界退化为 Algorithm~\ref{alg:nbl} 中的形式 $\sum_{i=1}^r (1-\rho_i^2)$。选择 NMSE 上界最低的 $m$ 层作为替换目标。

\subsection*{Step 5. 替换算法(Algorithm 1)}
见下方 Algorithm~\ref{alg:nbl}.

\begin{algorithm}[H]
\caption{Neural Block Linearization (NBL)}
\label{alg:nbl}
\KwIn{Attention layer inputs $X$ and outputs $Y$ from calibration dataset $\mathcal{D}$, number of layers to linearize $m$.}
\KwOut{Compressed LLM with $m$ linearized attention layers.}

\textbf{Initialization:} Load the pre-trained LLM, extract all attention layers $\mathcal{A} = \{A_1, A_2, \dots, A_K\}$.\;

\For{each attention layer $A_k \in \mathcal{A}$}{
  Collect input-output pairs $(X_k, Y_k)$ from $\mathcal{D}$ corresponding to layer $A_k$.\;
  Compute the bound on the linearization NMSE (Theorem 3.2) using canonical correlations $\rho_{k,i}$ between $Y_k$ and $X_k$:
  \[
  \mathrm{NMSE}_k = \frac{\mathrm{MSE}(Y_k, \hat{Y}_k)}{\mathrm{Tr}(C_{Y_k Y_k})}
  \le \sum_i (1 - \rho_{k,i}^2)
  \]\;
}
Select $m$ layers with the lowest NMSE bounds: $\mathcal{A}_{\text{low}} = \{A_{j_1}, \dots, A_{j_m}\} \subset \mathcal{A}$.\;
\For{each layer $A_j \in \mathcal{A}_{\text{low}}$}{
  Replace $A_j$ with its linear approximation (LMMSE linear layer):
  \[
  W_j = C_{Y_j X_j} C_{X_j X_j}^{-1}, \quad
  b_j = \mathbb{E}[Y_j] - W_j \mathbb{E}[X_j].
  \]
}
\end{algorithm}

\begin{algorithm}[H]
\caption{NBL Weight/Bias and CCA Bound Computation for a Given Self-Attention Layer}
\label{alg:cca-bound}
\KwIn{$X$ (attention layer input), $Y$ (attention layer output).}
\KwOut{Weight matrix $W$, bias $b$, and CCA-bound.}

$Y_+ \gets Y + X$ \tcp*{Residual connection output}

\textbf{Mean Computations:} $\mathbb{E}[X] \gets \mathrm{Mean}(X)$, 
$\mathbb{E}[Y] \gets \mathrm{Mean}(Y)$, 
$\mathbb{E}[Y_+] \gets \mathrm{Mean}(Y_+)$.\;

\textbf{Cross-Covariance:} $C_{Y X} \gets \mathrm{Cov}(Y, X)$, 
$C_{Y_+ X} \gets \mathrm{Cov}(Y_+, X)$.\;

\textbf{Covariance Matrices:} 
$C_{X X} \gets \mathrm{Cov}(X, X)$, 
$C_{Y_+ Y_+} \gets \mathrm{Cov}(Y_+, Y_+)$.\;

\textbf{Eigen-Decomposition:} 
$(\Lambda_{Y_+ Y_+}, V_{Y_+ Y_+}) \gets \mathrm{Eigh}(C_{Y_+ Y_+})$, 
$(\Lambda_{X X}, V_{X X}) \gets \mathrm{Eigh}(C_{X X})$.\;

\textbf{Inverse Square Roots:} 
$C_{Y_+ Y_+}^{-1/2} \gets V_{Y_+ Y_+} \, \mathrm{diag}(\Lambda_{Y_+ Y_+}^{-1/2}) \, V_{Y_+ Y_+}^\top$, 
$C_{X X}^{-1/2} \gets V_{X X} \, \mathrm{diag}(\Lambda_{X X}^{-1/2}) \, V_{X X}^\top$.\;

\textbf{Correlation Matrix:} 
$C_W \gets C_{Y_+ Y_+}^{-1/2} \, C_{Y_+ X} \, C_{X X}^{-1/2}$.\;

\textbf{Singular Value Decomposition (SVD):} 
$(U, S, V) \gets \mathrm{SVD}(C_W)$; 
$\rho_i \gets S_{ii}$ \tcp*{singular values are the canonical correlations}
CCA-bound $\gets \sum_i (1 - \rho_i^2)$.\;

\textbf{Weight Matrix and Bias:} 
$W \gets C_{Y X} C_{X X}^{-1}$, 
$b \gets \mathbb{E}[Y] - W \mathbb{E}[X]$.\;

\end{algorithm}

\end{document}