% (removed extraction artifacts: file-size/hash header and bare line-number residue
%  that appeared before \documentclass and would prevent compilation)
\documentclass{article}
\usepackage[UTF8]{ctex} % 正文含中文,需要 CJK 支持才能编译
\usepackage{amsmath, amssymb}
\usepackage[ruled,vlined]{algorithm2e}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage{setspace}
\usepackage{graphicx}
\usepackage[hidelinks]{hyperref}

\title{Efficient Large Language Model Inference with Neural Block Linearization (NBL)}
\author{Mete Erdogan, Francesco Tonin, Volkan Cevher \\ EPFL, 2025}
\date{}

\begin{document}
\maketitle
\doublespacing

\section*{1. Motivation}
Transformer 的 Multi-Head Self-Attention (MHSA) 在推理阶段具有二次复杂度:
\begin{equation}
O(n^2 d)
\end{equation}
其中 $n$ 为上下文长度,$d$ 为隐藏维度。

NBL 的核心思想:某些注意力层的输入输出关系高度线性,可以用一次线性变换近似。
因此可以用一个线性层(矩阵乘加偏置)替代整个 attention 层。

\section*{2. Method Overview}

\subsection*{Step 1. 核心替换公式}
原 Attention 层:
\begin{equation}
Y = \operatorname{softmax}\!\left(\frac{QK^\top}{\sqrt{d_k}}\right)V
\end{equation}
其中 $Q = XW_Q$,$K = XW_K$,$V = XW_V$。

NBL 用线性层替代:
\begin{equation}
\hat{Y} = W X + b
\end{equation}
其中 $W, b$ 通过 \textbf{线性最小均方误差 (LMMSE)} 闭式求解。

\subsection*{Step 2. 数据采样(Calibration Data)}
采样少量无标注文本序列(如来自 C4 或 WikiText-2),通常取 256 条文本,长度约 128--512 个 token。
对每层记录输入输出激活:
\begin{equation}
X_k^{(i)}, Y_k^{(i)} \quad \text{for } i=1,\dots,s
\end{equation}
叠加形成:
\begin{equation}
X \in \mathbb{R}^{(s\cdot t)\times d_{in}}, \quad Y \in \mathbb{R}^{(s\cdot t)\times d_{out}}
\end{equation}

\subsection*{Step 3. LMMSE 求解线性映射}
最小化均方误差:
\begin{equation}
\min_{W,b} \mathbb{E}\|Y - WX - b\|_2^2
\end{equation}
闭式解为:
\begin{equation}
W = C_{YX} C_{XX}^{-1}, \quad b = \mathbb{E}[Y] - W\mathbb{E}[X]
\end{equation}

\subsection*{Step 4. 层选择标准(CCA-Based Criterion)}
计算输入输出的典型相关系数 $\rho_1, \rho_2, \dots, \rho_r$(其中 $h_{out}$ 为输出维度,$r \le h_{out}$ 为典型相关系数的个数),用于衡量线性对齐程度:
\begin{equation}
\operatorname{NMSE}(Y, \hat{Y}) \le (h_{out}-r) + \sum_{i=1}^r (1 - \rho_i^2)
\end{equation}
当 $r = h_{out}$ 时,该上界退化为 Algorithm~\ref{alg:nbl} 中的形式 $\sum_{i=1}^r (1-\rho_i^2)$。选择 NMSE 上界最低的 $m$ 层作为替换目标。

\subsection*{Step 5. 替换算法(Algorithm 1)}
见下方 Algorithm~\ref{alg:nbl}.

\begin{algorithm}[H]
\caption{Neural Block Linearization (NBL)}
\label{alg:nbl}
\KwIn{Attention layer inputs $X$ and outputs $Y$ from calibration dataset $\mathcal{D}$, number of layers to linearize $m$.}
\KwOut{Compressed LLM with $m$ linearized attention layers.}

\textbf{Initialization:} Load the pre-trained LLM, extract all attention layers $\mathcal{A} = \{A_1, A_2, \dots, A_K\}$.\;

\For{each attention layer $A_k \in \mathcal{A}$}{
  Collect input-output pairs $(X_k, Y_k)$ from $\mathcal{D}$ corresponding to layer $A_k$.\;
  Compute the bound on the linearization NMSE (Theorem 3.2) using canonical correlations $\rho_{k,i}$ between $Y_k$ and $X_k$:
  \[
  \mathrm{NMSE}_k = \frac{\mathrm{MSE}(Y_k, \hat{Y}_k)}{\mathrm{Tr}(C_{Y_k Y_k})}
  \le \sum_i (1 - \rho_{k,i}^2)
  \]\;
}
Select $m$ layers with the lowest NMSE bounds: $\mathcal{A}_{\text{low}} = \{A_{j_1}, \dots, A_{j_m}\} \subset \mathcal{A}$.\;
\For{each layer $A_j \in \mathcal{A}_{\text{low}}$}{
  Replace $A_j$ with its linear approximation (LMMSE linear layer):
  \[
  W_j = C_{Y_j X_j} C_{X_j X_j}^{-1}, \quad
  b_j = \mathbb{E}[Y_j] - W_j \mathbb{E}[X_j].
  \]
}
\end{algorithm}

\begin{algorithm}[H]
\caption{NBL Weight/Bias and CCA Bound Computation for a Given Self-Attention Layer}
\label{alg:cca-bound}
\KwIn{$X$ (attention layer input), $Y$ (attention layer output).}
\KwOut{Weight matrix $W$, bias $b$, and CCA-bound.}

$Y_+ \gets Y + X$ \tcp*{Residual connection output}

\textbf{Mean Computations:} $\mathbb{E}[X] \gets \mathrm{Mean}(X)$, 
$\mathbb{E}[Y] \gets \mathrm{Mean}(Y)$, 
$\mathbb{E}[Y_+] \gets \mathrm{Mean}(Y_+)$.\;

\textbf{Cross-Covariance:} $C_{Y X} \gets \mathrm{Cov}(Y, X)$, 
$C_{Y_+ X} \gets \mathrm{Cov}(Y_+, X)$.\;

\textbf{Covariance Matrices:} 
$C_{X X} \gets \mathrm{Cov}(X, X)$, 
$C_{Y_+ Y_+} \gets \mathrm{Cov}(Y_+, Y_+)$.\;

\textbf{Eigen-Decomposition:} 
$(\Lambda_{Y_+ Y_+}, V_{Y_+ Y_+}) \gets \mathrm{Eigh}(C_{Y_+ Y_+})$, 
$(\Lambda_{X X}, V_{X X}) \gets \mathrm{Eigh}(C_{X X})$.\;

\textbf{Inverse Square Roots:} 
$C_{Y_+ Y_+}^{-1/2} \gets V_{Y_+ Y_+} \, \mathrm{diag}(\Lambda_{Y_+ Y_+}^{-1/2}) \, V_{Y_+ Y_+}^\top$, 
$C_{X X}^{-1/2} \gets V_{X X} \, \mathrm{diag}(\Lambda_{X X}^{-1/2}) \, V_{X X}^\top$.\;

\textbf{Correlation Matrix:} 
$C_W \gets C_{Y_+ Y_+}^{-1/2} \, C_{Y_+ X} \, C_{X X}^{-1/2}$.\;

\textbf{Singular Value Decomposition (SVD):} 
$(U, S, V) \gets \mathrm{SVD}(C_W)$; 
$\rho_i \gets S_{ii}$ \tcp*{singular values are the canonical correlations}
CCA-bound $\gets \sum_i (1 - \rho_i^2)$.\;

\textbf{Weight Matrix and Bias:} 
$W \gets C_{Y X} C_{X X}^{-1}$, 
$b \gets \mathbb{E}[Y] - W \mathbb{E}[X]$.\;

\end{algorithm}

\end{document}