Commit ·
13fc2ef
1
Parent(s): 1309cfb
LaTeX: move tab:quirke-subtasks to appendix, update main body ref
Browse files
app.py
CHANGED
|
@@ -51,47 +51,13 @@ sum-9 boundary (SS), and \texttt{t16}/\texttt{t3} at the trivial positions (SC/S
|
|
| 51 |
At each answer-digit position $n$, the local computation falls into one
|
| 52 |
of ten mutually exclusive subtasks determined by the operand digits and
|
| 53 |
the carry or borrow state propagating from lower positions
|
| 54 |
-
(Table~\ref{tab:quirke-subtasks}).
|
| 55 |
A \emph{carry cascade} arises when consecutive digit pairs sum to exactly
|
| 56 |
9 (SS positions in addition): whether a carry propagates through such a
|
| 57 |
run depends on a single carry entering from the right, requiring the model
|
| 58 |
to track state across multiple positions.
|
| 59 |
Borrow cascades (UD) are the analogous structure in subtraction.
|
| 60 |
|
| 61 |
-
\begin{table}[h]
|
| 62 |
-
\centering
|
| 63 |
-
\small
|
| 64 |
-
\setlength{\tabcolsep}{6pt}
|
| 65 |
-
\begin{tabular}{llp{8.2cm}}
|
| 66 |
-
\toprule
|
| 67 |
-
& Label & Condition at digit position $n$ \\
|
| 68 |
-
\midrule
|
| 69 |
-
\multirow{5}{*}{\rotatebox[origin=c]{90}{Addition\;}}
|
| 70 |
-
& \textbf{SA} & $d_1{+}d_2 \leq 8$;\; no carry in or out \\
|
| 71 |
-
& \textbf{SC} & $d_1{+}d_2 \geq 10$;\; generates a carry \\
|
| 72 |
-
& \textbf{SS} & $d_1{+}d_2 = 9$;\; carry state \emph{uncertain}
|
| 73 |
-
(cascade boundary) \\
|
| 74 |
-
& \textbf{UC} & carry arrives from position $n{-}1$;\;
|
| 75 |
-
answer digit depends on it \\
|
| 76 |
-
& \textbf{US} & carry propagates through a run of SS positions
|
| 77 |
-
(sum-of-9 cascade) \\
|
| 78 |
-
\midrule
|
| 79 |
-
\multirow{5}{*}{\rotatebox[origin=c]{90}{Subtraction\;}}
|
| 80 |
-
& \textbf{MD} & $d_1 \geq d_2$;\; no borrow \\
|
| 81 |
-
& \textbf{MB} & $d_1 < d_2$;\; generates a borrow \\
|
| 82 |
-
& \textbf{ME} & $d_1 = d_2$;\; borrow state \emph{uncertain} \\
|
| 83 |
-
& \textbf{UB} & borrow arrives from position $n{-}1$ \\
|
| 84 |
-
& \textbf{UD} & borrow propagates through a run of ME positions \\
|
| 85 |
-
\bottomrule
|
| 86 |
-
\end{tabular}
|
| 87 |
-
\caption{Per-digit subtask labels for six-digit addition and
|
| 88 |
-
subtraction~\citep{quirke_2024_addsub_preprint}.
|
| 89 |
-
Cascades (US, UD) are the hardest: the answer digit cannot
|
| 90 |
-
be determined locally and requires resolving multi-position
|
| 91 |
-
carry/borrow propagation.}
|
| 92 |
-
\label{tab:quirke-subtasks}
|
| 93 |
-
\end{table}
|
| 94 |
-
|
| 95 |
\paragraph{Models and training.}
|
| 96 |
We evaluate three undersized architectures:
|
| 97 |
\texttt{1L/2H/256d} (1 transformer layer, 2 attention heads, hidden size 256),
|
|
@@ -356,6 +322,35 @@ LATEX_APPENDIX = r"""% ───────────────────
|
|
| 356 |
\section{Arithmetic case study: interpretability analysis}
|
| 357 |
\label{app:arithmetic}
|
| 358 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
\paragraph{Setup.}
|
| 360 |
All interpretability analyses use model
|
| 361 |
\texttt{add\_sub\_sorl\_v1\_abs30\_K1\_100K\_2L1H128d}
|
|
|
|
| 51 |
At each answer-digit position $n$, the local computation falls into one
|
| 52 |
of ten mutually exclusive subtasks determined by the operand digits and
|
| 53 |
the carry or borrow state propagating from lower positions
|
| 54 |
+
(Table~\ref{tab:quirke-subtasks} in Appendix~\ref{app:arithmetic}).
|
| 55 |
A \emph{carry cascade} arises when consecutive digit pairs sum to exactly
|
| 56 |
9 (SS positions in addition): whether a carry propagates through such a
|
| 57 |
run depends on a single carry entering from the right, requiring the model
|
| 58 |
to track state across multiple positions.
|
| 59 |
Borrow cascades (UD) are the analogous structure in subtraction.
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
\paragraph{Models and training.}
|
| 62 |
We evaluate three undersized architectures:
|
| 63 |
\texttt{1L/2H/256d} (1 transformer layer, 2 attention heads, hidden size 256),
|
|
|
|
| 322 |
\section{Arithmetic case study: interpretability analysis}
|
| 323 |
\label{app:arithmetic}
|
| 324 |
|
| 325 |
+
\begin{table}[h]
|
| 326 |
+
\centering\small
|
| 327 |
+
\setlength{\tabcolsep}{6pt}
|
| 328 |
+
\begin{tabular}{llp{7.8cm}}
|
| 329 |
+
\toprule
|
| 330 |
+
& Label & Condition at digit position $n$ \\
|
| 331 |
+
\midrule
|
| 332 |
+
\multirow{5}{*}{\rotatebox[origin=c]{90}{Addition\;}}
|
| 333 |
+
& \textbf{SA} & $d_1{+}d_2 \leq 8$;\; no carry in or out \\
|
| 334 |
+
& \textbf{SC} & $d_1{+}d_2 \geq 10$;\; generates a carry \\
|
| 335 |
+
& \textbf{SS} & $d_1{+}d_2 = 9$;\; carry state \emph{uncertain} (cascade boundary) \\
|
| 336 |
+
& \textbf{UC} & carry arrives from position $n{-}1$;\; answer digit depends on it \\
|
| 337 |
+
& \textbf{US} & carry propagates through a run of SS positions (sum-of-9 cascade) \\
|
| 338 |
+
\midrule
|
| 339 |
+
\multirow{5}{*}{\rotatebox[origin=c]{90}{Subtraction\;}}
|
| 340 |
+
& \textbf{MD} & $d_1 \geq d_2$;\; no borrow \\
|
| 341 |
+
& \textbf{MB} & $d_1 < d_2$;\; generates a borrow \\
|
| 342 |
+
& \textbf{ME} & $d_1 = d_2$;\; borrow state \emph{uncertain} \\
|
| 343 |
+
& \textbf{UB} & borrow arrives from position $n{-}1$ \\
|
| 344 |
+
& \textbf{UD} & borrow propagates through a run of ME positions \\
|
| 345 |
+
\bottomrule
|
| 346 |
+
\end{tabular}
|
| 347 |
+
\caption{Per-digit subtask labels for six-digit addition and
|
| 348 |
+
subtraction~\citep{quirke_2024_addsub_preprint}.
|
| 349 |
+
Cascades (US, UD) require tracking carry/borrow state across
|
| 350 |
+
multiple positions and are the hardest splits.}
|
| 351 |
+
\label{tab:quirke-subtasks}
|
| 352 |
+
\end{table}
|
| 353 |
+
|
| 354 |
\paragraph{Setup.}
|
| 355 |
All interpretability analyses use model
|
| 356 |
\texttt{add\_sub\_sorl\_v1\_abs30\_K1\_100K\_2L1H128d}
|