Commit ·
ba15dfe
1
Parent(s): f8026c4
finding boxes: description in title
Browse files
app.py
CHANGED
|
@@ -76,10 +76,9 @@ The margin grows with cascade depth, consistent with explicit carry/borrow
|
|
| 76 |
routing being the mechanism behind the gain.
|
| 77 |
|
| 78 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 79 |
-
fonttitle=\bfseries\small, title={Finding \#1},
|
| 80 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 81 |
-
\small \sorl{}
|
| 82 |
-
externalizing carry routing as named tokens that recover Quirke's taxonomy without supervision.
|
| 83 |
\end{tcolorbox}
|
| 84 |
"""
|
| 85 |
|
|
@@ -413,10 +412,9 @@ Three patterns are notable:
|
|
| 413 |
\end{itemize}
|
| 414 |
|
| 415 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 416 |
-
fonttitle=\bfseries\small, title={Finding \#2},
|
| 417 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 418 |
-
\small
|
| 419 |
-
shuffle hurts more than random because wrong-position tokens cause systematic carry errors.
|
| 420 |
\end{tcolorbox}
|
| 421 |
|
| 422 |
% ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -465,9 +463,9 @@ token \texttt{t23} is the subtraction mirror (UD, 88\%, position $d_3$).
|
|
| 465 |
\end{table}
|
| 466 |
|
| 467 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 468 |
-
fonttitle=\bfseries\small, title={Finding \#3},
|
| 469 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 470 |
-
\small
|
| 471 |
\end{tcolorbox}
|
| 472 |
|
| 473 |
% ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -496,9 +494,9 @@ consistent with those positions encoding longer-range carry state that a
|
|
| 496 |
single-position swap cannot resolve.
|
| 497 |
|
| 498 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 499 |
-
fonttitle=\bfseries\small, title={Finding \#4},
|
| 500 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 501 |
-
\small
|
| 502 |
\end{tcolorbox}
|
| 503 |
|
| 504 |
% ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -540,10 +538,9 @@ readable token in the output sequence, accessible without any
|
|
| 540 |
post-hoc analysis.
|
| 541 |
|
| 542 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 543 |
-
fonttitle=\bfseries\small, title={Finding \#5},
|
| 544 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 545 |
-
\small \sorl{} rediscovers
|
| 546 |
-
what \citet{quirke_2024_addsub_preprint} needed PCA to reveal, \sorl{} externalizes as a routing token.
|
| 547 |
\end{tcolorbox}
|
| 548 |
|
| 549 |
% ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -592,10 +589,9 @@ variable. The specialist tokens concentrate at mid-sequence positions
|
|
| 592 |
($d_2$--$d_4$) where carry propagation is most structured.
|
| 593 |
|
| 594 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 595 |
-
fonttitle=\bfseries\small, title={Finding \#6},
|
| 596 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 597 |
-
\small
|
| 598 |
-
polysemanticity concentrates at the most variable overflow positions.
|
| 599 |
\end{tcolorbox}
|
| 600 |
|
| 601 |
% ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -634,10 +630,9 @@ The full procedure is in \texttt{experiments/11\_auto\_interp/run.py}.
|
|
| 634 |
\end{table}
|
| 635 |
|
| 636 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 637 |
-
fonttitle=\bfseries\small, title={Finding \#7},
|
| 638 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 639 |
-
\small Automated interpretation
|
| 640 |
-
high-confidence specialists get crisp descriptions; polysemantic tokens get vague ones.
|
| 641 |
\end{tcolorbox}
|
| 642 |
|
| 643 |
% ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
| 76 |
routing being the mechanism behind the gain.
|
| 77 |
|
| 78 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 79 |
+
fonttitle=\bfseries\small, title={Finding \#1: \sorl{} increases accuracy on 6-digit arithmetic dramatically},
|
| 80 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 81 |
+
\small \sorl{} increases accuracy on 6-digit arithmetic dramatically, winning on all cascade splits.
|
|
|
|
| 82 |
\end{tcolorbox}
|
| 83 |
"""
|
| 84 |
|
|
|
|
| 412 |
\end{itemize}
|
| 413 |
|
| 414 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 415 |
+
fonttitle=\bfseries\small, title={Finding \#2: Abstraction tokens are causally necessary},
|
| 416 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 417 |
+
\small \sorl{} abstraction tokens are causally necessary for correct computation.
|
|
|
|
| 418 |
\end{tcolorbox}
|
| 419 |
|
| 420 |
% ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
| 463 |
\end{table}
|
| 464 |
|
| 465 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 466 |
+
fonttitle=\bfseries\small, title={Finding \#3: Tokens spontaneously specialize by subtask and position},
|
| 467 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 468 |
+
\small \sorl{} tokens spontaneously specialize by subtask and position without supervision.
|
| 469 |
\end{tcolorbox}
|
| 470 |
|
| 471 |
% ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
| 494 |
single-position swap cannot resolve.
|
| 495 |
|
| 496 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 497 |
+
fonttitle=\bfseries\small, title={Finding \#4: Single token swaps fix mispredicted carry-heavy examples},
|
| 498 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 499 |
+
\small Single token swaps fix mispredicted carry-heavy examples, enabling targeted model correction.
|
| 500 |
\end{tcolorbox}
|
| 501 |
|
| 502 |
% ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
| 538 |
post-hoc analysis.
|
| 539 |
|
| 540 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 541 |
+
fonttitle=\bfseries\small, title={Finding \#5: \sorl{} rediscovers known arithmetic circuits without supervision},
|
| 542 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 543 |
+
\small \sorl{} rediscovers known arithmetic circuits without supervision or activation access.
|
|
|
|
| 544 |
\end{tcolorbox}
|
| 545 |
|
| 546 |
% ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
| 589 |
($d_2$--$d_4$) where carry propagation is most structured.
|
| 590 |
|
| 591 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 592 |
+
fonttitle=\bfseries\small, title={Finding \#6: The codebook mixes specialist and polysemantic tokens},
|
| 593 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 594 |
+
\small The codebook mixes highly specialist tokens with polysemantic fallback tokens.
|
|
|
|
| 595 |
\end{tcolorbox}
|
| 596 |
|
| 597 |
% ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
| 630 |
\end{table}
|
| 631 |
|
| 632 |
\begin{tcolorbox}[colback=gray!6, colframe=gray!40,
|
| 633 |
+
fonttitle=\bfseries\small, title={Finding \#7: Automated interpretation produces human-readable token roles},
|
| 634 |
left=5pt, right=5pt, top=4pt, bottom=4pt]
|
| 635 |
+
\small Automated interpretation produces human-readable role descriptions that match ground-truth subtask labels.
|
|
|
|
| 636 |
\end{tcolorbox}
|
| 637 |
|
| 638 |
% ─────────────────────────────────────────────────────────────────────────────
|