amirali1985 committed on
Commit
ba15dfe
·
1 Parent(s): f8026c4

finding boxes: description in title

Browse files
Files changed (1) hide show
  1. app.py +14 -19
app.py CHANGED
@@ -76,10 +76,9 @@ The margin grows with cascade depth, consistent with explicit carry/borrow
76
  routing being the mechanism behind the gain.
77
 
78
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
79
- fonttitle=\bfseries\small, title={Finding \#1},
80
  left=5pt, right=5pt, top=4pt, bottom=4pt]
81
- \small \sorl{} wins on 12/13 configurations overall and all 13 on C6 ($+50$\,pp),
82
- externalizing carry routing as named tokens that recover Quirke's taxonomy without supervision.
83
  \end{tcolorbox}
84
  """
85
 
@@ -413,10 +412,9 @@ Three patterns are notable:
413
  \end{itemize}
414
 
415
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
416
- fonttitle=\bfseries\small, title={Finding \#2},
417
  left=5pt, right=5pt, top=4pt, bottom=4pt]
418
- \small Tokens are causally necessary (knockout $\to$ 0.1\%);
419
- shuffle hurts more than random because wrong-position tokens cause systematic carry errors.
420
  \end{tcolorbox}
421
 
422
  % ─────────────────────────────────────────────────────────────────────────────
@@ -465,9 +463,9 @@ token \texttt{t23} is the subtraction mirror (UD, 88\%, position $d_3$).
465
  \end{table}
466
 
467
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
468
- fonttitle=\bfseries\small, title={Finding \#3},
469
  left=5pt, right=5pt, top=4pt, bottom=4pt]
470
- \small 23/30 tokens active; each locks to 1--2 Quirke subtasks (${\geq}70\%$ purity) and 1--2 answer positions.
471
  \end{tcolorbox}
472
 
473
  % ─────────────────────────────────────────────────────────────────────────────
@@ -496,9 +494,9 @@ consistent with those positions encoding longer-range carry state that a
496
  single-position swap cannot resolve.
497
 
498
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
499
- fonttitle=\bfseries\small, title={Finding \#4},
500
  left=5pt, right=5pt, top=4pt, bottom=4pt]
501
- \small Swapping one token fixes 27--31\% of mispredicted carry-heavy examples---no weight updates needed.
502
  \end{tcolorbox}
503
 
504
  % ─────────────────────────────────────────────────────────────────────────────
@@ -540,10 +538,9 @@ readable token in the output sequence, accessible without any
540
  post-hoc analysis.
541
 
542
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
543
- fonttitle=\bfseries\small, title={Finding \#5},
544
  left=5pt, right=5pt, top=4pt, bottom=4pt]
545
- \small \sorl{} rediscovers the carry-state tri-classifier ($\{0,U,1\}$) unsupervised;
546
- what \citet{quirke_2024_addsub_preprint} needed PCA to reveal, \sorl{} externalizes as a routing token.
547
  \end{tcolorbox}
548
 
549
  % ─────────────────────────────────────────────────────────────────────────────
@@ -592,10 +589,9 @@ variable. The specialist tokens concentrate at mid-sequence positions
592
  ($d_2$--$d_4$) where carry propagation is most structured.
593
 
594
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
595
- fonttitle=\bfseries\small, title={Finding \#6},
596
  left=5pt, right=5pt, top=4pt, bottom=4pt]
597
- \small Specialist tokens (\texttt{t21}: 94\% purity) coexist with polysemantic fallbacks (\texttt{t1}: 24\%);
598
- polysemanticity concentrates at the most variable overflow positions.
599
  \end{tcolorbox}
600
 
601
  % ─────────────────────────────────────────────────────────────────────────────
@@ -634,10 +630,9 @@ The full procedure is in \texttt{experiments/11\_auto\_interp/run.py}.
634
  \end{table}
635
 
636
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
637
- fonttitle=\bfseries\small, title={Finding \#7},
638
  left=5pt, right=5pt, top=4pt, bottom=4pt]
639
- \small Automated interpretation matches Quirke labels without accessing them:
640
- high-confidence specialists get crisp descriptions; polysemantic tokens get vague ones.
641
  \end{tcolorbox}
642
 
643
  % ─────────────────────────────────────────────────────────────────────────────
 
76
  routing being the mechanism behind the gain.
77
 
78
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
79
+ fonttitle=\bfseries\small, title={Finding \#1: \sorl{} increases accuracy on 6-digit arithmetic dramatically},
80
  left=5pt, right=5pt, top=4pt, bottom=4pt]
81
+ \small \sorl{} increases accuracy on 6-digit arithmetic dramatically, winning on all cascade splits.
 
82
  \end{tcolorbox}
83
  """
84
 
 
412
  \end{itemize}
413
 
414
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
415
+ fonttitle=\bfseries\small, title={Finding \#2: Abstraction tokens are causally necessary},
416
  left=5pt, right=5pt, top=4pt, bottom=4pt]
417
+ \small \sorl{} abstraction tokens are causally necessary for correct computation.
 
418
  \end{tcolorbox}
419
 
420
  % ─────────────────────────────────────────────────────────────────────────────
 
463
  \end{table}
464
 
465
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
466
+ fonttitle=\bfseries\small, title={Finding \#3: Tokens spontaneously specialize by subtask and position},
467
  left=5pt, right=5pt, top=4pt, bottom=4pt]
468
+ \small \sorl{} tokens spontaneously specialize by subtask and position without supervision.
469
  \end{tcolorbox}
470
 
471
  % ─────────────────────────────────────────────────────────────────────────────
 
494
  single-position swap cannot resolve.
495
 
496
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
497
+ fonttitle=\bfseries\small, title={Finding \#4: Single token swaps fix mispredicted carry-heavy examples},
498
  left=5pt, right=5pt, top=4pt, bottom=4pt]
499
+ \small Single token swaps fix mispredicted carry-heavy examples, enabling targeted model correction.
500
  \end{tcolorbox}
501
 
502
  % ─────────────────────────────────────────────────────────────────────────────
 
538
  post-hoc analysis.
539
 
540
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
541
+ fonttitle=\bfseries\small, title={Finding \#5: \sorl{} rediscovers known arithmetic circuits without supervision},
542
  left=5pt, right=5pt, top=4pt, bottom=4pt]
543
+ \small \sorl{} rediscovers known arithmetic circuits without supervision or activation access.
 
544
  \end{tcolorbox}
545
 
546
  % ─────────────────────────────────────────────────────────────────────────────
 
589
  ($d_2$--$d_4$) where carry propagation is most structured.
590
 
591
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
592
+ fonttitle=\bfseries\small, title={Finding \#6: The codebook mixes specialist and polysemantic tokens},
593
  left=5pt, right=5pt, top=4pt, bottom=4pt]
594
+ \small The codebook mixes highly specialist tokens with polysemantic fallback tokens.
 
595
  \end{tcolorbox}
596
 
597
  % ─────────────────────────────────────────────────────────────────────────────
 
630
  \end{table}
631
 
632
  \begin{tcolorbox}[colback=gray!6, colframe=gray!40,
633
+ fonttitle=\bfseries\small, title={Finding \#7: Automated interpretation produces human-readable token roles},
634
  left=5pt, right=5pt, top=4pt, bottom=4pt]
635
+ \small Automated interpretation produces human-readable role descriptions that match ground-truth subtask labels.
 
636
  \end{tcolorbox}
637
 
638
  % ─────────────────────────────────────────────────────────────────────────────