Spaces:

thoughtworks
/

arithmetic-sorl-dashboard

Sleeping

App Files Files Community

amirali1985 committed 26 days ago

Commit

d1a4946

1 Parent(s): bc93f38

LaTeX tab: add training details, results table, appendix; fix figure legend overlap

Browse files

Files changed (1) hide show

app.py +153 -3

app.py CHANGED Viewed

@@ -92,14 +92,28 @@ Borrow cascades (UD) are the analogous structure in subtraction.
   \label{tab:quirke-subtasks}
 \end{table}
 \paragraph{Evaluation splits.}
 We evaluate on \emph{C-splits} (C1--C6) grouping problems by the length
 of the longest consecutive hot-carry chain with varied answer digits,
 following~\citet{quirke_2024_addsub_preprint}.
 C6 (six consecutive carries) is the hardest split.
-Across three undersized architectures and data sizes from 10K to 100K,
-\sorl{} wins on C6 in all 13 tested configurations, with gains as large
-as $+50$\,pp (44\% $\to$ 94\% on the smallest model at 50K examples;
 Table~\ref{tab:undersized-wins}).
 """
@@ -249,6 +263,133 @@ LATEX_FIGURE_EXAMPLE = r"""% fig_arithmetic_example.tex
 \end{figure}
 """
 HARD_SPLITS = ["add_C4", "add_C5", "add_C6", "sub_M4", "sub_M5"]
 ALL_SPLITS = [
     "add_S0", "add_S1", "add_S2", "add_S3", "add_S4", "add_S5", "add_S6", "add_random",
@@ -866,11 +1007,20 @@ hidden activations — but here it is readable directly from the token sequence.
             gr.Code(value=LATEX_ARITHMETIC_SETUP, label="arithmetic_setup.tex",
                     language=None, interactive=False)
             gr.Markdown("#### § Carry-cascade example figure (TikZ)")
             gr.Markdown("Requires: `\\usepackage{tikz}`, `\\usetikzlibrary{matrix}`, `\\usepackage{xcolor}`, and `\\providecommand{\\sorl}{\\textsc{DLR}}`.")
             gr.Code(value=LATEX_FIGURE_EXAMPLE, label="fig_arithmetic_example.tex",
                     language=None, interactive=False)
         # ── Tab 4: About ──
         with gr.TabItem("About"):
             eval_info_md = gr.Markdown("")

   \label{tab:quirke-subtasks}
 \end{table}
+\paragraph{Models and training.}
+We evaluate three undersized architectures:
+\texttt{1L/2H/256d} (1 transformer layer, 2 attention heads, hidden size 256),
+\texttt{1L/3H/510d} (1 layer, 3 heads, hidden size 510),
+and \texttt{2L/1H/128d} (2 layers, 1 head, hidden size 128).
+All models are trained from scratch with AdamW
+($\eta = 8{\times}10^{-5}$, $\beta_1{=}0.9$, $\beta_2{=}0.999$,
+weight decay $0.01$, 3\% linear warmup),
+batch size 64, for 20 epochs on fixed datasets of
+10K--100K six-digit addition/subtraction problems.
+The abstraction codebook has $|\mathcal{A}|{=}30$ tokens with $K{=}1$
+(one routing token per answer-digit position).
 \paragraph{Evaluation splits.}
 We evaluate on \emph{C-splits} (C1--C6) grouping problems by the length
 of the longest consecutive hot-carry chain with varied answer digits,
 following~\citet{quirke_2024_addsub_preprint}.
 C6 (six consecutive carries) is the hardest split.
+Across the three architectures and data sizes from 10K to 100K,
+\sorl{} wins in 12 of 13 tested configurations overall, and on C6 in
+all 13, with gains as large as $+50$\,pp
+(44\% $\to$ 94\% on the smallest model at 50K examples;
 Table~\ref{tab:undersized-wins}).
 """
 \end{figure}
 """
+LATEX_TABLE_UNDERSIZED = r"""% tab:undersized-wins — SoRL vs SFT on undersized architectures
+% Generated by arithmetic/paper/results/result_low_data_wins/run.py
+% Requires: \usepackage{booktabs}, \usepackage{xcolor}, and definitions of the \sorl and \sft macros
+\begin{table}[t]
+  \centering
+  \small
+  \begin{tabular}{llrrrr}
+    \toprule
+    Architecture & Data & Baseline & \sorl{} & Gap & C6 gap \\
+    \midrule
+    \texttt{1L/2H/256d} & 10K  & 10\% & \textbf{19\%} & \textcolor{green!50!black}{\textbf{+9\%}}  & \textcolor{green!50!black}{\textbf{+18\%}} \\
+                        & 25K  & 32\% & 26\%          & $-7\%$                                    & \textcolor{green!50!black}{\textbf{+10\%}} \\
+                        & 50K  & 44\% & \textbf{65\%} & \textcolor{green!50!black}{\textbf{+21\%}} & \textcolor{green!50!black}{\textbf{+34\%}} \\
+                        & 100K & 49\% & \textbf{65\%} & \textcolor{green!50!black}{\textbf{+16\%}} & \textcolor{green!50!black}{\textbf{+31\%}} \\
+    \midrule
+    \texttt{1L/3H/510d} & 10K  & 36\% & \textbf{52\%} & \textcolor{green!50!black}{\textbf{+16\%}} & \textcolor{green!50!black}{\textbf{+30\%}} \\
+                        & 25K  & 46\% & \textbf{60\%} & \textcolor{green!50!black}{\textbf{+14\%}} & \textcolor{green!50!black}{\textbf{+22\%}} \\
+                        & 50K  & 53\% & \textbf{72\%} & \textcolor{green!50!black}{\textbf{+19\%}} & \textcolor{green!50!black}{\textbf{+38\%}} \\
+                        & 100K & 67\% & \textbf{83\%} & \textcolor{green!50!black}{\textbf{+16\%}} & \textcolor{green!50!black}{\textbf{+26\%}} \\
+    \midrule
+    \texttt{2L/1H/128d} & 10K  & 16\% & \textbf{36\%} & \textcolor{green!50!black}{\textbf{+21\%}} & \textcolor{green!50!black}{\textbf{+39\%}} \\
+                        & 25K  & 40\% & \textbf{55\%} & \textcolor{green!50!black}{\textbf{+15\%}} & \textcolor{green!50!black}{\textbf{+23\%}} \\
+                        & 50K  & 59\% & \textbf{87\%} & \textcolor{green!50!black}{\textbf{+28\%}} & \textcolor{green!50!black}{\textbf{+50\%}} \\
+                        & 75K  & 75\% & \textbf{87\%} & \textcolor{green!50!black}{\textbf{+12\%}} & \textcolor{green!50!black}{\textbf{+5\%}}  \\
+                        & 100K & 73\% & \textbf{95\%} & \textcolor{green!50!black}{\textbf{+22\%}} & \textcolor{green!50!black}{\textbf{+33\%}} \\
+    \bottomrule
+  \end{tabular}
+  \caption{\sorl{} ($K{=}1$, $|\mathcal{A}|{=}30$) vs.\ \sft{} baseline on
+    undersized architectures across data sizes.
+    \textbf{Gap} = overall accuracy gain; \textbf{C6 gap} = gain on
+    6-deep carry cascades (the hardest split).
+    \sorl{} wins in \textbf{12 of 13} (architecture, data-size) pairs;
+    the single exception is \texttt{1L/2H/256d} at 25K, where the model
+    is undertrained (accuracy still rising at epoch 20).
+    \sorl{} wins on C6 in \textbf{all 13} configurations.}
+  \label{tab:undersized-wins}
+\end{table}
+"""
+LATEX_APPENDIX = r"""% ── Appendix: Arithmetic interpretability details ──────────────────────────
+\section{Arithmetic case study: experimental details}
+\label{app:arithmetic}
+\subsection{Task and data}
+Six-digit addition and subtraction problems are formatted as:
+\[
+  \underbrace{a_1 a_2 a_3 a_4 a_5 a_6}_{\text{operand A}}
+  \; \mathtt{+/-} \;
+  \underbrace{b_1 b_2 b_3 b_4 b_5 b_6}_{\text{operand B}}
+  \; \mathtt{=} \;
+  \underbrace{d_0 d_1 d_2 d_3 d_4 d_5 d_6}_{\text{answer (7 digits)}}
+\]
+where all operands are zero-padded to 6 digits and answers to 7 digits
+(the leading $d_0$ captures the overflow carry or borrow).
+Each symbol is mapped to a unique token via a fixed 13-symbol vocabulary
+($\mathtt{0}$--$\mathtt{9}$, $\mathtt{+}$, $\mathtt{-}$, $\mathtt{=}$),
+giving sequences of exactly 21 tokens (14 prompt, 7 answer).
+Training data is drawn uniformly at random from all valid 6-digit
+addition/subtraction problems; subtraction problems are enriched to
+over-represent borrow cascades (40\% of digit positions are forced equal,
+giving MB: 3\% and MB$_3$: 0.8\%, vs.\ 0.7\% and 0.04\% without enrichment).
+Datasets are fixed (seed 42) and hosted on HuggingFace
+(\texttt{thoughtworks/arithmetic-sorl-data}).
+\subsection{Model architectures}
+All models are decoder-only transformers trained from scratch using a
+Qwen3-0.6B tokenizer (digit-level; each symbol = 1 token).
+The three undersized configurations evaluated in Table~\ref{tab:undersized-wins}:
+\begin{table}[h]
+  \centering
+  \small
+  \begin{tabular}{lrrrrr}
+    \toprule
+    Name & Layers & Heads & Hidden & FFN & Parameters \\
+    \midrule
+    \texttt{1L/2H/256d} & 1 & 2 & 256 & 1024 & $\sim$0.3M \\
+    \texttt{1L/3H/510d} & 1 & 3 & 510 & 2040 & $\sim$2.0M \\
+    \texttt{2L/1H/128d} & 2 & 1 & 128 & 512  & $\sim$0.1M \\
+    \bottomrule
+  \end{tabular}
+  \caption{Undersized architectures. All use pre-norm, GeLU activation,
+    and the Qwen3 tokenizer.}
+\end{table}
+\subsection{Training hyperparameters}
+\begin{table}[h]
+  \centering
+  \small
+  \begin{tabular}{ll}
+    \toprule
+    Hyperparameter & Value \\
+    \midrule
+    Optimizer        & AdamW \\
+    Learning rate    & $8 \times 10^{-5}$ \\
+    $\beta_1, \beta_2$ & $0.9,\; 0.999$ \\
+    Weight decay     & $0.01$ \\
+    LR schedule      & Linear warmup (3\%) then constant \\
+    Batch size       & 64 \\
+    Epochs           & 20 \\
+    \sorl{} codebook & $|\mathcal{A}|=30$, $K=1$ \\
+    \sorl{} loss weights & $\alpha_{\text{info-gain}}=10$, $\alpha_{\text{abs}}=0.1$, $\alpha_{\text{zipf}}=1.0$ \\
+    \bottomrule
+  \end{tabular}
+  \caption{Training hyperparameters shared across all undersized-architecture runs.}
+\end{table}
+\subsection{Evaluation}
+We evaluate using fixed-length autoregressive decoding: the model generates
+answer digits one at a time (left-to-right, $d_0 \to d_6$) using its own
+predictions. Abstraction tokens are inserted via the \sorl{} recursion
+(search-then-recurse), not sampled autoregressively, matching the training
+procedure. We never use teacher forcing at eval time.
+Accuracy is measured on 100 held-out examples per evaluation split
+(seed 42, hosted on HuggingFace). The C-splits (C1--C6) group problems
+by the length of the longest consecutive carry chain with varied answer digits,
+following~\citet{quirke_2024_addsub_preprint}.
+"""
 HARD_SPLITS = ["add_C4", "add_C5", "add_C6", "sub_M4", "sub_M5"]
 ALL_SPLITS = [
     "add_S0", "add_S1", "add_S2", "add_S3", "add_S4", "add_S5", "add_S6", "add_random",
             gr.Code(value=LATEX_ARITHMETIC_SETUP, label="arithmetic_setup.tex",
                     language=None, interactive=False)
+            gr.Markdown("#### § Results table — SoRL vs baseline on undersized architectures")
+            gr.Markdown("Requires: `\\usepackage{booktabs}`, `\\usepackage{xcolor}`.")
+            gr.Code(value=LATEX_TABLE_UNDERSIZED, label="tab_undersized_wins.tex",
+                    language=None, interactive=False)
             gr.Markdown("#### § Carry-cascade example figure (TikZ)")
             gr.Markdown("Requires: `\\usepackage{tikz}`, `\\usetikzlibrary{matrix}`, `\\usepackage{xcolor}`, and `\\providecommand{\\sorl}{\\textsc{DLR}}`.")
             gr.Code(value=LATEX_FIGURE_EXAMPLE, label="fig_arithmetic_example.tex",
                     language=None, interactive=False)
+            gr.Markdown("#### § Appendix — full experimental details")
+            gr.Code(value=LATEX_APPENDIX, label="appendix_arithmetic.tex",
+                    language=None, interactive=False)
         # ── Tab 4: About ──
         with gr.TabItem("About"):
             eval_info_md = gr.Markdown("")